# Question (translated): a RandomForest model is used for prediction. Given a
# user-defined interest-rate range and/or day count as input, the script should
# output the predicted sell-out time Y for each parameter combination.
# Observed problems: only two custom parameters seem to work, and every
# predicted sell-out time comes out identical.
# --- Load data and define the user-configurable parameter search space ---
import numpy as np
import pandas as pd

# Raw training data exported from the product system.
data = pd.read_csv('C:\\Users\\huachen.mu\\Desktop\\fea.csv')

# Parameters to scan and their ranges as "start:end" strings.  Any number of
# parameters may be listed (the expansion loops below iterate generically);
# the lists just have to stay the same length.
para_name = ['INC_RATE', 'CMSC_RATE']
para_range = ['0.05:0.08', '0.02:0.08']

# Numeric features that will later be min-max scaled.
# NOTE: the original list contained 'CTD_RATE' twice, which made every
# column selection via this list duplicate that column — deduplicated here.
pre_scale_list = ['PRD_TERM', 'BAS_DAYS', 'PFST_AMT', 'OFST_AMT', 'DUE_INC_RAT',
                  'EST_AMT', 'INC_RATE', 'CMSC_RATE', 'CTD_RATE', 'AGIO_FEE',
                  'DFT_INC_RAT', 'CTD_RAT']
# Categorical features that will later be label/one-hot encoded.
one_hot_feature = ['PRD_TYP', 'CUS_CLS', 'CCY', 'INC_TYP', 'CAN_FLG', 'PRD_ATTR',
                   'RSK_LVL', 'ST_FLG', 'BGT_CLS', 'TX_CNL', 'DEP_TYP',
                   'HX_PRD_CD', 'EST_FLG']

# One-row frame: one column per scanned parameter, holding its range string.
para_df = pd.DataFrame(para_range).T
para_df.columns = para_name
# --- Expand each scanned parameter's range string into a comma-joined list of
# --- candidate values (kept as one string per column for the stack step below).
for col in para_name:
    if col in pre_scale_list:
        # Numeric parameter: "a:b" becomes "a,a+0.01,..." (step 0.01, end
        # exclusive, per np.arange semantics).
        for cell in para_df[col]:
            bounds = [float(v) for v in cell.split(':')]
            if len(bounds) > 1:
                grid = np.arange(bounds[0], bounds[1], 0.01)
                # Bug fixed: the original generator reused the outer loop
                # variable name ('%s' % i for i in b), shadowing the column name.
                para_df[col] = ','.join('%s' % v for v in grid)
            else:
                para_df[col] = bounds
    elif col in one_hot_feature:
        # Categorical parameter: "a:b:c" becomes "a,b,c".
        for cell in para_df[col]:
            para_df[col] = ','.join(cell.split(':'))
# --- Build the Cartesian product of all candidate values.  Each pass splits
# --- one column's comma-joined string into multiple rows and joins them back
# --- onto the remaining columns, multiplying the row count.
for col in para_name:
    exploded = (para_df[col]
                .str.split(',', expand=True)
                .stack()
                .reset_index(level=1, drop=True)
                .rename(col))
    para_df = para_df.drop(col, axis=1).join(exploded)
# --- Normalise DUE_INC_RAT: values may be ranges like "0.03-0.05"; when a '-'
# --- is present keep only the part after the first one (the upper bound).
# --- split('-', 1)[-1] returns the whole string when no '-' exists.
data['DUE_INC_RAT'] = [str(value).split('-', 1)[-1]
                       for value in data['DUE_INC_RAT']]
data['DUE_INC_RAT'] = data['DUE_INC_RAT'].astype('float64')
print('收益率处理完成')
# --- Fill missing values ---
# Assign the filled column back instead of calling inplace fillna on a column
# view: the inplace-on-view pattern is a chained-assignment pitfall and is
# deprecated in recent pandas.
data['DEP_TYP'] = data['DEP_TYP'].fillna(-1)
data['HX_PRD_CD'] = data['HX_PRD_CD'].fillna(-1)
# Missing INC_TYP falls back to the row's RSK_LVL value (equivalent to the
# original np.where(isnull, RSK_LVL, INC_TYP)).
data['INC_TYP'] = data['INC_TYP'].fillna(data['RSK_LVL'])
print('缺失值处理完毕')
# --- Fill the non-scanned feature columns of the parameter grid with typical
# --- training-data values, then append the grid to the data as the "test" set.
for col in one_hot_feature:
    if col not in para_name:
        # Most frequent category (O(n) via .mode() instead of the original
        # O(n^2) max(list, key=list.count)).
        para_df[col] = data[col].mode().iloc[0]
for col in pre_scale_list:
    if col not in para_name:
        para_df[col] = np.median(data[col])

para_df['brand'] = 'test'    # rows whose sell-out time we want to predict
data['brand'] = 'train'      # rows the model is fitted on

# ignore_index=True is essential: without it the appended grid rows reuse the
# training rows' index labels, and any later index-aligned assignment (e.g.
# writing scaled features back) silently scrambles the test rows — one cause
# of every parameter combination getting the same prediction.
data_new = pd.concat([data, para_df], axis=0, ignore_index=True)
data_new.isnull().any()
from sklearn.preprocessing import OneHotEncoder, LabelEncoder


def read_feature(data):
    """Label-encode every categorical column of *data* in place and return it.

    Columns whose values are all integer-convertible are encoded on their int
    form (so 1, 1.0 and '1' collapse into one category); anything else falls
    back to string encoding.
    """
    for col in one_hot_feature:
        print(col)
        try:
            data[col] = LabelEncoder().fit_transform(data[col].apply(int))
        except (TypeError, ValueError):
            # Narrowed from a bare `except:` — only conversion failures should
            # trigger the string fallback, not arbitrary errors.
            data[col] = LabelEncoder().fit_transform(data[col].apply(str))
        print(col + 'finish')
    return data


data = read_feature(data_new)
# --- Drop constant (zero-variance) columns: they carry no signal and would
# --- break min-max scaling (max == min).
feature_cols = data.columns.to_list()
for keep in ('PRD_CD', 'TIMES_DIFF', 'brand'):
    feature_cols.remove(keep)

del_list = []
for col in feature_cols:
    data[col] = data[col].astype('float')
    if np.std(data[col]) == 0:
        del_list.append(col)
        data.pop(col)

# Keep the feature lists in sync with the surviving columns.  (The original
# also recomputed data_list here and never used it — dead code removed.)
for col in del_list:
    if col in one_hot_feature:
        one_hot_feature.remove(col)
    if col in pre_scale_list:
        pre_scale_list.remove(col)
# print(data)
from sklearn import preprocessing
max_scaleer = preprocessing.MinMaxScaler()
data_sca = data[pre_scale_list]
data_sca = max_scaleer.fit_transform(data_sca)
data_sca=pd.DataFrame(data_sca)
data_sca.columns=pre_scale_list
data[pre_scale_list] =data_sca
# --- Mark outlier sell-out times with the sentinel -1, then split train/test ---
# Bug fixed: the original `data[mask] = -1` overwrote ENTIRE rows (all
# features and the 'brand' label) with -1, corrupting the training set —
# another cause of every test row predicting the same value.  Only the
# TIMES_DIFF column must receive the sentinel.
# nanpercentile ignores the test rows, whose TIMES_DIFF is NaN (a plain
# percentile over NaNs returns NaN and flags nothing).
data_percentile95 = np.nanpercentile(data['TIMES_DIFF'], 95)
# NOTE(review): PRD_TERM was min-max scaled above, so this product is no
# longer real seconds — confirm the intended threshold.
data_zhouqi = data['PRD_TERM'] * 24 * 60 * 60
data.loc[data['TIMES_DIFF'] > data_percentile95, 'TIMES_DIFF'] = -1
data.loc[data['TIMES_DIFF'] > data_zhouqi, 'TIMES_DIFF'] = -1

# Split back into the original data (train) and the parameter grid (test).
# .copy() avoids SettingWithCopy issues from the .pop() calls below.
train = data[data['brand'] == 'train'].copy()
test = data[data['brand'] == 'test'].copy()
train_label = train.pop('TIMES_DIFF')
test_label = test.pop('TIMES_DIFF')
train_x = train[one_hot_feature]
test_x = test[one_hot_feature]
# --- One-hot encode each categorical feature and append the resulting sparse
# --- columns to the (label-encoded) feature matrices ---
from scipy import sparse

encoder = OneHotEncoder(categories='auto')
for col in one_hot_feature:
    # Fit on the full data so train and test share one category vocabulary.
    encoder.fit(data[col].values.reshape(-1, 1))
    train_part = encoder.transform(train[col].values.reshape(-1, 1))
    test_part = encoder.transform(test[col].values.reshape(-1, 1))
    train_x = sparse.hstack((train_x, train_part))
    test_x = sparse.hstack((test_x, test_part))
# --- Fit a random forest on the training rows and predict the sell-out time
# --- for every parameter combination ---
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost

# Fixed random_state so repeated runs produce reproducible predictions
# (without it, results differ on every run).
rfr = RandomForestRegressor(n_estimators=20, random_state=0)
rfr.fit(train_x, train_label)
rfr_predict = rfr.predict(test_x)
print(rfr_predict)

# Attach predictions positionally: test-row order matches para_df row order
# because concat preserved it.
para_df['TIMES_DIFF'] = list(rfr_predict)
min_times = para_df['TIMES_DIFF'].min()
print(min_times)

# Drop combinations predicted as the outlier sentinel.
para_df = para_df[para_df['TIMES_DIFF'] != -1]
print(para_df)