Continuing from my previous post: https://bbs.pinggu.org/thread-5993232-1-1.html
Grid search parameter tuning for random forest
Once the data is ready, most of our time goes into tuning and searching for parameters. Below is a grid search approach. R's caret package supports parameter tuning for the main machine learning algorithms: k-nearest neighbors (knn), naive Bayes (nb), decision trees (C5.0), rule learners (OneR, RIPPER), linear regression (lm, no tuning parameters), regression trees (rpart), model trees (M5), neural networks (nnet), support vector machines (svmLinear, svmRadial), and random forests (rf).
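Before building a grid for any of these methods, it helps to check which parameters caret actually tunes; modelLookup() lists them for a given method string:
library(caret)
modelLookup("rf")    # random forest tunes only mtry
modelLookup("C5.0")  # C5.0 tunes trials, model and winnow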
# -------------------------------- Auto-tuned randomForest --------------------------------------
# Combine the SMOTE-oversampled positives with a random 20% draw of the negatives
prop.table(table(train_smote_set$FRAUD_FLAG))
n1 <- nrow(train_set[train_set$FRAUD_FLAG == 0, ])
train_no <- subset(train_set, FRAUD_FLAG == 0)
# shuffle the negatives, then keep the first 20%
train_no_runif <- train_no[order(runif(n1)), ]
train_smote_set3 <- rbind(subset(train_smote_set, FRAUD_FLAG == 1), train_no_runif[1:(0.2*n1), ])
table(train_smote_set3$FRAUD_FLAG)
prop.table(table(train_smote_set3$FRAUD_FLAG))
# SMOTE oversampling (DMwR package)
library(DMwR)
set.seed(500)
TrainingSet_smote <- SMOTE(FRAUD_FLAG ~ ., data = train_set, k = 10, perc.over = 100, perc.under = 115)
table(TrainingSet_smote$FRAUD_FLAG)
prop.table(table(TrainingSet_smote$FRAUD_FLAG))
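As a sanity check on the DMwR semantics (an assumption worth verifying against your installed version): perc.over = 100 creates one synthetic case per minority case, and perc.under = 115 then draws 115% of the newly created minority count from the majority class, so the expected class sizes can be computed directly:
# expected class counts after SMOTE, under the DMwR semantics described above
n_pos <- sum(train_set$FRAUD_FLAG == "1")
c(minority = 2 * n_pos, majority = round(1.15 * n_pos))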
# measure the running time
system.time({
  # Compare the random forest against the boosted C5.0 model
  # Repeated k-fold cross-validation: method defaults to "cv"; number is the
  # number of folds, repeats the number of repetitions
  ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 1)
  # Grid over the random forest's feature-sampling parameter mtry: try 1, 2 and 3 features per split
  grid_rf <- expand.grid(.mtry = c(1, 2, 3))
  set.seed(300)
  # Pick the best model by Kappa, which ranges from -1 to 1: closer to 1 means
  # better agreement, 0 is random guessing, -1 is perfectly wrong; it is
  # usable on imbalanced data
  auto_rf <- train(FRAUD_FLAG ~ ., data = train_smote_set3, method = "rf",
                   metric = "Kappa", trControl = ctrl, tuneGrid = grid_rf)
  auto_rf
})
# Random Forest
#
# 1896 samples
# 34 predictor
# 2 classes: '0', '1'
#
# Candidate scoring sets: test_set, train_set, fraud_date_out_step_imp_var, fraud_date_out_enet_imp_var, fraud_imp_var
set_in <- fraud_date_out_step_imp_var
# The scoring set must have exactly the same columns as the training set
# For a caret train object, type is "raw" (predicted classes) or "prob" (class probabilities)
fraud_pred <- predict(auto_rf, set_in, type = 'prob')
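With type = 'prob' the prediction is a data frame with one column per class, so the fraud probability has to be cut at a threshold to get labels; 0.5 below is just a starting point, and on imbalanced fraud data a lower cutoff trades more false alarms for fewer missed frauds:
# column "1" holds the predicted fraud probability; 0.5 is an illustrative cutoff
fraud_class <- ifelse(fraud_pred[, "1"] > 0.5, 1, 0)
table(fraud_class)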
C5.0 decision tree
# -------------------------------- C5.0 decision tree -----------------------------------------------
library(C50)
fraud_c50 <- C5.0(FRAUD_FLAG ~ ., data = TrainingSet_smote)
fraud_c50_pred <- predict(fraud_c50, TestSet)
CrossTable(TestSet$FRAUD_FLAG, fraud_c50_pred, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
           dnn = c('ACTUAL FRAUD','PREDICTED FRAUD'))
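For fraud work it can also be worth weighting the two error types unequally. C5.0 accepts a cost matrix; a minimal sketch, where the 4:1 miss-to-false-alarm ratio is purely an illustrative choice:
# rows are predicted, columns are actual; missing a fraud (predict 0, actual 1) costs 4
error_cost <- matrix(c(0, 1, 4, 0), nrow = 2,
                     dimnames = list(predicted = c("0", "1"), actual = c("0", "1")))
fraud_c50_cost <- C5.0(FRAUD_FLAG ~ ., data = TrainingSet_smote, costs = error_cost)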
Auto-tuned C5.0 decision tree
# ----------------------------- Auto-tuned C5.0 decision tree with boosting --------------------------------------
library(C50)
# trials is the number of boosting iterations; 10 is the most common value and
# typically cuts the error rate by about 25%
grid_c50 <- expand.grid(.trials = c(10, 20, 30, 40),
                        .model = "tree",
                        .winnow = "TRUE")
# 10-fold cross-validation
ctrl <- trainControl(method="cv", number=10)
set.seed(300)
m_c50 <- train(FRAUD_FLAG ~., data=TrainingSet_smote, method="C5.0",
metric = "Kappa", trControl = ctrl, tuneGrid = grid_c50)
fraud_mc50_smote <- predict(m_c50,TestSet)
CrossTable(TestSet$FRAUD_FLAG,fraud_mc50_smote,
prop.chisq = FALSE, prop.c = FALSE,prop.r = FALSE,
dnn = c('ACTUAL FRAUD','PREDICTED FRAUD'))
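The returned train object records which grid cell won; a quick look before trusting the tuned model:
m_c50$bestTune   # winning combination of trials / model / winnow
plot(m_c50)      # Kappa across the candidate grid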
GBDT
# -------------------------------- GBDT -----------------------------------------------
library(gbm)
# gbm's bernoulli distribution needs a numeric 0/1 response, so convert the factor first
gbm_train <- TrainingSet_smote
gbm_train$FRAUD_FLAG <- as.numeric(as.character(gbm_train$FRAUD_FLAG))
# cv.folds: number of cross-validation folds; n.cores: number of CPU cores to use
fraud_gbdt <- gbm(FRAUD_FLAG ~ ., data = gbm_train,
                  distribution = 'bernoulli',
                  n.trees = 5,
                  interaction.depth = 5,
                  shrinkage = 0.01,
                  bag.fraction = 0.5,
                  cv.folds = 5,
                  n.cores = 3)
# Use cross-validation to choose the best number of boosting iterations
best.iter <- gbm.perf(fraud_gbdt, method = 'cv')
best.iter
# Variable importance
summary(fraud_gbdt, n.trees = best.iter)
# Partial dependence plots: the marginal effect of a variable on the response,
# holding the other variables fixed
plot(fraud_gbdt, 1, best.iter)
plot(fraud_gbdt, 3, best.iter)
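The CV-selected iteration count is what should be used at scoring time; predict.gbm takes it through n.trees, and type = "response" returns probabilities on the Bernoulli scale:
# score the test set at the best iteration found by cross-validation
fraud_gbdt_prob <- predict(fraud_gbdt, test_set, n.trees = best.iter, type = "response")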
# Train the model with caret
library(caret)
# Training control settings
fitControl <- trainControl(method = "cv", number = 10, returnResamp = "all")
fraud_gbdt_new <- train(FRAUD_FLAG ~ ., data = TrainingSet_smote, method = 'gbm',
                        distribution = 'bernoulli',
                        trControl = fitControl, verbose = FALSE,
                        tuneGrid = data.frame(.n.trees = 1000,
                                              .shrinkage = 0.05,
                                              .interaction.depth = 5,
                                              .n.minobsinnode = 10))
fraud_gbdt_new
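The single-row tuneGrid above fixes every hyperparameter; to actually search, pass a multi-row grid and let caret cross-validate each combination. A hypothetical wider grid (cost grows multiplicatively with every added dimension):
gbm_grid <- expand.grid(n.trees = c(500, 1000),
                        shrinkage = c(0.01, 0.05),
                        interaction.depth = c(3, 5),
                        n.minobsinnode = 10)
# then: train(..., tuneGrid = gbm_grid)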
# Score the test set
fraud_gbdt_new_pred <- predict(fraud_gbdt_new, test_set, type = 'raw')
# Confusion matrix
CrossTable(test_set$FRAUD_FLAG, fraud_gbdt_new_pred,
           prop.chisq = FALSE, prop.c = FALSE, prop.r = TRUE,
           dnn = c('ACTUAL FRAUD','PREDICTED FRAUD'))
xgboost
# -------------------------------- xgboost -----------------------------------------------
library(xgboost)
xgb_train_data <- data.matrix(subset(TrainingSet_smote, select = -FRAUD_FLAG))
# labels must be numeric 0/1, not a factor
xgb_train_label <- as.numeric(as.character(TrainingSet_smote$FRAUD_FLAG))
# build the training DMatrix; missing = -10000 marks the missing-value code
xgb_train <- xgb.DMatrix(data = xgb_train_data, label = xgb_train_label,
                         missing = -10000)
xgb_test_data <- data.matrix(subset(test_set, select = -FRAUD_FLAG))
xgb_test_label <- test_set$FRAUD_FLAG
# scale_pos_weight: ratio of negative to positive cases, to offset class imbalance
p <- nrow(TrainingSet_smote[TrainingSet_smote$FRAUD_FLAG == 1, ])
n <- nrow(TrainingSet_smote[TrainingSet_smote$FRAUD_FLAG == 0, ])
weight <- n / p
param <- list(objective = "binary:logistic",
              booster = "gbtree",
              gamma = 0,
              lambda = 700,
              subsample = 0.5,
              colsample_bytree = 0.3,
              scale_pos_weight = weight,
              # xgb.train takes one value per parameter; loop over candidate
              # grids to tune, e.g. eta in c(0.01,0.02,0.05,0.1,0.2,0.5,1),
              # max_depth in c(20,15,10,5,3), min_child_weight in c(1,1.5,2,3,5,8,10)
              eta = 0.05, max_depth = 10,
              eval_metric = "auc", silent = 1,
              nthread = 4, min_child_weight = 2)
nround <- 1000
xgb_model <- xgb.train(param, xgb_train, nround)
xgb_model
# Candidate scoring sets: test_set, TrainingSet_smote, train_set, fraud_date_in_step_imp_var, fraud_date_out_step_imp_var
set_in <- data.matrix(subset(test_set, select = -FRAUD_FLAG))
set_in_label <- test_set$FRAUD_FLAG
# The scoring set must match the training columns; with binary:logistic,
# predict() returns the fraud probability directly (there is no type argument)
fraud_pred <- predict(xgb_model, set_in)
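To score the result with the same metric used in training, pROC works directly on the label vector and the predicted probabilities:
library(pROC)
# AUC of the xgboost scores on the held-out set
auc(roc(set_in_label, fraud_pred))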
Bagged SVM
# -------------------------------- Bagged (bootstrap-aggregated) SVM --------------------------------------
# library(kernlab)
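A minimal manual bagging sketch around kernlab::ksvm (the library hinted at above): train B SVMs on bootstrap resamples and majority-vote their predictions. B = 25 and the rbfdot kernel are illustrative choices; train_smote_set3 and TestSet are the sets used earlier:
library(kernlab)
set.seed(300)
B <- 25   # number of bootstrap resamples, an illustrative choice
votes <- sapply(1:B, function(b) {
  idx <- sample(nrow(train_smote_set3), replace = TRUE)   # bootstrap resample
  fit <- ksvm(FRAUD_FLAG ~ ., data = train_smote_set3[idx, ], kernel = "rbfdot")
  as.character(predict(fit, TestSet))
})
# majority vote across the B component SVMs
fraud_bag_pred <- apply(votes, 1, function(v) names(which.max(table(v))))
table(TestSet$FRAUD_FLAG, fraud_bag_pred)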
Finally, a few words on my experience applying these algorithms. I build models at an auto-loan finance company and have recently been working on anti-fraud. Working through these models has convinced me that one saying really is classic: data and features determine the upper bound of performance; models and algorithms only determine how closely you approach that bound. Anyone working in this area should take that sentence seriously. What I found in practice is that poor data quality and weak features hurt model performance badly, while the choice of algorithm often makes surprisingly little difference.
The above is purely my personal opinion; discussion and feedback are welcome!