library(rdwd)
library(kernlab)
data('ticdata')
#变量中韩有非标准字符,如”%,“,等等
recodeLevels <- function(x){
x <- as.numeric(x)
##添加零到文本版本
x <- gsub(" ","0",format(as.numeric(x)))
factor(x)
}
#找出那些有常规因子以及有序因子的列
isOrdered <- unlist(lapply(ticdata,is.ordered))
isFactor <- unlist(lapply(ticdata,is.factor))
convertCols <- names(isOrdered)[isOrdered|isFactor]
for(i in convertCols)ticdata[,i] <- recodeLevels(ticdata[,i])
#将层级‘insurance’设置为第一因子层级
ticdata$CARAVAN <- factor(as.character(ticdata$CARAVAN),
levels = rev(levels(ticdata$CARAVAN)))
#通过分层随机抽样得到训练集和测试集
library(caret)
set.seed(156)
split1 <- createDataPartition(ticdata$CARAVAN,p =.7)[[1]]
other <- ticdata[-split1,]
training <- ticdata[split1,]
#创建评估集合和测试集
set.seed(2234)
split2 <- createDataPartition(other$CARAVAN,p = 1/3)[[1]]
evaluation <- other[split2,]
testing <- other[-split2,]
##决定预测变量的名字
predictors <- names(training)[names(training)!= 'CARAVAN']
##第一列为截距,我们将其删除
trainingInd <- data.frame(model.matrix(CARAVAN~.,data = training))[,-1]
evaluationInd <- data.frame(model.matrix(CARAVAN~.,data = evaluation))[,-1]
testingInd <- data.frame(model.matrix(CARAVAN~.,data = testing))[,-1]
##将结果变量放回数据集合
trainingInd$CARAVAN <- training$CARAVAN
evaluationInd$CARAVAN <- evaluation$CARAVAN
testingInd$CARAVAN <- testing$CARAVAN
##去除高度稀疏以及分布不平衡的变量
isNZV <- nearZeroVar(trainingInd)
noNZVSet <- names(trainingInd)[-isNZV]
#我们写了两个封装函数用于得到不同的模型变现测量:
##对于准确度、kappa、Roc线下面积、敏感度和特异度:
fiveStats <- function(...)c(twoClassSummary(...),
defaultSummary(...))
##不包含ROC线下面积
fourStats <- function(data,lev = levels(data$obs),model = NULL){
accKapp <- postResample(data[,'pred'],data[,'obs'])
out <- c(accKapp,
sensitivity(data[,'pred'],data[,'obs'],lev[1]),
specificity(data[,'pred'],data[,'obs'],lev[2]))
names(out)[3:4] <- c('Sens','Spec')
out
}
#建立两个控制函数分别对应如下两种情况:能计算出类概率和不能计算出类概率
ctrl <- trainControl(method = 'cv',
classProbs = TRUE,
summaryFunction = fiveStats,
verboseIter = TRUE)
ctrlNoProb <- ctrl
ctrlNoProb$summaryFunction <- fourStats
ctrlNoProb$classProbs <- FALSE
#在该与法向拟合三个基准模型
set.seed(1410)
rfFit <- train(CARAVAN~.,data = trainingInd,
method = 'rf',
trControl = ctrl,
ntree = 1500,
tuneLength =5,
metric = 'ROC')
Error: At least one of the class levels is not a valid R variable name; This will cause errors when class probabilities are generated because the variables names will be converted to X2, X1 . Please use factor levels that can be used as valid R variable names (see ?make.names for help).