代码注释比较详细,直接上代码(数据见文末附录):
library(rpart)
library(rpart.plot)

# Decision tree 1: dermatology data ----
# The data file is picked interactively (the appendix of this post names
# Dermatology1.txt). The code below needs a column called V35 that holds the
# class codes 1-6, so fail early if it is missing.
data <- read.table(file.choose(), header = TRUE)
stopifnot("V35" %in% names(data))

# Recode the numeric class codes 1-6 as a labelled factor.
data$V35 <- factor(
  data$V35,
  levels = 1:6,
  labels = c("牛皮癣", "SE皮炎", "扁平苔癣", "玫瑰癣", "国外皮炎", "毛发红癣")
)

# Fix the RNG seed so the train/validation split is reproducible.
set.seed(1)
# Draw 70% of the row indices for training; the remaining rows are held out.
train <- sample(nrow(data), 0.7 * nrow(data))
data.train <- data[train, ]
data.validata <- data[-train, ]

# Grow a classification tree on the training rows, splitting on the Gini index.
dtree <- rpart(
  V35 ~ .,
  data = data.train,
  method = "class",
  parms = list(split = "gini")
)
rpart.plot(
  dtree,
  type = 2, extra = 102,
  shadow.col = "gray", box.col = "green",
  border.col = "blue", split.col = "red",
  split.cex = 1, main = "决策树"
)

# Confusion matrix of the unpruned tree on the held-out rows. The dimension
# names match the table built after pruning so the two can be compared.
dtree.pred <- predict(dtree, data.validata, type = "class")
dtree.perf <- table(
  data.validata$V35, dtree.pred,
  dnn = c("actual", "predicted")
)
dtree.perf

# Inspect the complexity-parameter (cp) table and its plot to choose a
# pruning threshold.
dtree$cptable
plotcp(dtree)

# Prune with cp = 0.08.
# NOTE(review): 0.08 is hard-coded, presumably read off the cp table/plot of
# the original run; re-check it whenever the data or the seed changes.
dtree.pruned <- prune(dtree, cp = 0.08)
rpart.plot(
  dtree.pruned,
  type = 2, extra = 102,
  shadow.col = "gray", box.col = "green",
  border.col = "blue", split.col = "red",
  split.cex = 1, main = "决策树"
)

# Predict again with the pruned tree and rebuild the confusion matrix.
dtree.pred <- predict(dtree.pruned, data.validata, type = "class")
dtree.perf <- table(
  data.validata$V35, dtree.pred,
  dnn = c("actual", "predicted")
)
dtree.perf
例:set.seed() 对 sample() 随机抽样结果的影响(设定种子后的抽样可以重复,未重新设定种子的抽样结果不同):
# Example: how set.seed() affects sample() ----
# First draw: taken right after seeding, so it is the same on every run.
x <- seq_len(1000)
set.seed(1)
sample(x, size = 10)
# Second draw: the generator is not re-seeded, so it continues from the state
# left by the first draw and returns a different sample.
x <- seq_len(1000)
sample(x, size = 10)
# Decision tree 2: Titanic data ----
# The data file is picked interactively and read as header-less CSV.
# NOTE(review): the appendix of this post lists the attachment as Titanic.xls,
# while read.csv() expects delimited text. Confirm the file is exported to
# CSV before it is selected here.
data1 <- read.csv(file.choose(), header = FALSE)
# Name the four columns of data1.
names(data1) <- c("class", "age", "gender", "state")

# Reproducible 70/30 split into training and validation rows.
set.seed(1)
train1 <- sample(nrow(data1), 0.7 * nrow(data1))
data1.train1 <- data1[train1, ]
data1.validate1 <- data1[-train1, ]

# Fit on the TRAINING rows only. Fitting on all of data1 would let the
# validation rows leak into the model and make the confusion matrices below
# look better than they are. Splits use information gain (entropy).
dtree1 <- rpart(
  state ~ gender + age + class,
  data = data1.train1,
  method = "class",
  parms = list(split = "information")
)
rpart.plot(
  dtree1,
  type = 2, extra = 102,
  shadow.col = "gray", box.col = "green",
  border.col = "blue", split.col = "red",
  split.cex = 1, main = "决策树"
)

# Confusion matrix of the unpruned tree on the held-out rows.
dtree1.pred1 <- predict(dtree1, data1.validate1, type = "class")
dtree1.perf1 <- table(
  data1.validate1$state, dtree1.pred1,
  dnn = c("actual", "predict")
)
dtree1.perf1

# Inspect the complexity-parameter table and its plot before pruning.
dtree1$cptable
plotcp(dtree1)

# Prune with cp = 0.03.
# NOTE(review): 0.03 is hard-coded and was presumably chosen from the cp table
# of a tree fitted on the full data; re-check it against the table above.
dtree1.pruned <- prune(dtree1, cp = 0.03)
rpart.plot(
  dtree1.pruned,
  type = 2, extra = 102,
  shadow.col = "gray", box.col = "green",
  border.col = "blue", split.col = "red",
  split.cex = 1, main = "决策树"
)

# Predict again with the pruned tree and rebuild the confusion matrix.
dtree1.pred1 <- predict(dtree1.pruned, data1.validate1, type = "class")
dtree1.perf1 <- table(
  data1.validate1$state, dtree1.pred1,
  dnn = c("actual", "predict")
)
dtree1.perf1
# Decision tree 3: kyphosis data (ships with the rpart package) ----
# Reproducible 70/30 split into training and validation rows.
set.seed(12)
train2 <- sample(nrow(kyphosis), 0.7 * nrow(kyphosis))
# Index with train2, the split drawn for THIS data set. Reusing the `train`
# vector from the first tree would apply row numbers of a different data
# frame, which produces NA rows and a wrong hold-out set.
kyphosis.train <- kyphosis[train2, ]
kyphosis.validate <- kyphosis[-train2, ]

# Fit on the training rows only so the validation rows stay unseen.
# Splits use information gain (entropy).
dtree2 <- rpart(
  Kyphosis ~ Age + Number + Start,
  data = kyphosis.train,
  method = "class",
  parms = list(split = "information")
)
rpart.plot(
  dtree2,
  type = 2, extra = 2,
  shadow.col = "gray", box.col = "green",
  border.col = "blue", split.col = "red",
  split.cex = 1, main = "决策树"
)

# Confusion matrix on the held-out rows (rows: actual, columns: predicted).
dtree2.pred2 <- predict(dtree2, kyphosis.validate, type = "class")
dtree2.perf2 <- table(kyphosis.validate$Kyphosis, dtree2.pred2)
dtree2.perf2

# Inspect the complexity-parameter table and its plot.
dtree2$cptable
plotcp(dtree2)
# The original author judged that no pruning was needed here.
# NOTE(review): that judgement was made on a tree fitted to the full data;
# re-check the cp table above for the tree fitted on the training rows.
附录:第一个决策树的数据
Dermatology1.txt
(25.85 KB, 需要: 5 个论坛币)
第二个决策树的数据
Titanic.xls
(64.96 KB, 需要: 5 个论坛币)
第三个决策树的数据是R包中自带的数据kyphosis
有疑问可以留言。


雷达卡



京公网安备 11010802022788号







