######################
# lecture 25 - Random Forest, boosting & bagging
require(rpart)
require(randomForest)
require(ada)

# reading in data
data = read.csv("spam_dat.csv", header=FALSE)
ndat = read.delim("spam_vars.txt", header=FALSE)
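# spam_dat.csv is presumably the UCI Spambase data: columns 1-57 are numeric
# word/character-frequency features and column 58 is the 0/1 spam indicator;
# spam_vars.txt holds the corresponding variable names, parsed below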
# parsing variable names
nams = NULL
for(i in 1:nrow(ndat))
{
  vec = strsplit(as.character(ndat[i,]), split="_")
  for(j in 1:length(vec[[1]]))
  {
    if(length(grep(":", vec[[1]][j])) > 0)
    {
      vars = strsplit(vec[[1]][j], split=":")
      nams = c(nams, vars[[1]][1])
    }
  }
}
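# each "_"-separated token that contains a ":" contributes the part before the colon,
# so nams collects one short name per variable; these become the column names of X below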
Y = data[,58]
n = length(Y)
sum(Y)/n
X = as.matrix(log(1 + data[,1:57]))
colnames(X) = nams
X = scale(X)/sqrt(n-1)
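# sum(Y)/n reports the overall spam fraction; log(1 + x) tames the heavily
# right-skewed frequency features, and scale()/sqrt(n-1) standardizes each column
# to mean 0 and unit length
# (optional) fix the seed so the 30% test split below is reproducible, e.g.:
# set.seed(2011)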
ind = sample(1:nrow(X), floor(nrow(X)*.3))
xts = X[ind,];  xtr = X[-ind,]
yts = Y[ind];   ytr = Y[-ind]
trfdat = data.frame(as.factor(ytr), xtr)
names(trfdat)[1] = "ytr"
trdat = data.frame(ytr, xtr)
tsdat = data.frame(yts, xts)
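# 30% of the rows are held out as a test set; trfdat carries the response as a factor
# (so randomForest does classification), while trdat/tsdat keep the numeric 0/1 labels
# used by rpart/glm and by the error calculations
# convention throughout: each *.err object is c(training error, test error),
# i.e. the fraction of misclassified messages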
# trees
fit1 = rpart(ytr~., data=trdat, method="class", xval=5, cp=.01)
trpred = apply(predict(fit1), 1, which.max) - 1
tree.trerr = sum(abs(ytr - trpred))/length(ytr)
tspred = apply(predict(fit1, newdata=data.frame(xts)), 1, which.max) - 1
tree.tserr = sum(abs(yts - tspred))/length(yts)
tree.err = c(tree.trerr, tree.tserr)
tree.err
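# note: predict(fit1) returns a matrix of class probabilities with one column per class,
# so which.max picks column 1 or 2 and subtracting 1 recovers the 0/1 label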
# logistic regression
fit2 = glm(ytr~., data=trdat, family="binomial")
trpred = (sign(predict(fit2)) + 1)/2
logit.trerr = sum(abs(ytr - trpred))/length(ytr)
tspred = (sign(predict(fit2, newdata=data.frame(xts))) + 1)/2
logit.tserr = sum(abs(yts - tspred))/length(yts)
logit.err = c(logit.trerr, logit.tserr)
logit.err
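# predict.glm() with no "type" argument returns the linear predictor (log-odds),
# so sign(.) > 0 corresponds to a fitted probability above 0.5 and (sign + 1)/2
# maps {-1, +1} to the 0/1 labels; with 57 predictors glm() may warn about fitted
# probabilities numerically 0 or 1, but the 0/1 classifications are still usable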
####### boosting
# adaboost
fit3 = ada(x=xtr, y=ytr, loss="exponential", type="discrete", iter=100)
ab.trerr = (fit3$confusion[2,1] + fit3$confusion[1,2])/length(ytr)
plot(fit3)
tspred = round(predict(fit3, newdata=data.frame(xts), type="probs"))
ab.tserr = sum(abs(yts - (apply(tspred,1,which.max)-1)))/length(yts)
ab.err = c(ab.trerr, ab.tserr)
ab.err
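# loss="exponential" with type="discrete" runs discrete AdaBoost for 100 iterations;
# the training error is read off the off-diagonal of the stored confusion matrix, and
# plot(fit3) shows the error as a function of the boosting iteration
# (depending on the installed ada version, predict() may expect type="prob"
# rather than type="probs")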
# logitboost
fit4 = ada(x=xtr, y=ytr, loss="logistic", type="real", iter=100)
lb.trerr = (fit4$confusion[2,1] + fit4$confusion[1,2])/length(ytr)
plot(fit4)
tspred = round(predict(fit4, newdata=data.frame(xts), type="probs"))
lb.tserr = sum(abs(yts - (apply(tspred,1,which.max)-1)))/length(yts)
lb.err = c(lb.trerr, lb.tserr)
lb.err
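# same pipeline as above, but with the logistic loss and "real" (probability-based)
# boosting steps, i.e. the LogitBoost-style variant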
errmat = cbind(tree.err, logit.err, ab.err, lb.err)
errmat
# bagging
fit5 = randomForest(ytr~., mtry=57, data=trfdat, xtest=data.frame(xts), ytest=as.factor(yts))
print(fit5)
# training (out-of-bag) error and test-set error; the test predictions live in
# fit5$test because xtest/ytest were supplied to randomForest()
bag.err = c(sum(abs(ytr - (as.numeric(fit5$predicted) - 1)))/length(ytr),
            sum(abs(yts - (as.numeric(fit5$test$predicted) - 1)))/length(yts))
bag.err
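# mtry=57 makes every predictor a candidate at every split, so this random forest
# reduces to plain bagging of trees; fit5$predicted holds the out-of-bag predictions
# for the training rows, which is why they serve as the "training" error above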
# random forest
fit6 = randomForest(ytr~., mtry=4, data=trfdat, xtest=data.frame(xts), ytest=as.factor(yts))
print(fit6)
rf.err = c(sum(abs(ytr - (as.numeric(fit6$predicted) - 1)))/length(ytr),
           sum(abs(yts - (as.numeric(fit6$test$predicted) - 1)))/length(yts))
rf.err
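# mtry=4 samples only 4 of the 57 predictors as split candidates at each node
# (randomForest's default for classification would be floor(sqrt(57)) = 7), which is
# what distinguishes the random forest from the bagged trees above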
errmat = cbind(tree.err, logit.err, ab.err, lb.err, bag.err, rf.err)
errmat
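# errmat: row 1 = training error, row 2 = test error for each of the six methods
# optional visual comparison of the test errors (second row):
# barplot(errmat[2,], ylab="test misclassification rate", las=2)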