楼主: Lisrelchen
5378 41

[Lecture Notes]Genevera Allen:Statistical Learning using R and Matlab [推广有奖]

21
Lisrelchen(未真实交易用户) 发表于 2016-5-29 22:34:44
  ######################
  # lecture 24 - boosting & bagging
  #
  # Fits a classification tree, a logistic regression, two boosted models
  # (ada) and bagged trees (randomForest) to the spam data, then tabulates
  # their error rates. Every <model>.err vector is
  # c(training-side error, test error); errmat binds them column-wise.

  # library() stops with an error if a package is missing, whereas require()
  # would only return FALSE and let the script fail later.
  library(rpart)
  library(randomForest)
  library(ada)

  # Fraction of observations whose predicted label differs from the truth.
  misclass_rate <- function(truth, pred) {
    mean(truth != pred)
  }

  # Convert a two-column class-probability matrix into 0/1 labels.
  # Assumes column 1 belongs to label 0 and column 2 to label 1; a tie is
  # resolved in favour of label 0.
  labels_from_probs <- function(probs) {
    as.numeric(probs[, 2] > probs[, 1])
  }

  # reading in data
  data <- read.csv("spam_dat.csv", header = FALSE)
  ndat <- read.delim("spam_vars.txt", header = FALSE)

  # parsing variable names
  # Each entry of the first column is split on "_"; from every piece that
  # contains a ":" the text in front of the colon is kept as a name.
  # (presumably the entries look like "word_freq_make: continuous." --
  # TODO confirm against spam_vars.txt)
  name_pieces <- strsplit(as.character(ndat[[1]]), split = "_")
  nams <- unlist(lapply(name_pieces, function(pieces) {
    with_colon <- pieces[grepl(":", pieces, fixed = TRUE)]
    sub(":.*$", "", with_colon)
  }))

  # Column 58 is the response, columns 1:57 the predictors.
  # NOTE(review): the error computations below treat Y as 0/1 -- confirm.
  Y <- data[, 58]
  n <- length(Y)
  mean(Y)
  X <- as.matrix(log(1 + data[, 1:57]))
  stopifnot(length(nams) == ncol(X))
  colnames(X) <- nams
  # scale() centres each column and gives it unit standard deviation;
  # dividing by sqrt(n - 1) then gives each column unit Euclidean norm.
  X <- scale(X) / sqrt(n - 1)

  # Hold out 30% of the rows as a test set. The seed makes the split and the
  # randomized fits below reproducible from run to run.
  set.seed(1)
  ind <- sample(seq_len(nrow(X)), floor(nrow(X) * .3))
  xts <- X[ind, , drop = FALSE]
  xtr <- X[-ind, , drop = FALSE]
  yts <- Y[ind]
  ytr <- Y[-ind]
  # randomForest needs a factor response to do classification.
  trfdat <- data.frame(as.factor(ytr), xtr)
  names(trfdat)[1] <- "ytr"
  trdat <- data.frame(ytr, xtr)
  tsdat <- data.frame(yts, xts)


  # trees
  fit1 <- rpart(ytr ~ ., data = trdat, method = "class", xval = 5, cp = .01)
  trpred <- labels_from_probs(predict(fit1))
  tree.trerr <- misclass_rate(ytr, trpred)
  tspred <- labels_from_probs(predict(fit1, newdata = data.frame(xts)))
  tree.tserr <- misclass_rate(yts, tspred)
  tree.err <- c(tree.trerr, tree.tserr)
  tree.err


  # logistic regression
  fit2 <- glm(ytr ~ ., data = trdat, family = "binomial")
  # predict() returns the linear predictor by default; a positive value
  # means a fitted probability above 1/2. A value of exactly 0 is labelled 0
  # rather than being counted as half an error.
  trpred <- as.numeric(predict(fit2) > 0)
  logit.trerr <- misclass_rate(ytr, trpred)
  tspred <- as.numeric(predict(fit2, newdata = data.frame(xts)) > 0)
  logit.tserr <- misclass_rate(yts, tspred)
  logit.err <- c(logit.trerr, logit.tserr)
  logit.err


  ####### boosting

  # adaboost
  fit3 <- ada(
    x = xtr, y = ytr, loss = "exponential", type = "discrete", iter = 150
  )
  # Off-diagonal cells of the fit's confusion table are the misclassified
  # training rows.
  ab.trerr <- (fit3$confusion[2, 1] + fit3$confusion[1, 2]) / length(ytr)
  plot(fit3)
  tspred <- labels_from_probs(
    predict(fit3, newdata = data.frame(xts), type = "probs")
  )
  ab.tserr <- misclass_rate(yts, tspred)
  ab.err <- c(ab.trerr, ab.tserr)
  ab.err

  # logitboost
  fit4 <- ada(x = xtr, y = ytr, loss = "logistic", type = "real", iter = 150)
  lb.trerr <- (fit4$confusion[2, 1] + fit4$confusion[1, 2]) / length(ytr)
  plot(fit4)
  tspred <- labels_from_probs(
    predict(fit4, newdata = data.frame(xts), type = "probs")
  )
  lb.tserr <- misclass_rate(yts, tspred)
  lb.err <- c(lb.trerr, lb.tserr)
  lb.err


  # bagging
  # Letting every split consider all predictors (mtry = number of columns)
  # turns the random forest into bagged trees.
  fit5 <- randomForest(
    ytr ~ ., mtry = ncol(xtr), data = trfdat,
    xtest = data.frame(xts), ytest = as.factor(yts)
  )
  print(fit5)
  # First entry: error of fit5$predicted, which randomForest computes from
  # out-of-bag votes, so it is an OOB estimate rather than a resubstitution
  # error. Second entry: error of the predictions for the xtest rows, stored
  # under fit5$test. (fit5$err.rate holds OOB rates for the training data
  # and must not be used for the test error.)
  bag.err <- c(
    misclass_rate(as.character(ytr), as.character(fit5$predicted)),
    misclass_rate(as.character(yts), as.character(fit5$test$predicted))
  )
  bag.err


  errmat <- cbind(tree.err, logit.err, ab.err, lb.err, bag.err)
  errmat
复制代码

22
Lisrelchen(未真实交易用户) 发表于 2016-5-29 22:35:23
  ######################
  # lecture 25 - Random Forest, boosting & bagging
  #
  # Fits a classification tree, a logistic regression, two boosted models
  # (ada), bagged trees and a random forest (randomForest) to the spam data,
  # then tabulates their error rates. Every <model>.err vector is
  # c(training-side error, test error); errmat binds them column-wise.

  # library() stops with an error if a package is missing, whereas require()
  # would only return FALSE and let the script fail later.
  library(rpart)
  library(randomForest)
  library(ada)

  # Fraction of observations whose predicted label differs from the truth.
  misclass_rate <- function(truth, pred) {
    mean(truth != pred)
  }

  # Convert a two-column class-probability matrix into 0/1 labels.
  # Assumes column 1 belongs to label 0 and column 2 to label 1; a tie is
  # resolved in favour of label 0.
  labels_from_probs <- function(probs) {
    as.numeric(probs[, 2] > probs[, 1])
  }

  # Error pair for a randomForest fit that was given xtest/ytest:
  # c(error of fit$predicted on the training labels,
  #   error of fit$test$predicted on the test labels).
  # fit$predicted is based on out-of-bag votes, so the first entry is an OOB
  # estimate. fit$err.rate also refers to the training data (OOB) and is
  # therefore not used for the test entry.
  forest_errors <- function(fit, train_labels, test_labels) {
    c(
      misclass_rate(as.character(train_labels), as.character(fit$predicted)),
      misclass_rate(
        as.character(test_labels), as.character(fit$test$predicted)
      )
    )
  }

  # reading in data
  data <- read.csv("spam_dat.csv", header = FALSE)
  ndat <- read.delim("spam_vars.txt", header = FALSE)

  # parsing variable names
  # Each entry of the first column is split on "_"; from every piece that
  # contains a ":" the text in front of the colon is kept as a name.
  # (presumably the entries look like "word_freq_make: continuous." --
  # TODO confirm against spam_vars.txt)
  name_pieces <- strsplit(as.character(ndat[[1]]), split = "_")
  nams <- unlist(lapply(name_pieces, function(pieces) {
    with_colon <- pieces[grepl(":", pieces, fixed = TRUE)]
    sub(":.*$", "", with_colon)
  }))

  # Column 58 is the response, columns 1:57 the predictors.
  # NOTE(review): the error computations below treat Y as 0/1 -- confirm.
  Y <- data[, 58]
  n <- length(Y)
  mean(Y)
  X <- as.matrix(log(1 + data[, 1:57]))
  stopifnot(length(nams) == ncol(X))
  colnames(X) <- nams
  # scale() centres each column and gives it unit standard deviation;
  # dividing by sqrt(n - 1) then gives each column unit Euclidean norm.
  X <- scale(X) / sqrt(n - 1)

  # Hold out 30% of the rows as a test set. The seed makes the split and the
  # randomized fits below reproducible from run to run.
  set.seed(1)
  ind <- sample(seq_len(nrow(X)), floor(nrow(X) * .3))
  xts <- X[ind, , drop = FALSE]
  xtr <- X[-ind, , drop = FALSE]
  yts <- Y[ind]
  ytr <- Y[-ind]
  # randomForest needs a factor response to do classification.
  trfdat <- data.frame(as.factor(ytr), xtr)
  names(trfdat)[1] <- "ytr"
  trdat <- data.frame(ytr, xtr)
  tsdat <- data.frame(yts, xts)


  # trees
  fit1 <- rpart(ytr ~ ., data = trdat, method = "class", xval = 5, cp = .01)
  trpred <- labels_from_probs(predict(fit1))
  tree.trerr <- misclass_rate(ytr, trpred)
  tspred <- labels_from_probs(predict(fit1, newdata = data.frame(xts)))
  tree.tserr <- misclass_rate(yts, tspred)
  tree.err <- c(tree.trerr, tree.tserr)
  tree.err


  # logistic regression
  fit2 <- glm(ytr ~ ., data = trdat, family = "binomial")
  # predict() returns the linear predictor by default; a positive value
  # means a fitted probability above 1/2. A value of exactly 0 is labelled 0
  # rather than being counted as half an error.
  trpred <- as.numeric(predict(fit2) > 0)
  logit.trerr <- misclass_rate(ytr, trpred)
  tspred <- as.numeric(predict(fit2, newdata = data.frame(xts)) > 0)
  logit.tserr <- misclass_rate(yts, tspred)
  logit.err <- c(logit.trerr, logit.tserr)
  logit.err


  ####### boosting

  # adaboost
  fit3 <- ada(
    x = xtr, y = ytr, loss = "exponential", type = "discrete", iter = 100
  )
  # Off-diagonal cells of the fit's confusion table are the misclassified
  # training rows.
  ab.trerr <- (fit3$confusion[2, 1] + fit3$confusion[1, 2]) / length(ytr)
  plot(fit3)
  tspred <- labels_from_probs(
    predict(fit3, newdata = data.frame(xts), type = "probs")
  )
  ab.tserr <- misclass_rate(yts, tspred)
  ab.err <- c(ab.trerr, ab.tserr)
  ab.err

  # logitboost
  fit4 <- ada(x = xtr, y = ytr, loss = "logistic", type = "real", iter = 100)
  lb.trerr <- (fit4$confusion[2, 1] + fit4$confusion[1, 2]) / length(ytr)
  plot(fit4)
  tspred <- labels_from_probs(
    predict(fit4, newdata = data.frame(xts), type = "probs")
  )
  lb.tserr <- misclass_rate(yts, tspred)
  lb.err <- c(lb.trerr, lb.tserr)
  lb.err

  errmat <- cbind(tree.err, logit.err, ab.err, lb.err)
  errmat


  # bagging
  # Letting every split consider all predictors (mtry = number of columns)
  # turns the random forest into bagged trees.
  fit5 <- randomForest(
    ytr ~ ., mtry = ncol(xtr), data = trfdat,
    xtest = data.frame(xts), ytest = as.factor(yts)
  )
  print(fit5)
  bag.err <- forest_errors(fit5, ytr, yts)
  bag.err


  # random forest
  # Only 4 randomly chosen predictors are considered at each split.
  fit6 <- randomForest(
    ytr ~ ., mtry = 4, data = trfdat,
    xtest = data.frame(xts), ytest = as.factor(yts)
  )
  print(fit6)
  rf.err <- forest_errors(fit6, ytr, yts)
  rf.err


  errmat <- cbind(tree.err, logit.err, ab.err, lb.err, bag.err, rf.err)
  errmat
复制代码

23
Nicolle(未真实交易用户) 学生认证  发表于 2016-5-29 22:45:53
提示: 作者被禁止或删除 内容自动屏蔽

24
condmn(真实交易用户) 发表于 2016-5-30 02:21:49

25
jerker(真实交易用户) 发表于 2016-5-30 07:52:26
感谢楼主分享,辛苦了。

26
sqy(真实交易用户) 发表于 2016-5-30 08:50:59
ding!!!!!!!!!

27
nieqiang110(真实交易用户) 学生认证  发表于 2016-5-30 09:20:55
Genevera Allen:Statistical Learning using R and Matlab

28
auirzxp(未真实交易用户) 学生认证  发表于 2016-5-30 17:27:52
提示: 作者被禁止或删除 内容自动屏蔽

29
mike68097(真实交易用户) 发表于 2016-5-31 14:13:36

30
言良殳史(未真实交易用户) 在职认证  发表于 2016-6-1 21:33:48
感谢楼主分享

您需要登录后才可以回帖 登录 | 我要注册

本版微信群
加好友,备注jltj
拉您入交流群
GMT+8, 2026-1-1 09:38