Thread starter: ReneeBK

[Leisure/Other] [Exclusive Release] An Introduction to Statistical Learning with Applications in R


#1 (OP) ReneeBK posted on 2014-11-21 02:25:42


An Introduction to Statistical Learning with Applications in R
Gareth James, Daniela Witten, Trevor Hastie and Robert Tibshirani

Lab code: http://www-bcf.usc.edu/~gareth/ISL/code.html
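The labs below load a number of add-on packages. A minimal setup sketch (package names are simply collected from the library() calls in the code that follows; MASS, boot, class, and splines already ship with a standard R installation):

# one-time installation of the contributed packages used in the Chapter 2-10 labs
install.packages(c("ISLR", "car", "leaps", "glmnet", "pls", "gam",
                   "akima", "tree", "randomForest", "gbm", "e1071", "ROCR"))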


#2 ReneeBK posted on 2014-11-21 02:31:15
# Chapter 2 Lab: Introduction to R

# Basic Commands

x <- c(1,3,2,5)
x
x = c(1,6,2)
x
y = c(1,4,3)
length(x)
length(y)
x+y
ls()
rm(x,y)
ls()
rm(list=ls())
?matrix
x=matrix(data=c(1,2,3,4), nrow=2, ncol=2)
x
x=matrix(c(1,2,3,4),2,2)
matrix(c(1,2,3,4),2,2,byrow=TRUE)
sqrt(x)
x^2
x=rnorm(50)
y=x+rnorm(50,mean=50,sd=.1)
cor(x,y)
set.seed(1303)
rnorm(50)
set.seed(3)
y=rnorm(100)
mean(y)
var(y)
sqrt(var(y))
sd(y)

# Graphics

x=rnorm(100)
y=rnorm(100)
plot(x,y)
plot(x,y,xlab="this is the x-axis",ylab="this is the y-axis",main="Plot of X vs Y")
pdf("Figure.pdf")
plot(x,y,col="green")
dev.off()
x=seq(1,10)
x
x=1:10
x
x=seq(-pi,pi,length=50)
y=x
f=outer(x,y,function(x,y)cos(y)/(1+x^2))
contour(x,y,f)
contour(x,y,f,nlevels=45,add=T)
fa=(f-t(f))/2
contour(x,y,fa,nlevels=15)
image(x,y,fa)
persp(x,y,fa)
persp(x,y,fa,theta=30)
persp(x,y,fa,theta=30,phi=20)
persp(x,y,fa,theta=30,phi=70)
persp(x,y,fa,theta=30,phi=40)

# Indexing Data

A=matrix(1:16,4,4)
A
A[2,3]
A[c(1,3),c(2,4)]
A[1:3,2:4]
A[1:2,]
A[,1:2]
A[1,]
A[-c(1,3),]
A[-c(1,3),-c(1,3,4)]
dim(A)

# Loading Data

Auto=read.table("Auto.data")
fix(Auto)
Auto=read.table("Auto.data",header=T,na.strings="?")
fix(Auto)
Auto=read.csv("Auto.csv",header=T,na.strings="?")
fix(Auto)
dim(Auto)
Auto[1:4,]
Auto=na.omit(Auto)
dim(Auto)
names(Auto)

# Additional Graphical and Numerical Summaries

plot(cylinders, mpg)   # gives an error: cylinders is not visible until Auto is attached
plot(Auto$cylinders, Auto$mpg)
attach(Auto)
plot(cylinders, mpg)
cylinders=as.factor(cylinders)
plot(cylinders, mpg)
plot(cylinders, mpg, col="red")
plot(cylinders, mpg, col="red", varwidth=T)
plot(cylinders, mpg, col="red", varwidth=T,horizontal=T)
plot(cylinders, mpg, col="red", varwidth=T, xlab="cylinders", ylab="MPG")
hist(mpg)
hist(mpg,col=2)
hist(mpg,col=2,breaks=15)
pairs(Auto)
pairs(~ mpg + displacement + horsepower + weight + acceleration, Auto)
plot(horsepower,mpg)
identify(horsepower,mpg,name)
summary(Auto)
summary(mpg)


#3 ReneeBK posted on 2014-11-21 02:32:03
# Chapter 3 Lab: Linear Regression

library(MASS)
library(ISLR)

# Simple Linear Regression

fix(Boston)
names(Boston)
lm.fit=lm(medv~lstat)   # gives an error: medv and lstat are not visible until Boston is attached
lm.fit=lm(medv~lstat,data=Boston)
attach(Boston)
lm.fit=lm(medv~lstat)
lm.fit
summary(lm.fit)
names(lm.fit)
coef(lm.fit)
confint(lm.fit)
predict(lm.fit,data.frame(lstat=(c(5,10,15))), interval="confidence")
predict(lm.fit,data.frame(lstat=(c(5,10,15))), interval="prediction")
plot(lstat,medv)
abline(lm.fit)
abline(lm.fit,lwd=3)
abline(lm.fit,lwd=3,col="red")
plot(lstat,medv,col="red")
plot(lstat,medv,pch=20)
plot(lstat,medv,pch="+")
plot(1:20,1:20,pch=1:20)
par(mfrow=c(2,2))
plot(lm.fit)
plot(predict(lm.fit), residuals(lm.fit))
plot(predict(lm.fit), rstudent(lm.fit))
plot(hatvalues(lm.fit))
which.max(hatvalues(lm.fit))

# Multiple Linear Regression

lm.fit=lm(medv~lstat+age,data=Boston)
summary(lm.fit)
lm.fit=lm(medv~.,data=Boston)
summary(lm.fit)
library(car)
vif(lm.fit)
lm.fit1=lm(medv~.-age,data=Boston)
summary(lm.fit1)
lm.fit1=update(lm.fit, ~.-age)

# Interaction Terms

summary(lm(medv~lstat*age,data=Boston))

# Non-linear Transformations of the Predictors

lm.fit2=lm(medv~lstat+I(lstat^2))
summary(lm.fit2)
lm.fit=lm(medv~lstat)
anova(lm.fit,lm.fit2)
par(mfrow=c(2,2))
plot(lm.fit2)
lm.fit5=lm(medv~poly(lstat,5))
summary(lm.fit5)
summary(lm(medv~log(rm),data=Boston))

# Qualitative Predictors

fix(Carseats)
names(Carseats)
lm.fit=lm(Sales~.+Income:Advertising+Price:Age,data=Carseats)
summary(lm.fit)
attach(Carseats)
contrasts(ShelveLoc)

# Writing Functions

LoadLibraries     # gives an error: the function has not been defined yet
LoadLibraries()
LoadLibraries=function(){
  library(ISLR)
  library(MASS)
  print("The libraries have been loaded.")
}
LoadLibraries
LoadLibraries()


#4 ReneeBK posted on 2014-11-21 02:33:15
# Chapter 4 Lab: Logistic Regression, LDA, QDA, and KNN

# The Stock Market Data

library(ISLR)
names(Smarket)
dim(Smarket)
summary(Smarket)
pairs(Smarket)
cor(Smarket)   # gives an error: the Direction variable is qualitative
cor(Smarket[,-9])
attach(Smarket)
plot(Volume)

# Logistic Regression

glm.fit=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,data=Smarket,family=binomial)
summary(glm.fit)
coef(glm.fit)
summary(glm.fit)$coef
summary(glm.fit)$coef[,4]
glm.probs=predict(glm.fit,type="response")
glm.probs[1:10]
contrasts(Direction)
glm.pred=rep("Down",1250)
glm.pred[glm.probs>.5]="Up"
table(glm.pred,Direction)
(507+145)/1250
mean(glm.pred==Direction)
train=(Year<2005)
Smarket.2005=Smarket[!train,]
dim(Smarket.2005)
Direction.2005=Direction[!train]
glm.fit=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,data=Smarket,family=binomial,subset=train)
glm.probs=predict(glm.fit,Smarket.2005,type="response")
glm.pred=rep("Down",252)
glm.pred[glm.probs>.5]="Up"
table(glm.pred,Direction.2005)
mean(glm.pred==Direction.2005)
mean(glm.pred!=Direction.2005)
glm.fit=glm(Direction~Lag1+Lag2,data=Smarket,family=binomial,subset=train)
glm.probs=predict(glm.fit,Smarket.2005,type="response")
glm.pred=rep("Down",252)
glm.pred[glm.probs>.5]="Up"
table(glm.pred,Direction.2005)
mean(glm.pred==Direction.2005)
106/(106+76)
predict(glm.fit,newdata=data.frame(Lag1=c(1.2,1.5),Lag2=c(1.1,-0.8)),type="response")

# Linear Discriminant Analysis

library(MASS)
lda.fit=lda(Direction~Lag1+Lag2,data=Smarket,subset=train)
lda.fit
plot(lda.fit)
lda.pred=predict(lda.fit, Smarket.2005)
names(lda.pred)
lda.class=lda.pred$class
table(lda.class,Direction.2005)
mean(lda.class==Direction.2005)
sum(lda.pred$posterior[,1]>=.5)
sum(lda.pred$posterior[,1]<.5)
lda.pred$posterior[1:20,1]
lda.class[1:20]
sum(lda.pred$posterior[,1]>.9)

# Quadratic Discriminant Analysis

qda.fit=qda(Direction~Lag1+Lag2,data=Smarket,subset=train)
qda.fit
qda.class=predict(qda.fit,Smarket.2005)$class
table(qda.class,Direction.2005)
mean(qda.class==Direction.2005)

# K-Nearest Neighbors

library(class)
train.X=cbind(Lag1,Lag2)[train,]
test.X=cbind(Lag1,Lag2)[!train,]
train.Direction=Direction[train]
set.seed(1)
knn.pred=knn(train.X,test.X,train.Direction,k=1)
table(knn.pred,Direction.2005)
(83+43)/252
knn.pred=knn(train.X,test.X,train.Direction,k=3)
table(knn.pred,Direction.2005)
mean(knn.pred==Direction.2005)

# An Application to Caravan Insurance Data

dim(Caravan)
attach(Caravan)
summary(Purchase)
348/5822
standardized.X=scale(Caravan[,-86])
var(Caravan[,1])
var(Caravan[,2])
var(standardized.X[,1])
var(standardized.X[,2])
test=1:1000
train.X=standardized.X[-test,]
test.X=standardized.X[test,]
train.Y=Purchase[-test]
test.Y=Purchase[test]
set.seed(1)
knn.pred=knn(train.X,test.X,train.Y,k=1)
mean(test.Y!=knn.pred)
mean(test.Y!="No")
table(knn.pred,test.Y)
9/(68+9)
knn.pred=knn(train.X,test.X,train.Y,k=3)
table(knn.pred,test.Y)
5/26
knn.pred=knn(train.X,test.X,train.Y,k=5)
table(knn.pred,test.Y)
4/15
glm.fit=glm(Purchase~.,data=Caravan,family=binomial,subset=-test)
glm.probs=predict(glm.fit,Caravan[test,],type="response")
glm.pred=rep("No",1000)
glm.pred[glm.probs>.5]="Yes"
table(glm.pred,test.Y)
glm.pred=rep("No",1000)
glm.pred[glm.probs>.25]="Yes"
table(glm.pred,test.Y)
11/(22+11)


#5 ReneeBK posted on 2014-11-21 02:44:21
# Chapter 5 Lab: Cross-Validation and the Bootstrap

# The Validation Set Approach

library(ISLR)
set.seed(1)
train=sample(392,196)
lm.fit=lm(mpg~horsepower,data=Auto,subset=train)
attach(Auto)
mean((mpg-predict(lm.fit,Auto))[-train]^2)
lm.fit2=lm(mpg~poly(horsepower,2),data=Auto,subset=train)
mean((mpg-predict(lm.fit2,Auto))[-train]^2)
lm.fit3=lm(mpg~poly(horsepower,3),data=Auto,subset=train)
mean((mpg-predict(lm.fit3,Auto))[-train]^2)
set.seed(2)
train=sample(392,196)
lm.fit=lm(mpg~horsepower,subset=train)
mean((mpg-predict(lm.fit,Auto))[-train]^2)
lm.fit2=lm(mpg~poly(horsepower,2),data=Auto,subset=train)
mean((mpg-predict(lm.fit2,Auto))[-train]^2)
lm.fit3=lm(mpg~poly(horsepower,3),data=Auto,subset=train)
mean((mpg-predict(lm.fit3,Auto))[-train]^2)

# Leave-One-Out Cross-Validation

glm.fit=glm(mpg~horsepower,data=Auto)
coef(glm.fit)
lm.fit=lm(mpg~horsepower,data=Auto)
coef(lm.fit)
library(boot)
glm.fit=glm(mpg~horsepower,data=Auto)
cv.err=cv.glm(Auto,glm.fit)
cv.err$delta
cv.error=rep(0,5)
for (i in 1:5){
  glm.fit=glm(mpg~poly(horsepower,i),data=Auto)
  cv.error[i]=cv.glm(Auto,glm.fit)$delta[1]
}
cv.error

# k-Fold Cross-Validation

set.seed(17)
cv.error.10=rep(0,10)
for (i in 1:10){
  glm.fit=glm(mpg~poly(horsepower,i),data=Auto)
  cv.error.10[i]=cv.glm(Auto,glm.fit,K=10)$delta[1]
}
cv.error.10

# The Bootstrap

alpha.fn=function(data,index){
  X=data$X[index]
  Y=data$Y[index]
  return((var(Y)-cov(X,Y))/(var(X)+var(Y)-2*cov(X,Y)))
}
alpha.fn(Portfolio,1:100)
set.seed(1)
alpha.fn(Portfolio,sample(100,100,replace=T))
boot(Portfolio,alpha.fn,R=1000)

# Estimating the Accuracy of a Linear Regression Model

boot.fn=function(data,index)
  return(coef(lm(mpg~horsepower,data=data,subset=index)))
boot.fn(Auto,1:392)
set.seed(1)
boot.fn(Auto,sample(392,392,replace=T))
boot.fn(Auto,sample(392,392,replace=T))
boot(Auto,boot.fn,1000)
summary(lm(mpg~horsepower,data=Auto))$coef
boot.fn=function(data,index)
  coefficients(lm(mpg~horsepower+I(horsepower^2),data=data,subset=index))
set.seed(1)
boot(Auto,boot.fn,1000)
summary(lm(mpg~horsepower+I(horsepower^2),data=Auto))$coef


#6 ReneeBK posted on 2014-11-21 02:46:00
# Chapter 6 Lab 1: Subset Selection Methods

# Best Subset Selection

library(ISLR)
fix(Hitters)
names(Hitters)
dim(Hitters)
sum(is.na(Hitters$Salary))
Hitters=na.omit(Hitters)
dim(Hitters)
sum(is.na(Hitters))
library(leaps)
regfit.full=regsubsets(Salary~.,Hitters)
summary(regfit.full)
regfit.full=regsubsets(Salary~.,data=Hitters,nvmax=19)
reg.summary=summary(regfit.full)
names(reg.summary)
reg.summary$rsq
par(mfrow=c(2,2))
plot(reg.summary$rss,xlab="Number of Variables",ylab="RSS",type="l")
plot(reg.summary$adjr2,xlab="Number of Variables",ylab="Adjusted RSq",type="l")
which.max(reg.summary$adjr2)
points(11,reg.summary$adjr2[11], col="red",cex=2,pch=20)
plot(reg.summary$cp,xlab="Number of Variables",ylab="Cp",type='l')
which.min(reg.summary$cp)
points(10,reg.summary$cp[10],col="red",cex=2,pch=20)
which.min(reg.summary$bic)
plot(reg.summary$bic,xlab="Number of Variables",ylab="BIC",type='l')
points(6,reg.summary$bic[6],col="red",cex=2,pch=20)
plot(regfit.full,scale="r2")
plot(regfit.full,scale="adjr2")
plot(regfit.full,scale="Cp")
plot(regfit.full,scale="bic")
coef(regfit.full,6)

# Forward and Backward Stepwise Selection

regfit.fwd=regsubsets(Salary~.,data=Hitters,nvmax=19,method="forward")
summary(regfit.fwd)
regfit.bwd=regsubsets(Salary~.,data=Hitters,nvmax=19,method="backward")
summary(regfit.bwd)
coef(regfit.full,7)
coef(regfit.fwd,7)
coef(regfit.bwd,7)

# Choosing Among Models

set.seed(1)
train=sample(c(TRUE,FALSE), nrow(Hitters),rep=TRUE)
test=(!train)
regfit.best=regsubsets(Salary~.,data=Hitters[train,],nvmax=19)
test.mat=model.matrix(Salary~.,data=Hitters[test,])
val.errors=rep(NA,19)
for(i in 1:19){
   coefi=coef(regfit.best,id=i)
   pred=test.mat[,names(coefi)]%*%coefi
   val.errors[i]=mean((Hitters$Salary[test]-pred)^2)
}
val.errors
which.min(val.errors)
coef(regfit.best,10)
predict.regsubsets=function(object,newdata,id,...){
  form=as.formula(object$call[[2]])
  mat=model.matrix(form,newdata)
  coefi=coef(object,id=id)
  xvars=names(coefi)
  mat[,xvars]%*%coefi
}
regfit.best=regsubsets(Salary~.,data=Hitters,nvmax=19)
coef(regfit.best,10)
k=10
set.seed(1)
folds=sample(1:k,nrow(Hitters),replace=TRUE)
cv.errors=matrix(NA,k,19, dimnames=list(NULL, paste(1:19)))
for(j in 1:k){
  best.fit=regsubsets(Salary~.,data=Hitters[folds!=j,],nvmax=19)
  for(i in 1:19){
    pred=predict(best.fit,Hitters[folds==j,],id=i)
    cv.errors[j,i]=mean( (Hitters$Salary[folds==j]-pred)^2)
  }
}
mean.cv.errors=apply(cv.errors,2,mean)
mean.cv.errors
par(mfrow=c(1,1))
plot(mean.cv.errors,type='b')
reg.best=regsubsets(Salary~.,data=Hitters, nvmax=19)
coef(reg.best,11)


# Chapter 6 Lab 2: Ridge Regression and the Lasso

x=model.matrix(Salary~.,Hitters)[,-1]
y=Hitters$Salary

# Ridge Regression

library(glmnet)
grid=10^seq(10,-2,length=100)
ridge.mod=glmnet(x,y,alpha=0,lambda=grid)
dim(coef(ridge.mod))
ridge.mod$lambda[50]
coef(ridge.mod)[,50]
sqrt(sum(coef(ridge.mod)[-1,50]^2))
ridge.mod$lambda[60]
coef(ridge.mod)[,60]
sqrt(sum(coef(ridge.mod)[-1,60]^2))
predict(ridge.mod,s=50,type="coefficients")[1:20,]
set.seed(1)
train=sample(1:nrow(x), nrow(x)/2)
test=(-train)
y.test=y[test]
ridge.mod=glmnet(x[train,],y[train],alpha=0,lambda=grid, thresh=1e-12)
ridge.pred=predict(ridge.mod,s=4,newx=x[test,])
mean((ridge.pred-y.test)^2)
mean((mean(y[train])-y.test)^2)
ridge.pred=predict(ridge.mod,s=1e10,newx=x[test,])
mean((ridge.pred-y.test)^2)
ridge.pred=predict(ridge.mod,s=0,newx=x[test,],exact=T)
mean((ridge.pred-y.test)^2)
lm(y~x, subset=train)
predict(ridge.mod,s=0,exact=T,type="coefficients")[1:20,]
set.seed(1)
cv.out=cv.glmnet(x[train,],y[train],alpha=0)
plot(cv.out)
bestlam=cv.out$lambda.min
bestlam
ridge.pred=predict(ridge.mod,s=bestlam,newx=x[test,])
mean((ridge.pred-y.test)^2)
out=glmnet(x,y,alpha=0)
predict(out,type="coefficients",s=bestlam)[1:20,]

# The Lasso

lasso.mod=glmnet(x[train,],y[train],alpha=1,lambda=grid)
plot(lasso.mod)
set.seed(1)
cv.out=cv.glmnet(x[train,],y[train],alpha=1)
plot(cv.out)
bestlam=cv.out$lambda.min
lasso.pred=predict(lasso.mod,s=bestlam,newx=x[test,])
mean((lasso.pred-y.test)^2)
out=glmnet(x,y,alpha=1,lambda=grid)
lasso.coef=predict(out,type="coefficients",s=bestlam)[1:20,]
lasso.coef
lasso.coef[lasso.coef!=0]


# Chapter 6 Lab 3: PCR and PLS Regression

# Principal Components Regression

library(pls)
set.seed(2)
pcr.fit=pcr(Salary~., data=Hitters,scale=TRUE,validation="CV")
summary(pcr.fit)
validationplot(pcr.fit,val.type="MSEP")
set.seed(1)
pcr.fit=pcr(Salary~., data=Hitters,subset=train,scale=TRUE, validation="CV")
validationplot(pcr.fit,val.type="MSEP")
pcr.pred=predict(pcr.fit,x[test,],ncomp=7)
mean((pcr.pred-y.test)^2)
pcr.fit=pcr(y~x,scale=TRUE,ncomp=7)
summary(pcr.fit)

# Partial Least Squares

set.seed(1)
pls.fit=plsr(Salary~., data=Hitters,subset=train,scale=TRUE, validation="CV")
summary(pls.fit)
validationplot(pls.fit,val.type="MSEP")
pls.pred=predict(pls.fit,x[test,],ncomp=2)
mean((pls.pred-y.test)^2)
pls.fit=plsr(Salary~., data=Hitters,scale=TRUE,ncomp=2)
summary(pls.fit)


#7 ReneeBK posted on 2014-11-21 02:47:41
# Chapter 7 Lab: Non-linear Modeling

library(ISLR)
attach(Wage)

# Polynomial Regression and Step Functions

fit=lm(wage~poly(age,4),data=Wage)
coef(summary(fit))
fit2=lm(wage~poly(age,4,raw=T),data=Wage)
coef(summary(fit2))
fit2a=lm(wage~age+I(age^2)+I(age^3)+I(age^4),data=Wage)
coef(fit2a)
fit2b=lm(wage~cbind(age,age^2,age^3,age^4),data=Wage)
agelims=range(age)
age.grid=seq(from=agelims[1],to=agelims[2])
preds=predict(fit,newdata=list(age=age.grid),se=TRUE)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
par(mfrow=c(1,2),mar=c(4.5,4.5,1,1),oma=c(0,0,4,0))
plot(age,wage,xlim=agelims,cex=.5,col="darkgrey")
title("Degree-4 Polynomial",outer=T)
lines(age.grid,preds$fit,lwd=2,col="blue")
matlines(age.grid,se.bands,lwd=1,col="blue",lty=3)
preds2=predict(fit2,newdata=list(age=age.grid),se=TRUE)
max(abs(preds$fit-preds2$fit))
fit.1=lm(wage~age,data=Wage)
fit.2=lm(wage~poly(age,2),data=Wage)
fit.3=lm(wage~poly(age,3),data=Wage)
fit.4=lm(wage~poly(age,4),data=Wage)
fit.5=lm(wage~poly(age,5),data=Wage)
anova(fit.1,fit.2,fit.3,fit.4,fit.5)
coef(summary(fit.5))
(-11.983)^2
fit.1=lm(wage~education+age,data=Wage)
fit.2=lm(wage~education+poly(age,2),data=Wage)
fit.3=lm(wage~education+poly(age,3),data=Wage)
anova(fit.1,fit.2,fit.3)
fit=glm(I(wage>250)~poly(age,4),data=Wage,family=binomial)
preds=predict(fit,newdata=list(age=age.grid),se=T)
pfit=exp(preds$fit)/(1+exp(preds$fit))
se.bands.logit = cbind(preds$fit+2*preds$se.fit, preds$fit-2*preds$se.fit)
se.bands = exp(se.bands.logit)/(1+exp(se.bands.logit))
preds=predict(fit,newdata=list(age=age.grid),type="response",se=T)
plot(age,I(wage>250),xlim=agelims,type="n",ylim=c(0,.2))
points(jitter(age), I((wage>250)/5),cex=.5,pch="|",col="darkgrey")
lines(age.grid,pfit,lwd=2, col="blue")
matlines(age.grid,se.bands,lwd=1,col="blue",lty=3)
table(cut(age,4))
fit=lm(wage~cut(age,4),data=Wage)
coef(summary(fit))

# Splines

library(splines)
fit=lm(wage~bs(age,knots=c(25,40,60)),data=Wage)
pred=predict(fit,newdata=list(age=age.grid),se=T)
plot(age,wage,col="gray")
lines(age.grid,pred$fit,lwd=2)
lines(age.grid,pred$fit+2*pred$se,lty="dashed")
lines(age.grid,pred$fit-2*pred$se,lty="dashed")
dim(bs(age,knots=c(25,40,60)))
dim(bs(age,df=6))
attr(bs(age,df=6),"knots")
fit2=lm(wage~ns(age,df=4),data=Wage)
pred2=predict(fit2,newdata=list(age=age.grid),se=T)
lines(age.grid, pred2$fit,col="red",lwd=2)
plot(age,wage,xlim=agelims,cex=.5,col="darkgrey")
title("Smoothing Spline")
fit=smooth.spline(age,wage,df=16)
fit2=smooth.spline(age,wage,cv=TRUE)
fit2$df
lines(fit,col="red",lwd=2)
lines(fit2,col="blue",lwd=2)
legend("topright",legend=c("16 DF","6.8 DF"),col=c("red","blue"),lty=1,lwd=2,cex=.8)
plot(age,wage,xlim=agelims,cex=.5,col="darkgrey")
title("Local Regression")
fit=loess(wage~age,span=.2,data=Wage)
fit2=loess(wage~age,span=.5,data=Wage)
lines(age.grid,predict(fit,data.frame(age=age.grid)),col="red",lwd=2)
lines(age.grid,predict(fit2,data.frame(age=age.grid)),col="blue",lwd=2)
legend("topright",legend=c("Span=0.2","Span=0.5"),col=c("red","blue"),lty=1,lwd=2,cex=.8)

# GAMs

gam1=lm(wage~ns(year,4)+ns(age,5)+education,data=Wage)
library(gam)
gam.m3=gam(wage~s(year,4)+s(age,5)+education,data=Wage)
par(mfrow=c(1,3))
plot(gam.m3, se=TRUE,col="blue")
plot.gam(gam1, se=TRUE, col="red")
gam.m1=gam(wage~s(age,5)+education,data=Wage)
gam.m2=gam(wage~year+s(age,5)+education,data=Wage)
anova(gam.m1,gam.m2,gam.m3,test="F")
summary(gam.m3)
preds=predict(gam.m2,newdata=Wage)
gam.lo=gam(wage~s(year,df=4)+lo(age,span=0.7)+education,data=Wage)
plot.gam(gam.lo, se=TRUE, col="green")
gam.lo.i=gam(wage~lo(year,age,span=0.5)+education,data=Wage)
library(akima)
plot(gam.lo.i)
gam.lr=gam(I(wage>250)~year+s(age,df=5)+education,family=binomial,data=Wage)
par(mfrow=c(1,3))
plot(gam.lr,se=T,col="green")
table(education,I(wage>250))
gam.lr.s=gam(I(wage>250)~year+s(age,df=5)+education,family=binomial,data=Wage,subset=(education!="1. < HS Grad"))
plot(gam.lr.s,se=T,col="green")


#8 ReneeBK posted on 2014-11-21 02:50:48
# Chapter 8 Lab: Decision Trees

# Fitting Classification Trees

library(tree)
library(ISLR)
attach(Carseats)
High=ifelse(Sales<=8,"No","Yes")
Carseats=data.frame(Carseats,High)
tree.carseats=tree(High~.-Sales,Carseats)
summary(tree.carseats)
plot(tree.carseats)
text(tree.carseats,pretty=0)
tree.carseats
set.seed(2)
train=sample(1:nrow(Carseats), 200)
Carseats.test=Carseats[-train,]
High.test=High[-train]
tree.carseats=tree(High~.-Sales,Carseats,subset=train)
tree.pred=predict(tree.carseats,Carseats.test,type="class")
table(tree.pred,High.test)
(86+57)/200
set.seed(3)
cv.carseats=cv.tree(tree.carseats,FUN=prune.misclass)
names(cv.carseats)
cv.carseats
par(mfrow=c(1,2))
plot(cv.carseats$size,cv.carseats$dev,type="b")
plot(cv.carseats$k,cv.carseats$dev,type="b")
prune.carseats=prune.misclass(tree.carseats,best=9)
plot(prune.carseats)
text(prune.carseats,pretty=0)
tree.pred=predict(prune.carseats,Carseats.test,type="class")
table(tree.pred,High.test)
(94+60)/200
prune.carseats=prune.misclass(tree.carseats,best=15)
plot(prune.carseats)
text(prune.carseats,pretty=0)
tree.pred=predict(prune.carseats,Carseats.test,type="class")
table(tree.pred,High.test)
(86+62)/200

# Fitting Regression Trees

library(MASS)
set.seed(1)
train = sample(1:nrow(Boston), nrow(Boston)/2)
tree.boston=tree(medv~.,Boston,subset=train)
summary(tree.boston)
plot(tree.boston)
text(tree.boston,pretty=0)
cv.boston=cv.tree(tree.boston)
plot(cv.boston$size,cv.boston$dev,type='b')
prune.boston=prune.tree(tree.boston,best=5)
plot(prune.boston)
text(prune.boston,pretty=0)
yhat=predict(tree.boston,newdata=Boston[-train,])
boston.test=Boston[-train,"medv"]
plot(yhat,boston.test)
abline(0,1)
mean((yhat-boston.test)^2)

# Bagging and Random Forests

library(randomForest)
set.seed(1)
bag.boston=randomForest(medv~.,data=Boston,subset=train,mtry=13,importance=TRUE)
bag.boston
yhat.bag = predict(bag.boston,newdata=Boston[-train,])
plot(yhat.bag, boston.test)
abline(0,1)
mean((yhat.bag-boston.test)^2)
bag.boston=randomForest(medv~.,data=Boston,subset=train,mtry=13,ntree=25)
yhat.bag = predict(bag.boston,newdata=Boston[-train,])
mean((yhat.bag-boston.test)^2)
set.seed(1)
rf.boston=randomForest(medv~.,data=Boston,subset=train,mtry=6,importance=TRUE)
yhat.rf = predict(rf.boston,newdata=Boston[-train,])
mean((yhat.rf-boston.test)^2)
importance(rf.boston)
varImpPlot(rf.boston)

# Boosting

library(gbm)
set.seed(1)
boost.boston=gbm(medv~.,data=Boston[train,],distribution="gaussian",n.trees=5000,interaction.depth=4)
summary(boost.boston)
par(mfrow=c(1,2))
plot(boost.boston,i="rm")
plot(boost.boston,i="lstat")
yhat.boost=predict(boost.boston,newdata=Boston[-train,],n.trees=5000)
mean((yhat.boost-boston.test)^2)
boost.boston=gbm(medv~.,data=Boston[train,],distribution="gaussian",n.trees=5000,interaction.depth=4,shrinkage=0.2,verbose=F)
yhat.boost=predict(boost.boston,newdata=Boston[-train,],n.trees=5000)
mean((yhat.boost-boston.test)^2)


#9 ReneeBK posted on 2014-11-21 03:03:03
# Chapter 9 Lab: Support Vector Machines

# Support Vector Classifier

set.seed(1)
x=matrix(rnorm(20*2), ncol=2)
y=c(rep(-1,10), rep(1,10))
x[y==1,]=x[y==1,] + 1
plot(x, col=(3-y))
dat=data.frame(x=x, y=as.factor(y))
library(e1071)
svmfit=svm(y~., data=dat, kernel="linear", cost=10,scale=FALSE)
plot(svmfit, dat)
svmfit$index
summary(svmfit)
svmfit=svm(y~., data=dat, kernel="linear", cost=0.1,scale=FALSE)
plot(svmfit, dat)
svmfit$index
set.seed(1)
tune.out=tune(svm,y~.,data=dat,kernel="linear",ranges=list(cost=c(0.001, 0.01, 0.1, 1,5,10,100)))
summary(tune.out)
bestmod=tune.out$best.model
summary(bestmod)
xtest=matrix(rnorm(20*2), ncol=2)
ytest=sample(c(-1,1), 20, rep=TRUE)
xtest[ytest==1,]=xtest[ytest==1,] + 1
testdat=data.frame(x=xtest, y=as.factor(ytest))
ypred=predict(bestmod,testdat)
table(predict=ypred, truth=testdat$y)
svmfit=svm(y~., data=dat, kernel="linear", cost=.01,scale=FALSE)
ypred=predict(svmfit,testdat)
table(predict=ypred, truth=testdat$y)
x[y==1,]=x[y==1,]+0.5
plot(x, col=(y+5)/2, pch=19)
dat=data.frame(x=x,y=as.factor(y))
svmfit=svm(y~., data=dat, kernel="linear", cost=1e5)
summary(svmfit)
plot(svmfit, dat)
svmfit=svm(y~., data=dat, kernel="linear", cost=1)
summary(svmfit)
plot(svmfit,dat)

# Support Vector Machine

set.seed(1)
x=matrix(rnorm(200*2), ncol=2)
x[1:100,]=x[1:100,]+2
x[101:150,]=x[101:150,]-2
y=c(rep(1,150),rep(2,50))
dat=data.frame(x=x,y=as.factor(y))
plot(x, col=y)
train=sample(200,100)
svmfit=svm(y~., data=dat[train,], kernel="radial", gamma=1, cost=1)
plot(svmfit, dat[train,])
summary(svmfit)
svmfit=svm(y~., data=dat[train,], kernel="radial",gamma=1,cost=1e5)
plot(svmfit,dat[train,])
set.seed(1)
tune.out=tune(svm, y~., data=dat[train,], kernel="radial", ranges=list(cost=c(0.1,1,10,100,1000),gamma=c(0.5,1,2,3,4)))
summary(tune.out)
table(true=dat[-train,"y"], pred=predict(tune.out$best.model,newdata=dat[-train,]))   # newdata (not newx) is the argument predict.svm expects

# ROC Curves

library(ROCR)
rocplot=function(pred, truth, ...){
   predob = prediction(pred, truth)
   perf = performance(predob, "tpr", "fpr")
   plot(perf,...)}
svmfit.opt=svm(y~., data=dat[train,], kernel="radial",gamma=2, cost=1,decision.values=T)
fitted=attributes(predict(svmfit.opt,dat[train,],decision.values=TRUE))$decision.values
par(mfrow=c(1,2))
rocplot(fitted,dat[train,"y"],main="Training Data")
svmfit.flex=svm(y~., data=dat[train,], kernel="radial",gamma=50, cost=1, decision.values=T)
fitted=attributes(predict(svmfit.flex,dat[train,],decision.values=T))$decision.values
rocplot(fitted,dat[train,"y"],add=T,col="red")
fitted=attributes(predict(svmfit.opt,dat[-train,],decision.values=T))$decision.values
rocplot(fitted,dat[-train,"y"],main="Test Data")
fitted=attributes(predict(svmfit.flex,dat[-train,],decision.values=T))$decision.values
rocplot(fitted,dat[-train,"y"],add=T,col="red")

# SVM with Multiple Classes

set.seed(1)
x=rbind(x, matrix(rnorm(50*2), ncol=2))
y=c(y, rep(0,50))
x[y==0,2]=x[y==0,2]+2
dat=data.frame(x=x, y=as.factor(y))
par(mfrow=c(1,1))
plot(x,col=(y+1))
svmfit=svm(y~., data=dat, kernel="radial", cost=10, gamma=1)
plot(svmfit, dat)

# Application to Gene Expression Data

library(ISLR)
names(Khan)
dim(Khan$xtrain)
dim(Khan$xtest)
length(Khan$ytrain)
length(Khan$ytest)
table(Khan$ytrain)
table(Khan$ytest)
dat=data.frame(x=Khan$xtrain, y=as.factor(Khan$ytrain))
out=svm(y~., data=dat, kernel="linear",cost=10)
summary(out)
table(out$fitted, dat$y)
dat.te=data.frame(x=Khan$xtest, y=as.factor(Khan$ytest))
pred.te=predict(out, newdata=dat.te)
table(pred.te, dat.te$y)


#10 ReneeBK posted on 2014-11-21 03:12:59
# Chapter 10 Lab 1: Principal Components Analysis

states=row.names(USArrests)
states
names(USArrests)
apply(USArrests, 2, mean)
apply(USArrests, 2, var)
pr.out=prcomp(USArrests, scale=TRUE)
names(pr.out)
pr.out$center
pr.out$scale
pr.out$rotation
dim(pr.out$x)
biplot(pr.out, scale=0)
pr.out$rotation=-pr.out$rotation
pr.out$x=-pr.out$x
biplot(pr.out, scale=0)
pr.out$sdev
pr.var=pr.out$sdev^2
pr.var
pve=pr.var/sum(pr.var)
pve
plot(pve, xlab="Principal Component", ylab="Proportion of Variance Explained", ylim=c(0,1),type='b')
plot(cumsum(pve), xlab="Principal Component", ylab="Cumulative Proportion of Variance Explained", ylim=c(0,1),type='b')
a=c(1,2,8,-3)
cumsum(a)


# Chapter 10 Lab 2: Clustering

# K-Means Clustering

set.seed(2)
x=matrix(rnorm(50*2), ncol=2)
x[1:25,1]=x[1:25,1]+3
x[1:25,2]=x[1:25,2]-4
km.out=kmeans(x,2,nstart=20)
km.out$cluster
plot(x, col=(km.out$cluster+1), main="K-Means Clustering Results with K=2", xlab="", ylab="", pch=20, cex=2)
set.seed(4)
km.out=kmeans(x,3,nstart=20)
km.out
plot(x, col=(km.out$cluster+1), main="K-Means Clustering Results with K=3", xlab="", ylab="", pch=20, cex=2)
set.seed(3)
km.out=kmeans(x,3,nstart=1)
km.out$tot.withinss
km.out=kmeans(x,3,nstart=20)
km.out$tot.withinss

# Hierarchical Clustering

hc.complete=hclust(dist(x), method="complete")
hc.average=hclust(dist(x), method="average")
hc.single=hclust(dist(x), method="single")
par(mfrow=c(1,3))
plot(hc.complete,main="Complete Linkage", xlab="", sub="", cex=.9)
plot(hc.average, main="Average Linkage", xlab="", sub="", cex=.9)
plot(hc.single, main="Single Linkage", xlab="", sub="", cex=.9)
cutree(hc.complete, 2)
cutree(hc.average, 2)
cutree(hc.single, 2)
cutree(hc.single, 4)
xsc=scale(x)
plot(hclust(dist(xsc), method="complete"), main="Hierarchical Clustering with Scaled Features")
x=matrix(rnorm(30*3), ncol=3)
dd=as.dist(1-cor(t(x)))
plot(hclust(dd, method="complete"), main="Complete Linkage with Correlation-Based Distance", xlab="", sub="")


# Chapter 10 Lab 3: NCI60 Data Example

# The NCI60 data

library(ISLR)
nci.labs=NCI60$labs
nci.data=NCI60$data
dim(nci.data)
nci.labs[1:4]
table(nci.labs)

# PCA on the NCI60 Data

pr.out=prcomp(nci.data, scale=TRUE)
Cols=function(vec){
    cols=rainbow(length(unique(vec)))
    return(cols[as.numeric(as.factor(vec))])
}
par(mfrow=c(1,2))
plot(pr.out$x[,1:2], col=Cols(nci.labs), pch=19,xlab="Z1",ylab="Z2")
plot(pr.out$x[,c(1,3)], col=Cols(nci.labs), pch=19,xlab="Z1",ylab="Z3")
summary(pr.out)
plot(pr.out)
pve=100*pr.out$sdev^2/sum(pr.out$sdev^2)
par(mfrow=c(1,2))
plot(pve, type="o", ylab="PVE", xlab="Principal Component", col="blue")
plot(cumsum(pve), type="o", ylab="Cumulative PVE", xlab="Principal Component", col="brown3")

# Clustering the Observations of the NCI60 Data

sd.data=scale(nci.data)
par(mfrow=c(1,3))
data.dist=dist(sd.data)
plot(hclust(data.dist), labels=nci.labs, main="Complete Linkage", xlab="", sub="",ylab="")
plot(hclust(data.dist, method="average"), labels=nci.labs, main="Average Linkage", xlab="", sub="",ylab="")
plot(hclust(data.dist, method="single"), labels=nci.labs, main="Single Linkage", xlab="", sub="",ylab="")
hc.out=hclust(dist(sd.data))
hc.clusters=cutree(hc.out,4)
table(hc.clusters,nci.labs)
par(mfrow=c(1,1))
plot(hc.out, labels=nci.labs)
abline(h=139, col="red")
hc.out
set.seed(2)
km.out=kmeans(sd.data, 4, nstart=20)
km.clusters=km.out$cluster
table(km.clusters,hc.clusters)
hc.out=hclust(dist(pr.out$x[,1:5]))
plot(hc.out, labels=nci.labs, main="Hier. Clust. on First Five Score Vectors")
table(cutree(hc.out,4), nci.labs)

