Thread starter: Lisrelchen

[Lecture Notes] Genevera Allen: Statistical Learning using R and Matlab

#11 · Lisrelchen · posted 2016-5-29 22:21:05
%%%%%%%%%%%%%%
%lecture 11 code
%matlab script - svm example
%primal form problems

cd ~/cvx
cvx_setup
cd ~/classes/stat640_2015/


%%%%%%%%%%%
%separable case - optimal separating hyperplanes

clf
var = 1.25;

n = 100; mu1 = [3; 8]; mu2 = [8; 3];
x1 = repmat(mu1',n/2,1) + randn(n/2,2)*sqrt(var);
x2 = repmat(mu2',n/2,1) + randn(n/2,2)*sqrt(var);
x = [x1; x2];
Y = [repmat(1,50,1); repmat(-1,50,1)];

hold on
scatter(x1(:,1),x1(:,2),'or','filled')
scatter(x2(:,1),x2(:,2),'ob','filled')
axis([min(min(x(:,1))) max(max(x(:,1))) min(min(x(:,2))) max(max(x(:,2)))]);
hold off

cvx_begin
cvx_precision('high')
variable beta0;
variable beta(2);
minimize( .5*square_pos(norm(beta)));
subject to
Y.*(x*beta + beta0) >= 1;
cvx_end
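For reference, this CVX block solves the hard-margin primal problem for the optimal separating hyperplane,

\min_{\beta_0,\beta}\ \tfrac{1}{2}\|\beta\|^2 \quad \text{subject to} \quad y_i(x_i^\top\beta + \beta_0) \ge 1,\ i=1,\dots,n,

and the quantity M = 1/\|\beta\| computed in the plotting code below is the half-width of the resulting margin.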


hold on
scatter(x1(:,1),x1(:,2),'or','filled')
scatter(x2(:,1),x2(:,2),'ob','filled')
axis([min(min(x(:,1))) max(max(x(:,1))) min(min(x(:,2))) max(max(x(:,2)))]);
xs = linspace(min(min(x)),max(max(x)),1000);
plot(xs,(-beta0 - xs*beta(1))/beta(2),'k')
M = 1/norm(beta);
plot(xs,(-beta0 - xs*beta(1)+1)/beta(2) ,'--k')
plot(xs,(-beta0 - xs*beta(1)-1)/beta(2),'--k')
hold off


%%%%%%%%%%%%%%
%non-separable case - linear SVMs

clf
var = 2.8;

n = 100; mu1 = [3; 8]; mu2 = [8; 3];
x1 = repmat(mu1',n/2,1) + randn(n/2,2)*sqrt(var);
x2 = repmat(mu2',n/2,1) + randn(n/2,2)*sqrt(var);
x = [x1; x2];
Y = [repmat(1,50,1); repmat(-1,50,1)];

hold on
scatter(x1(:,1),x1(:,2),'or','filled')
scatter(x2(:,1),x2(:,2),'ob','filled')
axis([min(min(x(:,1))) max(max(x(:,1))) min(min(x(:,2))) max(max(x(:,2)))]);
hold off


gam = 5;

cvx_begin
cvx_precision('high')
variable beta0;
variable beta(2);
variable epsilon(n);
minimize( .5*square_pos(norm(beta)) + gam*sum(epsilon));
subject to
for i=1:n
    Y(i)*(x(i,:)*beta + beta0) >= 1 - epsilon(i);
end
epsilon >= 0;
cvx_end
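With slack variables the problem becomes the soft-margin (linear SVM) primal,

\min_{\beta_0,\beta,\epsilon}\ \tfrac{1}{2}\|\beta\|^2 + \gamma\sum_{i=1}^{n}\epsilon_i \quad \text{subject to} \quad y_i(x_i^\top\beta + \beta_0) \ge 1 - \epsilon_i,\ \ \epsilon_i \ge 0,

where gam plays the role of the cost \gamma; the points with nonzero slack are the ones flagged with '+' in the plot below.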

clf
hold on
scatter(x1(:,1),x1(:,2),'or','filled')
scatter(x2(:,1),x2(:,2),'ob','filled')
axis([min(min(x(:,1))) max(max(x(:,1))) min(min(x(:,2))) max(max(x(:,2)))]);
xs = linspace(min(min(x)),max(max(x)),1000);
plot(xs,(-beta0 - xs*beta(1))/beta(2),'k')
M = 1/norm(beta);
plot(xs,(-beta0 - xs*beta(1)+1)/beta(2) ,'--k')
plot(xs,(-beta0 - xs*beta(1)-1)/beta(2),'--k')
ind = epsilon>1e-6;
scatter(x(ind,1),x(ind,2),'+k','SizeData',150)
ind = abs(x*beta + beta0 + 1)<1e-4;
scatter(x(ind,1),x(ind,2),'Xk','SizeData',150)
ind = abs(x*beta + beta0 - 1)<1e-4;
scatter(x(ind,1),x(ind,2),'Xk','SizeData',150)
hold off

%%%%%%%%%%%%%%
%lecture 11 code
%matlab script - svm example
%primal form problems

cd ~/cvx
cvx_setup
cd ~/classes/stat640_2014/


%%%%%%%%%%%%%%
%non-separable case

clf
var = 2.8;

n = 100; mu1 = [3; 8]; mu2 = [8; 3];
x1 = repmat(mu1',n/2,1) + randn(n/2,2)*sqrt(var);
x2 = repmat(mu2',n/2,1) + randn(n/2,2)*sqrt(var);
x = [x1; x2];
Y = [repmat(1,50,1); repmat(-1,50,1)];

hold on
scatter(x1(:,1),x1(:,2),'or','filled')
scatter(x2(:,1),x2(:,2),'ob','filled')
axis([min(min(x(:,1))) max(max(x(:,1))) min(min(x(:,2))) max(max(x(:,2)))]);
hold off


gam = .1;

cvx_begin
cvx_precision('high')
variable beta0;
variable beta(2);
variable epsilon(n);
minimize( .5*square_pos(norm(beta)) + gam*sum(epsilon));
subject to
for i=1:n
    Y(i)*(x(i,:)*beta + beta0) >= 1 - epsilon(i);
end
epsilon >= 0;
cvx_end

clf
hold on
scatter(x1(:,1),x1(:,2),'or','filled')
scatter(x2(:,1),x2(:,2),'ob','filled')
axis([min(min(x(:,1))) max(max(x(:,1))) min(min(x(:,2))) max(max(x(:,2)))]);
xs = linspace(min(min(x)),max(max(x)),1000);
plot(xs,(-beta0 - xs*beta(1))/beta(2),'k')
M = 1/norm(beta);
plot(xs,(-beta0 - xs*beta(1)+1)/beta(2) ,'--k')
plot(xs,(-beta0 - xs*beta(1)-1)/beta(2),'--k')
ind = epsilon>1e-6;
scatter(x(ind,1),x(ind,2),'+k','SizeData',150)
ind = abs(x*beta + beta0 + 1)<1e-4;
scatter(x(ind,1),x(ind,2),'Xk','SizeData',150)
ind = abs(x*beta + beta0 - 1)<1e-4;
scatter(x(ind,1),x(ind,2),'Xk','SizeData',150)
hold off

%%%%%%%%%%%%%%%%%%%%%
%zip code example
%pair-wise

zip1 = 3; zip2 = 8;

trdat = dlmread('zip.train');
ind = trdat(:,1)==zip1 | trdat(:,1)==zip2;
X = trdat(ind,2:end);
Y = trdat(ind,1);
[n,p] = size(X);
tsdat = dlmread('zip.test');
ind = tsdat(:,1)==zip1 | tsdat(:,1)==zip2;
Xs = tsdat(ind,2:end);
Ys = tsdat(ind,1);

%note: svmtrain/svmclassify are the older Statistics Toolbox interface
%(newer MATLAB releases use fitcsvm/predict instead)
C = .01;
fitsvm = svmtrain(X,Y,'boxconstraint',C);
class = svmclassify(fitsvm,X);
trerr = sum(class~=Y)/length(Y);
class = svmclassify(fitsvm,Xs);
tserr = sum(class~=Ys)/length(Ys);
[trerr tserr]
fitsvm

%plot some of the support vectors
colormap(gray)
for i=1:9
    subplot(3,3,i)
    imagesc(flipud(rot90(reshape(fitsvm.SupportVectors(i,:),16,16))))
    axis off
end

%compare test error to other methods
[classNB,errNB] = classify(Xs,X,Y,'diaglinear');
[classLDA,errLDA] = classify(Xs,X,Y,'linear');
[errNB errLDA tserr]

#12 · Lisrelchen · posted 2016-5-29 22:25:31
%%%%%%%%%%%%%%%%%%%%%%
%lecture 12 code


%zip code data - 3's and 8's

zip1 = 3; zip2 = 8;

trdat = dlmread('zip.train');
ind = trdat(:,1)==zip1 | trdat(:,1)==zip2;
X = trdat(ind,2:end);
Y = trdat(ind,1);
[n,p] = size(X);
tsdat = dlmread('zip.test');
ind = tsdat(:,1)==zip1 | tsdat(:,1)==zip2;
Xs = tsdat(ind,2:end);
Ys = tsdat(ind,1);

%C-SVM - larger C gives a smaller margin (inversely proportional to gamma)

C = 1;
fitsvm = svmtrain(X,Y,'boxconstraint',C);
class = svmclassify(fitsvm,X);
trerr = sum(class~=Y)/length(Y);
class = svmclassify(fitsvm,Xs);
tserr = sum(class~=Ys)/length(Ys);
[trerr tserr]
fitsvm


%plot some of the support vectors
colormap(gray)
for i=1:9
    subplot(3,3,i)
    imagesc(-flipud(rot90(reshape(fitsvm.SupportVectors(i,:),16,16))))
    axis off
end


%compare test error to other methods

[classNB,errNB] = classify(Xs,X,Y,'diaglinear');
tserrNB = sum(classNB~=Ys)/length(Ys);
[classLDA,errLDA] = classify(Xs,X,Y,'linear');
tserrLDA = sum(classLDA~=Ys)/length(Ys);
[tserrNB tserrLDA tserr]




%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%non-linear 2-class SVM example

var = 1.25;
n = 100; mu1 = [0; 0]; cov1 = [1 .8; .8 1]; %symmetric covariance matrix
x1 = repmat(mu1',n/2,1) + randn(n/2,2)*chol(cov1)*sqrt(var);
xs2 = repmat(mu1',n*100,1) + randn(n*100,2)*sqrt(var*3);
ind = sum((xs2.^2)')' > 5;
x2 = xs2(ind,:);
x2 = x2(1:(n/2),:);
x = [x1; x2];
Y = [repmat(1,50,1); repmat(-1,50,1)];

clf
hold on
scatter(x1(:,1),x1(:,2),'or','filled')
scatter(x2(:,1),x2(:,2),'ob','filled')
axis([min(min(x(:,1))) max(max(x(:,1))) min(min(x(:,2))) max(max(x(:,2)))]);
hold off


%linear kernel
C = 1;
svms = svmtrain(x,Y,'kernel_function','linear','showplot','true','method','QP','boxconstraint',C);

%radial kernel
sig = .5;
C = .1;
svms = svmtrain(x,Y,'kernel_function','rbf','showplot','true','method','QP','rbf_sigma',sig,'boxconstraint',C);

%polynomial kernel
d = 2;
C = 1;
svms = svmtrain(x,Y,'kernel_function','polynomial','showplot','true','method','QP','polyorder',d,'boxconstraint',C);
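The three fits above differ only through the kernel. With svmtrain's rbf_sigma as \sigma and polyorder as d, the kernels are, up to MATLAB's exact scaling conventions,

k_{\mathrm{lin}}(x,x') = x^\top x', \qquad k_{\mathrm{rbf}}(x,x') = \exp\!\left(-\frac{\|x-x'\|^2}{2\sigma^2}\right), \qquad k_{\mathrm{poly}}(x,x') = (1 + x^\top x')^d.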




%%%%%%%%%%%%%%%%%%%%%%%
%non-linear regression example

n = 200; p = 1;
x = rand(n,1)*10;
y = sin(x)*4 + randn(n,1);

scatter(x,y,'filled')

%linear
betals = inv(x'*x)*x'*y;
hold on
scatter(x,y,'filled')
xs = linspace(0,10,500);
plot(xs,xs*betals,'k')
hold off

%polynomial kernel regression
%(kern() is a course-supplied helper for building kernel matrices, not a built-in MATLAB function)
d = 2; lam = 5;
k = kern(x,[],ones(p,1),'poly1',d);
alpha = inv(k + lam*eye(n))*y;
clf
hold on
scatter(x,y,'filled')
xs = linspace(0,10,500);
kt = kern(x,xs',ones(p,1),'poly1',d);
plot(xs,kt*alpha,'b')
hold off

%gaussian kernel regression
d = 1; lam = .01;
k = kern(x,[],ones(p,1),'gaussian',d);
alpha = inv(k + lam*eye(n))*y;
clf
hold on
scatter(x,y,'filled')
xs = linspace(0,10,500);
kt = kern(x,xs',ones(p,1),'gaussian',d);
plot(xs,kt*alpha,'b')
hold off
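In both blocks alpha is the kernel ridge regression solution: with kernel matrix K and penalty \lambda,

\hat{\alpha} = (K + \lambda I)^{-1} y, \qquad \hat{f}(x_\ast) = \sum_{i=1}^{n} \hat{\alpha}_i\, k(x_i, x_\ast),

which is exactly what the kt*alpha lines evaluate on the grid xs.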

#13 · Lisrelchen · posted 2016-5-29 22:26:46
##################
#lecture 14 code

require(glmnet)
require(Hmisc)

#model complexity & prediction error

n = 100
p = 50
Btrue = matrix(0,p,1)
Btrue[1:10] = rnorm(10)*1;

Xtr = scale(matrix(rnorm(n*p),n,p))
Ytr = Xtr%*%Btrue + matrix(rnorm(n),n,1)
Xts = scale(matrix(rnorm(n*p),n,p))
Yts = Xts%*%Btrue + matrix(rnorm(n),n,1)

fit = glmnet(x=Xtr,y=Ytr,family="gaussian",standardize=FALSE,nlambda=50,lambda.min.ratio=.001)
Yhtr = predict(fit,newx=Xtr)
MSEtr = apply((Yhtr-Ytr%*%matrix(1,1,length(fit$lambda)))^2,2,mean)
Yhts = predict(fit,newx=Xts)
MSEts = apply((Yhts-Yts%*%matrix(1,1,length(fit$lambda)))^2,2,mean)

plot(1:length(fit$lambda),MSEtr,type="l",col=4,xlab="Sparsity",ylab="Error")
lines(1:length(fit$lambda),MSEtr,col=4)
lines(1:length(fit$lambda),MSEts,col=2)
legend(30,5,legend=c("Training Error","Test Error"),col=c(4,2),lty=c(1,1),cex=.75)

t(t(fit$beta[,15]))


#Cross-Validation (by hand)
#Model Selection

fold = 5
sam = sample(1:n,n)
CVerrs = NULL
for(i in 1:fold)
{
  ind = sam[((i-1)*n/fold + 1):(i*n/fold)]
  Xin = Xtr[-ind,]; Yin = Ytr[-ind]
  Xout = Xtr[ind,]; Yout = Ytr[ind]
  fit = glmnet(x=Xin,y=Yin,family="gaussian",standardize=FALSE,nlambda=50,lambda.min.ratio=.001)
  Yh = predict(fit,newx=Xout)
  CVerrs = cbind(CVerrs,apply((Yh-Yout%*%matrix(1,1,length(fit$lambda)))^2,2,mean))
}
CVerr = apply(CVerrs,1,mean)

#minimum CV error rule
lines(1:length(fit$lambda),CVerr,col=1)
optlam = fit$lambda[which.min(CVerr)]

#one SE rule
SE = sqrt(apply(CVerrs,1,var)/fold)
errbar(1:length(fit$lambda),CVerr,CVerr+SE,CVerr-SE,add=TRUE)
minSE = CVerr[which.min(CVerr)]+SE[which.min(CVerr)]
minSEx = which(CVerr<minSE)[1]
optlam = fit$lambda[minSEx]
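For comparison, glmnet's cv.glmnet automates this fold loop and reports both selection rules directly; a minimal sketch on the same training data (Xtr, Ytr from above):

#built-in K-fold cross-validation over the lasso path
cvfit = cv.glmnet(x=Xtr,y=Ytr,family="gaussian",standardize=FALSE,nfolds=5)
plot(cvfit)                              #CV curve with one-SE error bars
c(cvfit$lambda.min, cvfit$lambda.1se)    #min-CV-error rule vs one-SE rule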


#refit to training data at optimal lambda
fit = glmnet(x=Xtr,y=Ytr,family="gaussian",standardize=FALSE,lambda=optlam)
Yhtr = predict(fit,newx=Xtr)
TRerr = mean( (Yhtr - Ytr)^2 )
Yhts = predict(fit,newx=Xts)
TSerr = mean( (Yhts - Yts)^2 )

optlam
sum(fit$beta!=0)
#why doesn't CV find the "true" model?

TRerr
TSerr

#14 · Lisrelchen · posted 2016-5-29 22:28:13
#############
#lecture 16 - PCA

ncidat = read.table("ncidata.txt")
labs = read.table("ncilabels.txt")
colnames(ncidat) = t(labs)

dim(ncidat)
unique(colnames(ncidat))

#PCA - take SVD to get solution
#center genes, but don't scale
X = t(scale(t(ncidat),center=TRUE,scale=FALSE))
sv = svd(t(X));
U = sv$u
V = sv$v
D = sv$d
Z = t(X)%*%V;

#PC scatterplots
cols = as.numeric(as.factor(colnames(ncidat)))
K = 3
pclabs = c("PC1","PC2","PC3","PC4")
par(mfrow=c(1,K))
for(i in 1:K){
  j = i+1
  plot(U[,i],U[,j],type="n",xlab=pclabs[i],ylab=pclabs[j])
  text(U[,i],U[,j],colnames(X),col=cols)
}

#PC loadings - visualize data by limiting to top genes in magnitude in the PC loadings
aa = grep("grey",colors())
bb = grep("green",colors())
cc = grep("red",colors())
gcol2 = colors()[c(aa[1:30],bb[1:20],rep(cc,2))]

j = 2
ord = order(abs(V[,j]),decreasing=TRUE)
x = as.matrix(X[ord[1:250],])
heatmap(x,col=gcol2)

#Variance Explained
varex = 0
cumvar = 0
denom = sum(D^2)
for(i in 1:64){
  varex[i] = D[i]^2/denom
  cumvar[i] = sum(D[1:i]^2)/denom
}
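The same quantities can be computed without the loop; a vectorized equivalent using the singular values D from above:

varex = D^2/sum(D^2)            #proportion of variance explained by each PC
cumvar = cumsum(D^2)/sum(D^2)   #cumulative proportion of variance explained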

#screeplot
par(mfrow=c(1,2))
plot(1:64,varex,type="l",lwd=2,xlab="PC",ylab="% Variance Explained")
plot(1:64,cumvar,type="l",lwd=2,xlab="PC",ylab="Cumulative Variance Explained")

#######
#Sparse PCA
require("PMA")

spc = SPC(t(X),sumabsv=10,K=4)

#how many genes selected?
apply(spc$v!=0,2,sum)

#PC scatterplots
cols = as.numeric(as.factor(colnames(ncidat)))
K = 3
pclabs = c("SPC1","SPC2","SPC3","SPC4")
par(mfrow=c(1,K))
for(i in 1:K){
  j = i+1
  plot(spc$u[,i],spc$u[,j],type="n",xlab=pclabs[i],ylab=pclabs[j])
  text(spc$u[,i],spc$u[,j],colnames(X),col=cols)
}

#SPC loadings - visualize data by limiting to genes selected by the sparse PC loadings
aa = grep("grey",colors())
bb = grep("green",colors())
cc = grep("red",colors())
gcol2 = colors()[c(aa[1:30],bb[1:20],rep(cc,2))]

j = 1
ind = which(spc$v[,j]!=0)
x = as.matrix(X[ind,])
heatmap(x,col=gcol2)

#variance explained
spc$prop.var.explained

#15 · Lisrelchen · posted 2016-5-29 22:29:46
#######################
#lecture 17 code
#comparing dimension reduction methods


load("UnsupL.Rdata")
imagedigit = function(vec){
  mat = matrix(-vec[256:1],16,16)
  mat = apply(mat,2,rev)
  image(mat,col=gray((0:64)/64),xaxt="n",yaxt="n",ann=FALSE)
}


#pull out 3's
dat3 = digits[which(rownames(digits)==3),]

#visualize
par(mfrow=c(3,4))
for(i in 1:12){
  imagedigit(dat3[i,])
}

#PCA - take SVD to get solution
#don't center and scale to retain interpretation as images
svd3 = svd(dat3)
U = svd3$u
V = svd3$v #PC loadings
D = svd3$d
Z = dat3%*%V #PCs

#PC scatterplot
par(mfrow=c(1,1))
plot(Z[,2],Z[,3],pch=16)

#PC loadings
par(mfrow=c(1,4))
for(i in 1:4){
  imagedigit(V[,i])
}

#Variance Explained
varex = 0
cumvar = 0
denom = sum(D^2)
for(i in 1:256){
  varex[i] = D[i]^2/denom
  cumvar[i] = sum(D[1:i]^2)/denom
}

#screeplot
par(mfrow=c(1,2))
plot(1:256,varex,type="l",lwd=2,xlab="PC",ylab="% Variance Explained")
plot(1:256,cumvar,type="l",lwd=2,xlab="PC",ylab="Cumulative Variance Explained")


cumvar[25] #first 25 PCs explain over 90% of variance
pdat3 = dat3%*%V[,1:25] #projected data - a tenth of the original size


#######
#now NMF

require("NMF")

K = 15
nmffit = nmf(dat3+1,rank=K)
W = basis(nmffit)
H = coef(nmffit)

#plot archetypes - try changing K
par(mfrow=c(3,5))
for(i in 1:K){
  imagedigit(H[i,])
}
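The fit factors the shifted data (the +1 makes every pixel value nonnegative, as NMF requires) as

X \approx W H, \qquad W \in \mathbb{R}_{\ge 0}^{n \times K},\ \ H \in \mathbb{R}_{\ge 0}^{K \times 256},

so each row of H is a nonnegative "archetype" image and each 3 is approximated by a nonnegative combination of the K archetypes.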


###########
#now ICA

require("fastICA")

K = 15
icafit = fastICA(t(dat3),n.comp=K)

#plot independent source signals - try changing K
par(mfrow=c(3,5))
for(i in 1:K){
  imagedigit(icafit$S[,i])
}

#16 · Lisrelchen · posted 2016-5-29 22:30:31
##################
#lecture 18 - K-means clustering


#simulate data
n = 300
mu1 = c(3,3); mu2 = c(7,4); mu3 = c(6,5.5);
Sig = matrix(c(1,.5,.5,1),2,2)
x1 = t(matrix(mu1,2,n/3)) + matrix(rnorm(n*2/3),n/3,2)
xx = matrix(rnorm(n*2/3),n/3,2)
x2 = t(matrix(mu2,2,n/3)) + xx%*%chol(Sig)
xx = matrix(rnorm(n*2/3),n/3,2)
x3 = t(matrix(mu3,2,n/3)) + xx%*%chol(Sig)
x = rbind(x1,x2,x3)
Y = c(rep(1,n/3),rep(2,n/3),rep(3,n/3))
y = factor(Y)

plot(x[,1],x[,2],col=as.numeric(y),pch=16)


#try changing k - which clustering looks best?
k = 3
km = kmeans(x,centers=k)
plot(x[,1],x[,2],col=km$cluster,pch=16)
cens = km$centers
points(cens[,1],cens[,2],col=1:k,pch=16,cex=3)


#code to understand K-means algorithm
require("animation")
mv.kmeans = function(x,k,cens=NULL){
  n = nrow(x)
  if(is.null(cens)){
      cens = x[sample(1:n,k),]
    }
  plot(x[,1],x[,2],pch=16)
  points(cens[,1],cens[,2],col=1:k,pch=16,cex=3)
  thr = 1e-6; ind = 1; iter = 1;
  while( ind>thr)
    {
      oldcen = cens
      km = kmeans(x,centers=cens,iter.max=1,nstart=1,algorithm="MacQueen")
      plot(x[,1],x[,2],col=km$cluster,pch=16)
      points(cens[,1],cens[,2],col=1:k,pch=16,cex=3)
      cens = km$centers
      #print(cens)
      plot(x[,1],x[,2],col=km$cluster,pch=16)
      points(cens[,1],cens[,2],col=1:k,pch=16,cex=3)
      ind = sum(diag((oldcen-cens)%*%t(oldcen-cens)))
      #print(ind)
    }
}
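Each pass through the while loop performs one assign-then-update iteration of K-means, which decreases the within-cluster sum of squares

\sum_{j=1}^{k} \sum_{i \in C_j} \|x_i - \mu_j\|^2,

and the loop stops once the centers move by less than the threshold thr.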

#watch K-means algorithm movie
#start from random starting points
saveHTML(mv.kmeans(x,3,cens=NULL),img.name="km2015")

############
#K-means for high-dimensional data

require("ISLR")

ncidat = t(NCI60$data)
colnames(ncidat) = NCI60$labs

dim(ncidat)
unique(colnames(ncidat))


K = 9
km = kmeans(t(ncidat),centers=K)

#how do we visualize K-means results?

#PCA - take SVD to get solution
#center genes, but don't scale
X = t(scale(t(ncidat),center=TRUE,scale=FALSE))
sv = svd(t(X));
U = sv$u
V = sv$v
D = sv$d
Z = t(X)%*%V;

plot(Z[,1],Z[,2],col=km$cluster,type="n")
text(Z[,1],Z[,2],colnames(ncidat),cex=.75,col=km$cluster)
cens = km$centers
points(cens%*%V[,1],cens%*%V[,2],col=1:K,pch=16,cex=3)

#Re-run and see if solution changes
K = 9
km = kmeans(t(ncidat),centers=K)
plot(Z[,1],Z[,2],col=km$cluster,type="n")
text(Z[,1],Z[,2],colnames(ncidat),cex=.75,col=km$cluster)
cens = km$centers
points(cens%*%V[,1],cens%*%V[,2],col=1:K,pch=16,cex=3)

#try different K
K = 5
km = kmeans(t(ncidat),centers=K)
plot(Z[,1],Z[,2],col=km$cluster,type="n")
text(Z[,1],Z[,2],colnames(ncidat),cex=.75,col=km$cluster)
cens = km$centers
points(cens%*%V[,1],cens%*%V[,2],col=1:K,pch=16,cex=3)

#17 · Lisrelchen · posted 2016-5-29 22:30:59
#################
#lecture 19 code
#hierarchical clustering

require("ISLR")

ncidat = t(NCI60$data)
colnames(ncidat) = NCI60$labs

dim(ncidat)
unique(colnames(ncidat))

#complete linkage - Euclidean distance
cols = as.numeric(as.factor(colnames(ncidat)))
Dmat = dist(t(ncidat))
com.hclust = hclust(Dmat,method="complete")
plot(com.hclust,cex=.7,main="Complete Linkage")

#single linkage
dev.new()
sing.hclust = hclust(Dmat,method="single")
plot(sing.hclust,cex=.7,main="Single Linkage")

#average linkage
dev.new()
ave.hclust = hclust(Dmat,method="average")
plot(ave.hclust,cex=.7,main="Average Linkage")

#Ward's linkage
dev.new()
ward.hclust = hclust(Dmat,method="ward.D")
plot(ward.hclust,cex=.7,main="Ward's Linkage")

#complete linkage with different distances
dev.new()
Dmat = dist(t(ncidat),method="manhattan") #L1 distance
com.hclust = hclust(Dmat,method="complete")
plot(com.hclust,cex=.7,main="Complete Linkage - L1 Dist")
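For reference, these linkages differ only in how the distance between two clusters A and B is defined:

d_{\mathrm{complete}}(A,B)=\max_{a\in A,\,b\in B} d(a,b), \qquad d_{\mathrm{single}}(A,B)=\min_{a\in A,\,b\in B} d(a,b), \qquad d_{\mathrm{average}}(A,B)=\frac{1}{|A|\,|B|}\sum_{a\in A}\sum_{b\in B} d(a,b),

while Ward's linkage merges the pair of clusters whose union gives the smallest increase in total within-cluster variance.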


##########
#Biclustering - Cluster Heatmap

require("ISLR")
ncidat = t(NCI60$data)
colnames(ncidat) = NCI60$labs

#filter genes using PCA
X = t(scale(t(ncidat),center=TRUE,scale=FALSE))
sv = svd(t(X));
V = sv$v

#PC loadings - visualize data by limiting to top genes in magnitude in the PC loadings
aa = grep("grey",colors())
bb = grep("green",colors())
cc = grep("red",colors())
gcol2 = colors()[c(aa[1:30],bb[1:20],rep(cc,2))]

j = 2
ord = order(abs(V[,j]),decreasing=TRUE)
x = as.matrix(X[ord[1:250],])

#cluster heatmap - uses Ward's linkage (complete is default)
heatmap(x,col=gcol2,hclustfun=function(x)hclust(x,method="ward.D"))

######################################################

#18 · Lisrelchen · posted 2016-5-29 22:32:01
##################
#lecture 20 code - graphical models

require("diagram")
require("glasso")

data = as.matrix(read.table("sachs_data.txt"))
labs = c("praf",
"pmek",
"plcg",
"PIP2",
"PIP3",
"p44/42",
"pakts473",
"PKA",
"PKC",
"P38",
"pjnk")
xc = scale(log(data-min(data)+1),center=TRUE,scale=FALSE)
n = nrow(data)


par(mfrow=c(2,3))
lams = c(.0001, .00025, .0005, .00075, .001, .0025)
for(i in 1:6){
  gl = glasso(t(xc)%*%xc/n,rho=lams[i])
  plotweb(gl$wi,legend=FALSE,length=0,names=labs,main=lams[i])
}
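Each panel plots the estimated inverse covariance gl$wi from the graphical lasso, which for sample covariance S and penalty \rho solves

\hat{\Theta} = \arg\max_{\Theta \succ 0}\; \log\det\Theta - \operatorname{tr}(S\Theta) - \rho\,\|\Theta\|_1,

so larger values of \rho zero out more off-diagonal entries, i.e. remove more edges from the estimated protein-signaling network.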

#19 · Lisrelchen · posted 2016-5-29 22:32:58
###############
#lecture 22 - CART
#classification tree example

require(rpart)

#reading in data
data = read.csv("spam_dat.csv",header=FALSE)
ndat = read.delim("spam_vars.txt",header=FALSE)

#parsing variable names
nams = NULL
for(i in 1:nrow(ndat))
{
  vec = strsplit(as.character(ndat[i,]),split="_")
  for(j in 1:length(vec[[1]]))
    {
      if(length(grep(":",vec[[1]][j]))>0)
        {
          vars = strsplit(vec[[1]][j],split=":")
          nams = c(nams,vars[[1]][1])
        }
    }
}

Y = data[,58]
n = length(Y)
sum(Y)/n
X = as.matrix(log(1 + data[,1:57]))
colnames(X) = nams
X = scale(X)/sqrt(n-1)
Yf = as.factor(Y)
levels(Yf) = c("not","spam")
dat = data.frame(Yf,X)


#high cp
ans = rpart(Yf~.,data=dat,method="class",xval=5,cp=.1)
plot(ans,margin=.05)
text(ans,use.n=TRUE)

summary(ans)

plotcp(ans)

nams[49:54]


#medium cp
ans = rpart(Yf~.,data=dat,method="class",xval=5,cp=.01)
plot(ans,margin=.05)
text(ans,use.n=TRUE)

summary(ans)

plotcp(ans)


#low cp
ans = rpart(Yf~.,data=dat,method="class",xval=5,cp=.005)
plot(ans,margin=.05)
text(ans,use.n=TRUE,cex=.75)

summary(ans)

plotcp(ans)
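The cp argument controls cost-complexity pruning: rpart keeps only splits that improve the relative fit by at least cp, which corresponds to minimizing

R_{\alpha}(T) = R(T) + \alpha\,|T|,

where R(T) is the misclassification cost of tree T, |T| its number of terminal nodes, and \alpha scales with cp; plotcp shows the cross-validated error along this pruning sequence, so smaller cp grows larger trees.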


###############
#splitting data into training and testing


trind = sample(1:n,floor(n/2))
trdat = data.frame(Yf[trind],X[trind,])
names(trdat)[1] = "Yf"
tsdat = data.frame(Yf[-trind],X[-trind,])
names(tsdat)[1] = "Yf"


#trees - fit on the training half, evaluate on both halves
ans = rpart(Yf~.,data=trdat,method="class",xval=5,cp=.005)
trerr = sum(predict(ans,newdata=trdat,type="class")!=trdat$Yf)/nrow(trdat)
tserr = sum(predict(ans,newdata=tsdat,type="class")!=tsdat$Yf)/nrow(tsdat)
c(trerr, tserr)

#20 · Lisrelchen · posted 2016-5-29 22:33:33
####################
#lecture 23 - boosting

require(rpart)

#reading in data
data = read.csv("spam_dat.csv",header=FALSE)
ndat = read.delim("spam_vars.txt",header=FALSE)

#parsing variable names
nams = NULL
for(i in 1:nrow(ndat))
{
  vec = strsplit(as.character(ndat[i,]),split="_")
  for(j in 1:length(vec[[1]]))
    {
      if(length(grep(":",vec[[1]][j]))>0)
        {
          vars = strsplit(vec[[1]][j],split=":")
          nams = c(nams,vars[[1]][1])
        }
    }
}

Y = data[,58]
n = length(Y)
sum(Y)/n
X = as.matrix(log(1 + data[,1:57]))
colnames(X) = nams
X = scale(X)/sqrt(n-1)
ind = sample(1:nrow(X),floor(nrow(X)*.3))
xts = X[ind,]; xtr = X[-ind,]
yts = Y[ind]; ytr = Y[-ind]
trdat = data.frame(ytr,xtr)
tsdat = data.frame(yts,xts)


#trees
fit1 = rpart(ytr~.,data=trdat,method="class",xval=5,cp=.01)
trpred = apply(predict(fit1),1,which.max) - 1
tree.trerr = sum(abs(ytr - trpred))/length(ytr)
tspred = apply(predict(fit1,newdata=data.frame(xts)),1,which.max) - 1
tree.tserr = sum(abs(yts - tspred))/length(yts)
tree.err = c(tree.trerr, tree.tserr)
tree.err


#logistic regression
fit2 = glm(ytr~.,data=trdat,family="binomial")
trpred = (sign(predict(fit2)) + 1)/2
logit.trerr = sum(abs(ytr - trpred))/length(ytr)
tspred = (sign(predict(fit2,newdata=data.frame(xts))) + 1)/2
logit.tserr = sum(abs(yts - tspred))/length(yts)
logit.err = c(logit.trerr, logit.tserr)
logit.err


#######boosting
require(ada)

#adaboost
fit3 = ada(x=xtr,y=ytr,loss="exponential",type="discrete",iter=150)
ab.trerr = (fit3$confusion[2,1] + fit3$confusion[1,2])/length(ytr)
plot(fit3)
tspred = round(predict(fit3,newdata=data.frame(xts),type="probs"))
ab.tserr = sum(abs(yts - (apply(tspred,1,which.max)-1)))/length(yts)
ab.err = c(ab.trerr, ab.tserr)
ab.err
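Discrete AdaBoost builds an additive model of small trees by stagewise minimization of the exponential loss: with labels coded \pm 1, after M = iter rounds the classifier is the sign of

F_M(x) = \sum_{m=1}^{M} \alpha_m h_m(x), \qquad L(y,F) = \exp(-y\,F),

while the logitboost fit below keeps the same additive form but swaps in the logistic (binomial deviance) loss.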

#logitboost
fit4 = ada(x=xtr,y=ytr,loss="logistic",type="real",iter=150)
lb.trerr = (fit4$confusion[2,1] + fit4$confusion[1,2])/length(ytr)
plot(fit4)
tspred = round(predict(fit4,newdata=data.frame(xts),type="probs"))
lb.tserr = sum(abs(yts - (apply(tspred,1,which.max)-1)))/length(yts)
lb.err = c(lb.trerr, lb.tserr)
lb.err

errmat = cbind(tree.err,logit.err,ab.err,lb.err)
errmat
