Contents
Data Mining
- The questionnaire requested on June 1 is here. Thank you for your cooperation.
- The report assignment given on June 22 is as follows; the deadline is July 6. Faculty mailboxes are located in front of the J専攻 department office on the third floor of West Building 5 (西5号館). Please submit your report to the box labeled 「山本」 (Yamamoto), third row from the top, second from the right.
- Describe your own research, or the research of a senior student in your laboratory, and present the data and the comparison criteria used to demonstrate its effectiveness or its superiority over other work. Also explain how the data were selected (or how they came to be specified) and what the comparison criteria mean.
- Consider the idea of estimating prediction error introduced in the lectures in light of your own research, and discuss whether it can be applied. If it cannot be applied, also discuss whether the concept of prediction error is simply unnecessary for your research in the first place; if it can be applied, discuss how it would be applied.
- I announced that there would be no lecture on July 27, but the course evaluation survey has to be administered, so a session will be held after all. It will be limited to an explanation of the assignment, so please note that it will start at 10:00. For those who cannot attend, the same material will also be posted here, so please refer to that.
You will be asked to look back over roughly what was covered in the lectures. Run the code below and classify each method as in the following table (a timing sketch is given after the table).
| | Nearly linear / additive methods | Nonlinear methods | More complex methods |
| --- | --- | --- | --- |
| Light methods (fast to run) | | | |
| Heavy methods (slow to run) | | | |
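For reference, here is a minimal sketch of how one runtime entry for the table could be recorded. The choice of models (lm and randomForest on the Boston data) is only an illustration, not the prescribed answer, and randomForest is assumed to be installed (the exercise code below installs it).

library(MASS)           # Boston housing data, also used in the exercise code
library(randomForest)   # an example of a more complex method

t0 <- Sys.time()
fit.lm <- lm(medv ~ ., data = Boston)            # close to linear/additive, light
print(Sys.time() - t0)

t0 <- Sys.time()
fit.rf <- randomForest(medv ~ ., data = Boston)  # more complex, heavier
print(Sys.time() - t0)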
Installing R
The reference R interpreter is the one distributed by the R Project; in recent years Microsoft R Open, which is fully binary-compatible with it, has also been available. The latter supports multithreading, so it is recommended if you are using a machine with plenty of cores.
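If you are not sure how many cores your machine has, the parallel package bundled with R can report it; for example:

# Report the number of logical CPU cores detected on this machine.
parallel::detectCores()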
Using RStudio alongside R is also recommended. Simply install RStudio after installing either of the R distributions above; no configuration is needed.
Upgrading to macOS Sierra appears to remove the command line tools. If you run install.packages() and get an error to the effect that xcrun is missing, run the following command in Terminal:
sudo xcode-select --install
Measuring execution time
Execution time can be displayed by calling the Sys.time() function before and after the lines you want to measure.
time.begin <- Sys.time()
b <- 0
for( i in c(1:100000) ) {
  b <- b+1
}
time.end <- Sys.time()
print(time.end-time.begin)
Wrapping the statements in this way returns the elapsed time between the two calls in seconds.
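As an alternative to wrapping code in two Sys.time() calls, base R also provides system.time(), which measures a single expression directly; for example:

# system.time() prints user, system, and elapsed time for one expression.
system.time({
  b <- 0
  for (i in 1:100000) b <- b + 1
})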
The code for the exercises follows.
# Chapter 2 Lab: Introduction to R

# Basic Commands
x <- c(1,3,2,5)
x
x = c(1,6,2)
x
y = c(1,4,3)
length(x)
length(y)
x+y
ls()
rm(x,y)
ls()
rm(list=ls())
?matrix
x=matrix(data=c(1,2,3,4), nrow=2, ncol=2)
x
x=matrix(c(1,2,3,4),2,2)
matrix(c(1,2,3,4),2,2,byrow=TRUE)
sqrt(x)
x^2
x=rnorm(50)
y=x+rnorm(50,mean=50,sd=.1)
cor(x,y)
set.seed(1303)
rnorm(50)
set.seed(3)
y=rnorm(100)
mean(y)
var(y)
sqrt(var(y))
sd(y)

# Graphics
x=rnorm(100)
y=rnorm(100)
plot(x,y)
plot(x,y,xlab="this is the x-axis",ylab="this is the y-axis",main="Plot of X vs Y")
pdf("Figure.pdf")
plot(x,y,col="green")
dev.off()
x=seq(1,10)
x
x=1:10
x
x=seq(-pi,pi,length=50)
y=x
f=outer(x,y,function(x,y)cos(y)/(1+x^2))
contour(x,y,f)
contour(x,y,f,nlevels=45,add=T)
fa=(f-t(f))/2
contour(x,y,fa,nlevels=15)
image(x,y,fa)
persp(x,y,fa)
persp(x,y,fa,theta=30)
persp(x,y,fa,theta=30,phi=20)
persp(x,y,fa,theta=30,phi=70)
persp(x,y,fa,theta=30,phi=40)

# Indexing Data
A=matrix(1:16,4,4)
A
A[2,3]
A[c(1,3),c(2,4)]
A[1:3,2:4]
A[1:2,]
A[,1:2]
A[1,]
A[-c(1,3),]
A[-c(1,3),-c(1,3,4)]
dim(A)

# Loading Data
Auto=read.table("http://www-bcf.usc.edu/~gareth/ISL/Auto.data")
fix(Auto)
Auto=read.table("http://www-bcf.usc.edu/~gareth/ISL/Auto.data",header=T,na.strings="?")
fix(Auto)
Auto=read.csv("http://www-bcf.usc.edu/~gareth/ISL/Auto.csv",header=T,na.strings="?")
fix(Auto)
dim(Auto)
Auto[1:4,]
Auto=na.omit(Auto)
dim(Auto)
names(Auto)

# Additional Graphical and Numerical Summaries
plot(cylinders, mpg)
plot(Auto$cylinders, Auto$mpg)
attach(Auto)
plot(cylinders, mpg)
cylinders=as.factor(cylinders)
plot(cylinders, mpg)
plot(cylinders, mpg, col="red")
plot(cylinders, mpg, col="red", varwidth=T)
plot(cylinders, mpg, col="red", varwidth=T,horizontal=T)
plot(cylinders, mpg, col="red", varwidth=T, xlab="cylinders", ylab="MPG")
hist(mpg)
hist(mpg,col=2)
hist(mpg,col=2,breaks=15)
pairs(Auto)
pairs(~ mpg + displacement + horsepower + weight + acceleration, Auto)
plot(horsepower,mpg)
identify(horsepower,mpg,name)
summary(Auto)
summary(mpg)

# Chapter 3 Lab: Linear Regression
library(MASS)
install.packages("ISLR",dependencies=TRUE)
library(ISLR)

# Simple Linear Regression
fix(Boston)
names(Boston)
lm.fit=lm(medv~lstat)
lm.fit=lm(medv~lstat,data=Boston)
attach(Boston)
lm.fit=lm(medv~lstat)
lm.fit
summary(lm.fit)
names(lm.fit)
coef(lm.fit)
confint(lm.fit)
predict(lm.fit,data.frame(lstat=(c(5,10,15))), interval="confidence")
predict(lm.fit,data.frame(lstat=(c(5,10,15))), interval="prediction")
plot(lstat,medv)
abline(lm.fit)
abline(lm.fit,lwd=3)
abline(lm.fit,lwd=3,col="red")
plot(lstat,medv,col="red")
plot(lstat,medv,pch=20)
plot(lstat,medv,pch="+")
plot(1:20,1:20,pch=1:20)
par(mfrow=c(2,2))
plot(lm.fit)
plot(predict(lm.fit), residuals(lm.fit))
plot(predict(lm.fit), rstudent(lm.fit))
plot(hatvalues(lm.fit))
which.max(hatvalues(lm.fit))

# Multiple Linear Regression
lm.fit=lm(medv~lstat+age,data=Boston)
summary(lm.fit)
lm.fit=lm(medv~.,data=Boston)
summary(lm.fit)
library(car)
vif(lm.fit)
lm.fit1=lm(medv~.-age,data=Boston)
summary(lm.fit1)
lm.fit1=update(lm.fit, ~.-age)

# Interaction Terms
summary(lm(medv~lstat*age,data=Boston))

# Non-linear Transformations of the Predictors
lm.fit2=lm(medv~lstat+I(lstat^2))
summary(lm.fit2)
lm.fit=lm(medv~lstat)
anova(lm.fit,lm.fit2)
par(mfrow=c(2,2))
plot(lm.fit2)
lm.fit5=lm(medv~poly(lstat,5))
summary(lm.fit5)
summary(lm(medv~log(rm),data=Boston))

# Qualitative Predictors
fix(Carseats)
names(Carseats)
lm.fit=lm(Sales~.+Income:Advertising+Price:Age,data=Carseats)
summary(lm.fit)
attach(Carseats)
contrasts(ShelveLoc)

# Writing Functions
LoadLibraries
LoadLibraries()
LoadLibraries=function(){
  library(ISLR)
  library(MASS)
  print("The libraries have been loaded.")
}
LoadLibraries
LoadLibraries()

# Chapter 4 Lab: Logistic Regression, LDA, QDA, and KNN

# The Stock Market Data
library(ISLR)
names(Smarket)
dim(Smarket)
summary(Smarket)
pairs(Smarket)
cor(Smarket)
cor(Smarket[,-9])
attach(Smarket)
plot(Volume)

# Logistic Regression
glm.fit=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,data=Smarket,family=binomial)
summary(glm.fit)
coef(glm.fit)
summary(glm.fit)$coef
summary(glm.fit)$coef[,4]
glm.probs=predict(glm.fit,type="response")
glm.probs[1:10]
contrasts(Direction)
glm.pred=rep("Down",1250)
glm.pred[glm.probs>.5]="Up"
table(glm.pred,Direction)
(507+145)/1250
mean(glm.pred==Direction)
train=(Year<2005)
Smarket.2005=Smarket[!train,]
dim(Smarket.2005)
Direction.2005=Direction[!train]
glm.fit=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,data=Smarket,family=binomial,subset=train)
glm.probs=predict(glm.fit,Smarket.2005,type="response")
glm.pred=rep("Down",252)
glm.pred[glm.probs>.5]="Up"
table(glm.pred,Direction.2005)
mean(glm.pred==Direction.2005)
mean(glm.pred!=Direction.2005)
glm.fit=glm(Direction~Lag1+Lag2,data=Smarket,family=binomial,subset=train)
glm.probs=predict(glm.fit,Smarket.2005,type="response")
glm.pred=rep("Down",252)
glm.pred[glm.probs>.5]="Up"
table(glm.pred,Direction.2005)
mean(glm.pred==Direction.2005)
106/(106+76)
predict(glm.fit,newdata=data.frame(Lag1=c(1.2,1.5),Lag2=c(1.1,-0.8)),type="response")

# Linear Discriminant Analysis
library(MASS)
lda.fit=lda(Direction~Lag1+Lag2,data=Smarket,subset=train)
lda.fit
plot(lda.fit)
lda.pred=predict(lda.fit, Smarket.2005)
names(lda.pred)
lda.class=lda.pred$class
table(lda.class,Direction.2005)
mean(lda.class==Direction.2005)
sum(lda.pred$posterior[,1]>=.5)
sum(lda.pred$posterior[,1]<.5)
lda.pred$posterior[1:20,1]
lda.class[1:20]
sum(lda.pred$posterior[,1]>.9)

# Quadratic Discriminant Analysis
qda.fit=qda(Direction~Lag1+Lag2,data=Smarket,subset=train)
qda.fit
qda.class=predict(qda.fit,Smarket.2005)$class
table(qda.class,Direction.2005)
mean(qda.class==Direction.2005)

# K-Nearest Neighbors
library(class)
train.X=cbind(Lag1,Lag2)[train,]
test.X=cbind(Lag1,Lag2)[!train,]
train.Direction=Direction[train]
set.seed(1)
knn.pred=knn(train.X,test.X,train.Direction,k=1)
table(knn.pred,Direction.2005)
(83+43)/252
knn.pred=knn(train.X,test.X,train.Direction,k=3)
table(knn.pred,Direction.2005)
mean(knn.pred==Direction.2005)

# An Application to Caravan Insurance Data
dim(Caravan)
attach(Caravan)
summary(Purchase)
348/5822
standardized.X=scale(Caravan[,-86])
var(Caravan[,1])
var(Caravan[,2])
var(standardized.X[,1])
var(standardized.X[,2])
test=1:1000
train.X=standardized.X[-test,]
test.X=standardized.X[test,]
train.Y=Purchase[-test]
test.Y=Purchase[test]
set.seed(1)
knn.pred=knn(train.X,test.X,train.Y,k=1)
mean(test.Y!=knn.pred)
mean(test.Y!="No")
table(knn.pred,test.Y)
9/(68+9)
knn.pred=knn(train.X,test.X,train.Y,k=3)
table(knn.pred,test.Y)
5/26
knn.pred=knn(train.X,test.X,train.Y,k=5)
table(knn.pred,test.Y)
4/15
glm.fit=glm(Purchase~.,data=Caravan,family=binomial,subset=-test)
glm.probs=predict(glm.fit,Caravan[test,],type="response")
glm.pred=rep("No",1000)
glm.pred[glm.probs>.5]="Yes"
table(glm.pred,test.Y)
glm.pred=rep("No",1000)
glm.pred[glm.probs>.25]="Yes"
table(glm.pred,test.Y)
11/(22+11)

# Chapter 5 Lab: Cross-Validation and the Bootstrap

# The Validation Set Approach
library(ISLR)
set.seed(1)
train=sample(392,196)
lm.fit=lm(mpg~horsepower,data=Auto,subset=train)
attach(Auto)
mean((mpg-predict(lm.fit,Auto))[-train]^2)
lm.fit2=lm(mpg~poly(horsepower,2),data=Auto,subset=train)
mean((mpg-predict(lm.fit2,Auto))[-train]^2)
lm.fit3=lm(mpg~poly(horsepower,3),data=Auto,subset=train)
mean((mpg-predict(lm.fit3,Auto))[-train]^2)
set.seed(2)
train=sample(392,196)
lm.fit=lm(mpg~horsepower,subset=train)
mean((mpg-predict(lm.fit,Auto))[-train]^2)
lm.fit2=lm(mpg~poly(horsepower,2),data=Auto,subset=train)
mean((mpg-predict(lm.fit2,Auto))[-train]^2)
lm.fit3=lm(mpg~poly(horsepower,3),data=Auto,subset=train)
mean((mpg-predict(lm.fit3,Auto))[-train]^2)

# Leave-One-Out Cross-Validation
glm.fit=glm(mpg~horsepower,data=Auto)
coef(glm.fit)
lm.fit=lm(mpg~horsepower,data=Auto)
coef(lm.fit)
library(boot)
glm.fit=glm(mpg~horsepower,data=Auto)
cv.err=cv.glm(Auto,glm.fit)
cv.err$delta
cv.error=rep(0,5)
for (i in 1:5){
  glm.fit=glm(mpg~poly(horsepower,i),data=Auto)
  cv.error[i]=cv.glm(Auto,glm.fit)$delta[1]
}
cv.error

# k-Fold Cross-Validation
set.seed(17)
cv.error.10=rep(0,10)
for (i in 1:10){
  glm.fit=glm(mpg~poly(horsepower,i),data=Auto)
  cv.error.10[i]=cv.glm(Auto,glm.fit,K=10)$delta[1]
}
cv.error.10

# The Bootstrap
alpha.fn=function(data,index){
  X=data$X[index]
  Y=data$Y[index]
  return((var(Y)-cov(X,Y))/(var(X)+var(Y)-2*cov(X,Y)))
}
alpha.fn(Portfolio,1:100)
set.seed(1)
alpha.fn(Portfolio,sample(100,100,replace=T))
boot(Portfolio,alpha.fn,R=1000)

# Estimating the Accuracy of a Linear Regression Model
boot.fn=function(data,index) return(coef(lm(mpg~horsepower,data=data,subset=index)))
boot.fn(Auto,1:392)
set.seed(1)
boot.fn(Auto,sample(392,392,replace=T))
boot.fn(Auto,sample(392,392,replace=T))
boot(Auto,boot.fn,1000)
summary(lm(mpg~horsepower,data=Auto))$coef
boot.fn=function(data,index) coefficients(lm(mpg~horsepower+I(horsepower^2),data=data,subset=index))
set.seed(1)
boot(Auto,boot.fn,1000)
summary(lm(mpg~horsepower+I(horsepower^2),data=Auto))$coef

# Chapter 6 Lab 1: Subset Selection Methods

# Best Subset Selection
library(ISLR)
fix(Hitters)
names(Hitters)
dim(Hitters)
sum(is.na(Hitters$Salary))
Hitters=na.omit(Hitters)
dim(Hitters)
sum(is.na(Hitters))
install.packages("leaps",dependencies=TRUE)
library(leaps)
regfit.full=regsubsets(Salary~.,Hitters)
summary(regfit.full)
regfit.full=regsubsets(Salary~.,data=Hitters,nvmax=19)
reg.summary=summary(regfit.full)
names(reg.summary)
reg.summary$rsq
par(mfrow=c(2,2))
plot(reg.summary$rss,xlab="Number of Variables",ylab="RSS",type="l")
plot(reg.summary$adjr2,xlab="Number of Variables",ylab="Adjusted RSq",type="l")
which.max(reg.summary$adjr2)
points(11,reg.summary$adjr2[11], col="red",cex=2,pch=20)
plot(reg.summary$cp,xlab="Number of Variables",ylab="Cp",type='l')
which.min(reg.summary$cp)
points(10,reg.summary$cp[10],col="red",cex=2,pch=20)
which.min(reg.summary$bic)
plot(reg.summary$bic,xlab="Number of Variables",ylab="BIC",type='l')
points(6,reg.summary$bic[6],col="red",cex=2,pch=20)
plot(regfit.full,scale="r2")
plot(regfit.full,scale="adjr2")
plot(regfit.full,scale="Cp")
plot(regfit.full,scale="bic")
coef(regfit.full,6)

# Forward and Backward Stepwise Selection
regfit.fwd=regsubsets(Salary~.,data=Hitters,nvmax=19,method="forward")
summary(regfit.fwd)
regfit.bwd=regsubsets(Salary~.,data=Hitters,nvmax=19,method="backward")
summary(regfit.bwd)
coef(regfit.full,7)
coef(regfit.fwd,7)
coef(regfit.bwd,7)

# Choosing Among Models
set.seed(1)
train=sample(c(TRUE,FALSE), nrow(Hitters),rep=TRUE)
test=(!train)
regfit.best=regsubsets(Salary~.,data=Hitters[train,],nvmax=19)
test.mat=model.matrix(Salary~.,data=Hitters[test,])
val.errors=rep(NA,19)
for(i in 1:19){
  coefi=coef(regfit.best,id=i)
  pred=test.mat[,names(coefi)]%*%coefi
  val.errors[i]=mean((Hitters$Salary[test]-pred)^2)
}
val.errors
which.min(val.errors)
coef(regfit.best,10)
predict.regsubsets=function(object,newdata,id,...){
  form=as.formula(object$call[[2]])
  mat=model.matrix(form,newdata)
  coefi=coef(object,id=id)
  xvars=names(coefi)
  mat[,xvars]%*%coefi
}
regfit.best=regsubsets(Salary~.,data=Hitters,nvmax=19)
coef(regfit.best,10)
k=10
set.seed(1)
folds=sample(1:k,nrow(Hitters),replace=TRUE)
cv.errors=matrix(NA,k,19, dimnames=list(NULL, paste(1:19)))
for(j in 1:k){
  best.fit=regsubsets(Salary~.,data=Hitters[folds!=j,],nvmax=19)
  for(i in 1:19){
    pred=predict(best.fit,Hitters[folds==j,],id=i)
    cv.errors[j,i]=mean( (Hitters$Salary[folds==j]-pred)^2)
  }
}
mean.cv.errors=apply(cv.errors,2,mean)
mean.cv.errors
par(mfrow=c(1,1))
plot(mean.cv.errors,type='b')
reg.best=regsubsets(Salary~.,data=Hitters, nvmax=19)
coef(reg.best,11)

# Chapter 6 Lab 2: Ridge Regression and the Lasso
x=model.matrix(Salary~.,Hitters)[,-1]
y=Hitters$Salary

# Ridge Regression
install.packages("glmnet",dependencies=TRUE)
library(glmnet)
grid=10^seq(10,-2,length=100)
ridge.mod=glmnet(x,y,alpha=0,lambda=grid)
dim(coef(ridge.mod))
ridge.mod$lambda[50]
coef(ridge.mod)[,50]
sqrt(sum(coef(ridge.mod)[-1,50]^2))
ridge.mod$lambda[60]
coef(ridge.mod)[,60]
sqrt(sum(coef(ridge.mod)[-1,60]^2))
predict(ridge.mod,s=50,type="coefficients")[1:20,]
set.seed(1)
train=sample(1:nrow(x), nrow(x)/2)
test=(-train)
y.test=y[test]
ridge.mod=glmnet(x[train,],y[train],alpha=0,lambda=grid, thresh=1e-12)
ridge.pred=predict(ridge.mod,s=4,newx=x[test,])
mean((ridge.pred-y.test)^2)
mean((mean(y[train])-y.test)^2)
ridge.pred=predict(ridge.mod,s=1e10,newx=x[test,])
mean((ridge.pred-y.test)^2)
ridge.pred=predict(ridge.mod,s=0,newx=x[test,],exact=T)
mean((ridge.pred-y.test)^2)
lm(y~x, subset=train)
predict(ridge.mod,s=0,exact=T,type="coefficients")[1:20,]
set.seed(1)
cv.out=cv.glmnet(x[train,],y[train],alpha=0)
plot(cv.out)
bestlam=cv.out$lambda.min
bestlam
ridge.pred=predict(ridge.mod,s=bestlam,newx=x[test,])
mean((ridge.pred-y.test)^2)
out=glmnet(x,y,alpha=0)
predict(out,type="coefficients",s=bestlam)[1:20,]

# The Lasso
lasso.mod=glmnet(x[train,],y[train],alpha=1,lambda=grid)
plot(lasso.mod)
set.seed(1)
cv.out=cv.glmnet(x[train,],y[train],alpha=1)
plot(cv.out)
bestlam=cv.out$lambda.min
lasso.pred=predict(lasso.mod,s=bestlam,newx=x[test,])
mean((lasso.pred-y.test)^2)
out=glmnet(x,y,alpha=1,lambda=grid)
lasso.coef=predict(out,type="coefficients",s=bestlam)[1:20,]
lasso.coef
lasso.coef[lasso.coef!=0]

# Chapter 6 Lab 3: PCR and PLS Regression

# Principal Components Regression
library(pls)
set.seed(2)
pcr.fit=pcr(Salary~., data=Hitters,scale=TRUE,validation="CV")
summary(pcr.fit)
validationplot(pcr.fit,val.type="MSEP")
set.seed(1)
pcr.fit=pcr(Salary~., data=Hitters,subset=train,scale=TRUE, validation="CV")
validationplot(pcr.fit,val.type="MSEP")
pcr.pred=predict(pcr.fit,x[test,],ncomp=7)
mean((pcr.pred-y.test)^2)
pcr.fit=pcr(y~x,scale=TRUE,ncomp=7)
summary(pcr.fit)

# Partial Least Squares
set.seed(1)
pls.fit=plsr(Salary~., data=Hitters,subset=train,scale=TRUE, validation="CV")
summary(pls.fit)
validationplot(pls.fit,val.type="MSEP")
pls.pred=predict(pls.fit,x[test,],ncomp=2)
mean((pls.pred-y.test)^2)
pls.fit=plsr(Salary~., data=Hitters,scale=TRUE,ncomp=2)
summary(pls.fit)

# Chapter 7 Lab: Non-linear Modeling
library(ISLR)
attach(Wage)

# Polynomial Regression and Step Functions
fit=lm(wage~poly(age,4),data=Wage)
coef(summary(fit))
fit2=lm(wage~poly(age,4,raw=T),data=Wage)
coef(summary(fit2))
fit2a=lm(wage~age+I(age^2)+I(age^3)+I(age^4),data=Wage)
coef(fit2a)
fit2b=lm(wage~cbind(age,age^2,age^3,age^4),data=Wage)
agelims=range(age)
age.grid=seq(from=agelims[1],to=agelims[2])
preds=predict(fit,newdata=list(age=age.grid),se=TRUE)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
par(mfrow=c(1,2),mar=c(4.5,4.5,1,1),oma=c(0,0,4,0))
plot(age,wage,xlim=agelims,cex=.5,col="darkgrey")
title("Degree-4 Polynomial",outer=T)
lines(age.grid,preds$fit,lwd=2,col="blue")
matlines(age.grid,se.bands,lwd=1,col="blue",lty=3)
preds2=predict(fit2,newdata=list(age=age.grid),se=TRUE)
max(abs(preds$fit-preds2$fit))
fit.1=lm(wage~age,data=Wage)
fit.2=lm(wage~poly(age,2),data=Wage)
fit.3=lm(wage~poly(age,3),data=Wage)
fit.4=lm(wage~poly(age,4),data=Wage)
fit.5=lm(wage~poly(age,5),data=Wage)
anova(fit.1,fit.2,fit.3,fit.4,fit.5)
coef(summary(fit.5))
(-11.983)^2
fit.1=lm(wage~education+age,data=Wage)
fit.2=lm(wage~education+poly(age,2),data=Wage)
fit.3=lm(wage~education+poly(age,3),data=Wage)
anova(fit.1,fit.2,fit.3)
fit=glm(I(wage>250)~poly(age,4),data=Wage,family=binomial)
preds=predict(fit,newdata=list(age=age.grid),se=T)
pfit=exp(preds$fit)/(1+exp(preds$fit))
se.bands.logit = cbind(preds$fit+2*preds$se.fit, preds$fit-2*preds$se.fit)
se.bands = exp(se.bands.logit)/(1+exp(se.bands.logit))
preds=predict(fit,newdata=list(age=age.grid),type="response",se=T)
plot(age,I(wage>250),xlim=agelims,type="n",ylim=c(0,.2))
points(jitter(age), I((wage>250)/5),cex=.5,pch="|",col="darkgrey")
lines(age.grid,pfit,lwd=2, col="blue")
matlines(age.grid,se.bands,lwd=1,col="blue",lty=3)
table(cut(age,4))
fit=lm(wage~cut(age,4),data=Wage)
coef(summary(fit))

# Splines
library(splines)
fit=lm(wage~bs(age,knots=c(25,40,60)),data=Wage)
pred=predict(fit,newdata=list(age=age.grid),se=T)
plot(age,wage,col="gray")
lines(age.grid,pred$fit,lwd=2)
lines(age.grid,pred$fit+2*pred$se,lty="dashed")
lines(age.grid,pred$fit-2*pred$se,lty="dashed")
dim(bs(age,knots=c(25,40,60)))
dim(bs(age,df=6))
attr(bs(age,df=6),"knots")
fit2=lm(wage~ns(age,df=4),data=Wage)
pred2=predict(fit2,newdata=list(age=age.grid),se=T)
lines(age.grid, pred2$fit,col="red",lwd=2)
plot(age,wage,xlim=agelims,cex=.5,col="darkgrey")
title("Smoothing Spline")
fit=smooth.spline(age,wage,df=16)
fit2=smooth.spline(age,wage,cv=TRUE)
fit2$df
lines(fit,col="red",lwd=2)
lines(fit2,col="blue",lwd=2)
legend("topright",legend=c("16 DF","6.8 DF"),col=c("red","blue"),lty=1,lwd=2,cex=.8)
plot(age,wage,xlim=agelims,cex=.5,col="darkgrey")
title("Local Regression")
fit=loess(wage~age,span=.2,data=Wage)
fit2=loess(wage~age,span=.5,data=Wage)
lines(age.grid,predict(fit,data.frame(age=age.grid)),col="red",lwd=2)
lines(age.grid,predict(fit2,data.frame(age=age.grid)),col="blue",lwd=2)
legend("topright",legend=c("Span=0.2","Span=0.5"),col=c("red","blue"),lty=1,lwd=2,cex=.8)

# GAMs
gam1=lm(wage~ns(year,4)+ns(age,5)+education,data=Wage)
library(gam)
gam.m3=gam(wage~s(year,4)+s(age,5)+education,data=Wage)
par(mfrow=c(1,3))
plot(gam.m3, se=TRUE,col="blue")
plot.gam(gam1, se=TRUE, col="red")
gam.m1=gam(wage~s(age,5)+education,data=Wage)
gam.m2=gam(wage~year+s(age,5)+education,data=Wage)
anova(gam.m1,gam.m2,gam.m3,test="F")
summary(gam.m3)
preds=predict(gam.m2,newdata=Wage)
gam.lo=gam(wage~s(year,df=4)+lo(age,span=0.7)+education,data=Wage)
plot.gam(gam.lo, se=TRUE, col="green")
gam.lo.i=gam(wage~lo(year,age,span=0.5)+education,data=Wage)
library(akima)
plot(gam.lo.i)
gam.lr=gam(I(wage>250)~year+s(age,df=5)+education,family=binomial,data=Wage)
par(mfrow=c(1,3))
plot(gam.lr,se=T,col="green")
table(education,I(wage>250))
gam.lr.s=gam(I(wage>250)~year+s(age,df=5)+education,family=binomial,data=Wage,subset=(education!="1. < HS Grad"))
plot(gam.lr.s,se=T,col="green")

# Chapter 8 Lab: Decision Trees

# Fitting Classification Trees
library(tree)
library(ISLR)
attach(Carseats)
High=ifelse(Sales<=8,"No","Yes")
Carseats=data.frame(Carseats,High)
tree.carseats=tree(High~.-Sales,Carseats)
summary(tree.carseats)
plot(tree.carseats)
text(tree.carseats,pretty=0)
tree.carseats
set.seed(2)
train=sample(1:nrow(Carseats), 200)
Carseats.test=Carseats[-train,]
High.test=High[-train]
tree.carseats=tree(High~.-Sales,Carseats,subset=train)
tree.pred=predict(tree.carseats,Carseats.test,type="class")
table(tree.pred,High.test)
(86+57)/200
set.seed(3)
cv.carseats=cv.tree(tree.carseats,FUN=prune.misclass)
names(cv.carseats)
cv.carseats
par(mfrow=c(1,2))
plot(cv.carseats$size,cv.carseats$dev,type="b")
plot(cv.carseats$k,cv.carseats$dev,type="b")
prune.carseats=prune.misclass(tree.carseats,best=9)
plot(prune.carseats)
text(prune.carseats,pretty=0)
tree.pred=predict(prune.carseats,Carseats.test,type="class")
table(tree.pred,High.test)
(94+60)/200
prune.carseats=prune.misclass(tree.carseats,best=15)
plot(prune.carseats)
text(prune.carseats,pretty=0)
tree.pred=predict(prune.carseats,Carseats.test,type="class")
table(tree.pred,High.test)
(86+62)/200

# Fitting Regression Trees
library(MASS)
set.seed(1)
train = sample(1:nrow(Boston), nrow(Boston)/2)
tree.boston=tree(medv~.,Boston,subset=train)
summary(tree.boston)
plot(tree.boston)
text(tree.boston,pretty=0)
cv.boston=cv.tree(tree.boston)
plot(cv.boston$size,cv.boston$dev,type='b')
prune.boston=prune.tree(tree.boston,best=5)
plot(prune.boston)
text(prune.boston,pretty=0)
yhat=predict(tree.boston,newdata=Boston[-train,])
boston.test=Boston[-train,"medv"]
plot(yhat,boston.test)
abline(0,1)
mean((yhat-boston.test)^2)

# Bagging and Random Forests
install.packages("randomForest",dependencies=TRUE)
library(randomForest)
set.seed(1)
bag.boston=randomForest(medv~.,data=Boston,subset=train,mtry=13,importance=TRUE)
bag.boston
yhat.bag = predict(bag.boston,newdata=Boston[-train,])
plot(yhat.bag, boston.test)
abline(0,1)
mean((yhat.bag-boston.test)^2)
bag.boston=randomForest(medv~.,data=Boston,subset=train,mtry=13,ntree=25)
yhat.bag = predict(bag.boston,newdata=Boston[-train,])
mean((yhat.bag-boston.test)^2)
set.seed(1)
rf.boston=randomForest(medv~.,data=Boston,subset=train,mtry=6,importance=TRUE)
yhat.rf = predict(rf.boston,newdata=Boston[-train,])
mean((yhat.rf-boston.test)^2)
importance(rf.boston)
varImpPlot(rf.boston)

# Boosting
install.packages("gbm",dependencies=TRUE)
library(gbm)
set.seed(1)
boost.boston=gbm(medv~.,data=Boston[train,],distribution="gaussian",n.trees=5000,interaction.depth=4)
summary(boost.boston)
par(mfrow=c(1,2))
plot(boost.boston,i="rm")
plot(boost.boston,i="lstat")
yhat.boost=predict(boost.boston,newdata=Boston[-train,],n.trees=5000)
mean((yhat.boost-boston.test)^2)
boost.boston=gbm(medv~.,data=Boston[train,],distribution="gaussian",n.trees=5000,interaction.depth=4,shrinkage=0.2,verbose=F)
yhat.boost=predict(boost.boston,newdata=Boston[-train,],n.trees=5000)
mean((yhat.boost-boston.test)^2)

# Chapter 9 Lab: Support Vector Machines

# Support Vector Classifier
set.seed(1)
x=matrix(rnorm(20*2), ncol=2)
y=c(rep(-1,10), rep(1,10))
x[y==1,]=x[y==1,] + 1
plot(x, col=(3-y))
dat=data.frame(x=x, y=as.factor(y))
library(e1071)
svmfit=svm(y~., data=dat, kernel="linear", cost=10,scale=FALSE)
plot(svmfit, dat)
svmfit$index
summary(svmfit)
svmfit=svm(y~., data=dat, kernel="linear", cost=0.1,scale=FALSE)
plot(svmfit, dat)
svmfit$index
set.seed(1)
tune.out=tune(svm,y~.,data=dat,kernel="linear",ranges=list(cost=c(0.001, 0.01, 0.1, 1,5,10,100)))
summary(tune.out)
bestmod=tune.out$best.model
summary(bestmod)
xtest=matrix(rnorm(20*2), ncol=2)
ytest=sample(c(-1,1), 20, rep=TRUE)
xtest[ytest==1,]=xtest[ytest==1,] + 1
testdat=data.frame(x=xtest, y=as.factor(ytest))
ypred=predict(bestmod,testdat)
table(predict=ypred, truth=testdat$y)
svmfit=svm(y~., data=dat, kernel="linear", cost=.01,scale=FALSE)
ypred=predict(svmfit,testdat)
table(predict=ypred, truth=testdat$y)
x[y==1,]=x[y==1,]+0.5
plot(x, col=(y+5)/2, pch=19)
dat=data.frame(x=x,y=as.factor(y))
svmfit=svm(y~., data=dat, kernel="linear", cost=1e5)
summary(svmfit)
plot(svmfit, dat)
svmfit=svm(y~., data=dat, kernel="linear", cost=1)
summary(svmfit)
plot(svmfit,dat)

# Support Vector Machine
set.seed(1)
x=matrix(rnorm(200*2), ncol=2)
x[1:100,]=x[1:100,]+2
x[101:150,]=x[101:150,]-2
y=c(rep(1,150),rep(2,50))
dat=data.frame(x=x,y=as.factor(y))
plot(x, col=y)
train=sample(200,100)
svmfit=svm(y~., data=dat[train,], kernel="radial", gamma=1, cost=1)
plot(svmfit, dat[train,])
summary(svmfit)
svmfit=svm(y~., data=dat[train,], kernel="radial",gamma=1,cost=1e5)
plot(svmfit,dat[train,])
set.seed(1)
tune.out=tune(svm, y~., data=dat[train,], kernel="radial", ranges=list(cost=c(0.1,1,10,100,1000),gamma=c(0.5,1,2,3,4)))
summary(tune.out)
table(true=dat[-train,"y"], pred=predict(tune.out$best.model,newdata=dat[-train,]))

# ROC Curves
install.packages("ROCR",dependencies=TRUE)
library(ROCR)
rocplot=function(pred, truth, ...){
  predob = prediction(pred, truth)
  perf = performance(predob, "tpr", "fpr")
  plot(perf,...)
}
svmfit.opt=svm(y~., data=dat[train,], kernel="radial",gamma=2, cost=1,decision.values=T)
fitted=attributes(predict(svmfit.opt,dat[train,],decision.values=TRUE))$decision.values
par(mfrow=c(1,2))
rocplot(fitted,dat[train,"y"],main="Training Data")
svmfit.flex=svm(y~., data=dat[train,], kernel="radial",gamma=50, cost=1, decision.values=T)
fitted=attributes(predict(svmfit.flex,dat[train,],decision.values=T))$decision.values
rocplot(fitted,dat[train,"y"],add=T,col="red")
fitted=attributes(predict(svmfit.opt,dat[-train,],decision.values=T))$decision.values
rocplot(fitted,dat[-train,"y"],main="Test Data")
fitted=attributes(predict(svmfit.flex,dat[-train,],decision.values=T))$decision.values
rocplot(fitted,dat[-train,"y"],add=T,col="red")

# SVM with Multiple Classes
set.seed(1)
x=rbind(x, matrix(rnorm(50*2), ncol=2))
y=c(y, rep(0,50))
x[y==0,2]=x[y==0,2]+2
dat=data.frame(x=x, y=as.factor(y))
par(mfrow=c(1,1))
plot(x,col=(y+1))
svmfit=svm(y~., data=dat, kernel="radial", cost=10, gamma=1)
plot(svmfit, dat)

# Application to Gene Expression Data
library(ISLR)
names(Khan)
dim(Khan$xtrain)
dim(Khan$xtest)
length(Khan$ytrain)
length(Khan$ytest)
table(Khan$ytrain)
table(Khan$ytest)
dat=data.frame(x=Khan$xtrain, y=as.factor(Khan$ytrain))
out=svm(y~., data=dat, kernel="linear",cost=10)
summary(out)
table(out$fitted, dat$y)
dat.te=data.frame(x=Khan$xtest, y=as.factor(Khan$ytest))
pred.te=predict(out, newdata=dat.te)
table(pred.te, dat.te$y)

# Chapter 10 Lab 1: Principal Components Analysis
states=row.names(USArrests)
states
names(USArrests)
apply(USArrests, 2, mean)
apply(USArrests, 2, var)
pr.out=prcomp(USArrests, scale=TRUE)
names(pr.out)
pr.out$center
pr.out$scale
pr.out$rotation
dim(pr.out$x)
biplot(pr.out, scale=0)
pr.out$rotation=-pr.out$rotation
pr.out$x=-pr.out$x
biplot(pr.out, scale=0)
pr.out$sdev
pr.var=pr.out$sdev^2
pr.var
pve=pr.var/sum(pr.var)
pve
plot(pve, xlab="Principal Component", ylab="Proportion of Variance Explained", ylim=c(0,1),type='b')
plot(cumsum(pve), xlab="Principal Component", ylab="Cumulative Proportion of Variance Explained", ylim=c(0,1),type='b')
a=c(1,2,8,-3)
cumsum(a)

# Chapter 10 Lab 2: Clustering

# K-Means Clustering
set.seed(2)
x=matrix(rnorm(50*2), ncol=2)
x[1:25,1]=x[1:25,1]+3
x[1:25,2]=x[1:25,2]-4
km.out=kmeans(x,2,nstart=20)
km.out$cluster
plot(x, col=(km.out$cluster+1), main="K-Means Clustering Results with K=2", xlab="", ylab="", pch=20, cex=2)
set.seed(4)
km.out=kmeans(x,3,nstart=20)
km.out
plot(x, col=(km.out$cluster+1), main="K-Means Clustering Results with K=3", xlab="", ylab="", pch=20, cex=2)
set.seed(3)
km.out=kmeans(x,3,nstart=1)
km.out$tot.withinss
km.out=kmeans(x,3,nstart=20)
km.out$tot.withinss

# Hierarchical Clustering
hc.complete=hclust(dist(x), method="complete")
hc.average=hclust(dist(x), method="average")
hc.single=hclust(dist(x), method="single")
par(mfrow=c(1,3))
plot(hc.complete,main="Complete Linkage", xlab="", sub="", cex=.9)
plot(hc.average, main="Average Linkage", xlab="", sub="", cex=.9)
plot(hc.single, main="Single Linkage", xlab="", sub="", cex=.9)
cutree(hc.complete, 2)
cutree(hc.average, 2)
cutree(hc.single, 2)
cutree(hc.single, 4)
xsc=scale(x)
plot(hclust(dist(xsc), method="complete"), main="Hierarchical Clustering with Scaled Features")
x=matrix(rnorm(30*3), ncol=3)
dd=as.dist(1-cor(t(x)))
plot(hclust(dd, method="complete"), main="Complete Linkage with Correlation-Based Distance", xlab="", sub="")

# Chapter 10 Lab 3: NCI60 Data Example

# The NCI60 data
library(ISLR)
nci.labs=NCI60$labs
nci.data=NCI60$data
dim(nci.data)
nci.labs[1:4]
table(nci.labs)

# PCA on the NCI60 Data
pr.out=prcomp(nci.data, scale=TRUE)
Cols=function(vec){
  cols=rainbow(length(unique(vec)))
  return(cols[as.numeric(as.factor(vec))])
}
par(mfrow=c(1,2))
plot(pr.out$x[,1:2], col=Cols(nci.labs), pch=19,xlab="Z1",ylab="Z2")
plot(pr.out$x[,c(1,3)], col=Cols(nci.labs), pch=19,xlab="Z1",ylab="Z3")
summary(pr.out)
plot(pr.out)
pve=100*pr.out$sdev^2/sum(pr.out$sdev^2)
par(mfrow=c(1,2))
plot(pve, type="o", ylab="PVE", xlab="Principal Component", col="blue")
plot(cumsum(pve), type="o", ylab="Cumulative PVE", xlab="Principal Component", col="brown3")

# Clustering the Observations of the NCI60 Data
sd.data=scale(nci.data)
par(mfrow=c(1,3))
data.dist=dist(sd.data)
plot(hclust(data.dist), labels=nci.labs, main="Complete Linkage", xlab="", sub="",ylab="")
plot(hclust(data.dist, method="average"), labels=nci.labs, main="Average Linkage", xlab="", sub="",ylab="")
plot(hclust(data.dist, method="single"), labels=nci.labs, main="Single Linkage", xlab="", sub="",ylab="")
hc.out=hclust(dist(sd.data))
hc.clusters=cutree(hc.out,4)
table(hc.clusters,nci.labs)
par(mfrow=c(1,1))
plot(hc.out, labels=nci.labs)
abline(h=139, col="red")
hc.out
set.seed(2)
km.out=kmeans(sd.data, 4, nstart=20)
km.clusters=km.out$cluster
table(km.clusters,hc.clusters)
hc.out=hclust(dist(pr.out$x[,1:5]))
plot(hc.out, labels=nci.labs, main="Hier. Clust. on First Five Score Vectors")
table(cutree(hc.out,4), nci.labs)