差分
このページの2つのバージョン間の差分を表示します。
次のリビジョン | 前のリビジョン次のリビジョン両方とも次のリビジョン | ||
dm:2017 [2017/06/05 16:38] – created watalu | dm:2017 [2017/07/27 10:26] – watalu | ||
---|---|---|---|
行 1: | 行 1: | ||
- | == データマイニング == | + | ==== データマイニング |
* 6月1日にお願いしたアンケートは[[https:// | * 6月1日にお願いしたアンケートは[[https:// | ||
+ | * 6月22日に出題したレポート課題は次の通りです。締め切りは7月6日です。西5号館3階のJ専攻事務室前に教員宛のポストがあります。上から3段目、右から2つめの「山本」というポストに投函してください。 | ||
+ | * 自分の研究もしくは自分の研究室の先輩の研究を説明し、その研究の有効性や他の研究と比較して優越性を示すためのデータと比較基準を紹介してください。データの選定方法(もしくは指定の経緯)と、比較基準の意味を合わせて説明してください。 | ||
+ | * 講義で紹介した、予測誤差の推定という考え方を自分の研究に照らし、適用可能かを論じなさい。適用可能でなければ、そもそも自分の研究には予測誤差という概念が不要かどうかについて、適用可能であれば、どのように適用できるかについて、それぞれ併せて論じなさい。 | ||
+ | * 7月27日は講義をしないと宣言しましたが、授業評価アンケートを実施しないといけないので、一応やります。ただし課題の説明だけに留めますので、10時からとさせてください。また出席できない人は、同じ内容をここにも掲載しますので、それをご参照ください。 | ||
+ | |||
+ | およその講義内容の振り返りをしてもらいます。下記のコードを動かし、次のように分類しなさい。 | ||
+ | |||
+ | | |線形・加法に近い手法|非線形な手法|より複雑な手法| | ||
+ | |軽い手法(時間がかからない)| | | | | ||
+ | |重い手法(時間がかかる)| | | | | ||
+ | |||
+ | == Rのインストール == | ||
+ | |||
+ | Rの処理系は[[http:// | ||
+ | |||
+ | また[[https:// | ||
+ | |||
+ | == 時間の計測手段 == | ||
+ | |||
+ | 動作時間は、計測したい処理の前後の行で Sys.time() という関数を実行すると、表示できる。 | ||
+ | < | ||
+ | time.begin <- Sys.time() | ||
+ | b <- 0 | ||
+ | for( i in c(1:100000) ) { | ||
+ | b <- b+1 | ||
+ | } | ||
+ | time.end <- Sys.time() | ||
+ | print(time.end-time.begin) | ||
+ | </ | ||
+ | のように実行文を挟むと、間の時間を秒数で返してくれる。 | ||
+ | |||
+ | |||
+ | == 以下、演習用のコード == | ||
+ | |||
+ | < | ||
+ | # Chapter 2 Lab: Introduction to R | ||
+ | |||
+ | # Basic Commands | ||
+ | |||
+ | x <- c(1,3,2,5) | ||
+ | x | ||
+ | x = c(1,6,2) | ||
+ | x | ||
+ | y = c(1,4,3) | ||
+ | length(x) | ||
+ | length(y) | ||
+ | x+y | ||
+ | ls() | ||
+ | rm(x,y) | ||
+ | ls() | ||
+ | rm(list=ls()) | ||
+ | ?matrix | ||
+ | x=matrix(data=c(1, | ||
+ | x | ||
+ | x=matrix(c(1, | ||
+ | matrix(c(1, | ||
+ | sqrt(x) | ||
+ | x^2 | ||
+ | x=rnorm(50) | ||
+ | y=x+rnorm(50, | ||
+ | cor(x,y) | ||
+ | set.seed(1303) | ||
+ | rnorm(50) | ||
+ | set.seed(3) | ||
+ | y=rnorm(100) | ||
+ | mean(y) | ||
+ | var(y) | ||
+ | sqrt(var(y)) | ||
+ | sd(y) | ||
+ | |||
+ | # Graphics | ||
+ | |||
+ | x=rnorm(100) | ||
+ | y=rnorm(100) | ||
+ | plot(x,y) | ||
+ | plot(x, | ||
+ | pdf(" | ||
+ | plot(x, | ||
+ | dev.off() | ||
+ | x=seq(1,10) | ||
+ | x | ||
+ | x=1:10 | ||
+ | x | ||
+ | x=seq(-pi, | ||
+ | y=x | ||
+ | f=outer(x, | ||
+ | contour(x, | ||
+ | contour(x, | ||
+ | fa=(f-t(f))/ | ||
+ | contour(x, | ||
+ | image(x, | ||
+ | persp(x, | ||
+ | persp(x, | ||
+ | persp(x, | ||
+ | persp(x, | ||
+ | persp(x, | ||
+ | |||
+ | # Indexing Data | ||
+ | |||
+ | A=matrix(1: | ||
+ | A | ||
+ | A[2,3] | ||
+ | A[c(1, | ||
+ | A[1:3,2:4] | ||
+ | A[1:2,] | ||
+ | A[,1:2] | ||
+ | A[1,] | ||
+ | A[-c(1,3),] | ||
+ | A[-c(1, | ||
+ | dim(A) | ||
+ | |||
+ | # Loading Data | ||
+ | |||
+ | Auto=read.table(" | ||
+ | fix(Auto) | ||
+ | Auto=read.table(" | ||
+ | fix(Auto) | ||
+ | Auto=read.csv(" | ||
+ | fix(Auto) | ||
+ | dim(Auto) | ||
+ | Auto[1:4,] | ||
+ | Auto=na.omit(Auto) | ||
+ | dim(Auto) | ||
+ | names(Auto) | ||
+ | |||
+ | # Additional Graphical and Numerical Summaries | ||
+ | |||
+ | plot(cylinders, | ||
+ | plot(Auto$cylinders, | ||
+ | attach(Auto) | ||
+ | plot(cylinders, | ||
+ | cylinders=as.factor(cylinders) | ||
+ | plot(cylinders, | ||
+ | plot(cylinders, | ||
+ | plot(cylinders, | ||
+ | plot(cylinders, | ||
+ | plot(cylinders, | ||
+ | hist(mpg) | ||
+ | hist(mpg, | ||
+ | hist(mpg, | ||
+ | pairs(Auto) | ||
+ | pairs(~ mpg + displacement + horsepower + weight + acceleration, | ||
+ | plot(horsepower, | ||
+ | identify(horsepower, | ||
+ | summary(Auto) | ||
+ | summary(mpg) | ||
+ | |||
+ | |||
+ | |||
+ | # Chapter 3 Lab: Linear Regression | ||
+ | |||
+ | library(MASS) | ||
+ | install.packages(" | ||
+ | library(ISLR) | ||
+ | |||
+ | # Simple Linear Regression | ||
+ | |||
+ | fix(Boston) | ||
+ | names(Boston) | ||
+ | lm.fit=lm(medv~lstat) | ||
+ | lm.fit=lm(medv~lstat, | ||
+ | attach(Boston) | ||
+ | lm.fit=lm(medv~lstat) | ||
+ | lm.fit | ||
+ | summary(lm.fit) | ||
+ | names(lm.fit) | ||
+ | coef(lm.fit) | ||
+ | confint(lm.fit) | ||
+ | predict(lm.fit, | ||
+ | predict(lm.fit, | ||
+ | plot(lstat, | ||
+ | abline(lm.fit) | ||
+ | abline(lm.fit, | ||
+ | abline(lm.fit, | ||
+ | plot(lstat, | ||
+ | plot(lstat, | ||
+ | plot(lstat, | ||
+ | plot(1: | ||
+ | par(mfrow=c(2, | ||
+ | plot(lm.fit) | ||
+ | plot(predict(lm.fit), | ||
+ | plot(predict(lm.fit), | ||
+ | plot(hatvalues(lm.fit)) | ||
+ | which.max(hatvalues(lm.fit)) | ||
+ | |||
+ | # Multiple Linear Regression | ||
+ | |||
+ | lm.fit=lm(medv~lstat+age, | ||
+ | summary(lm.fit) | ||
+ | lm.fit=lm(medv~., | ||
+ | summary(lm.fit) | ||
+ | library(car) | ||
+ | vif(lm.fit) | ||
+ | lm.fit1=lm(medv~.-age, | ||
+ | summary(lm.fit1) | ||
+ | lm.fit1=update(lm.fit, | ||
+ | |||
+ | # Interaction Terms | ||
+ | |||
+ | summary(lm(medv~lstat*age, | ||
+ | |||
+ | # Non-linear Transformations of the Predictors | ||
+ | |||
+ | lm.fit2=lm(medv~lstat+I(lstat^2)) | ||
+ | summary(lm.fit2) | ||
+ | lm.fit=lm(medv~lstat) | ||
+ | anova(lm.fit, | ||
+ | par(mfrow=c(2, | ||
+ | plot(lm.fit2) | ||
+ | lm.fit5=lm(medv~poly(lstat, | ||
+ | summary(lm.fit5) | ||
+ | summary(lm(medv~log(rm), | ||
+ | |||
+ | # Qualitative Predictors | ||
+ | |||
+ | fix(Carseats) | ||
+ | names(Carseats) | ||
+ | lm.fit=lm(Sales~.+Income: | ||
+ | summary(lm.fit) | ||
+ | attach(Carseats) | ||
+ | contrasts(ShelveLoc) | ||
+ | |||
+ | # Writing Functions | ||
+ | |||
+ | LoadLibraries | ||
+ | LoadLibraries() | ||
+ | LoadLibraries=function(){ | ||
+ | library(ISLR) | ||
+ | library(MASS) | ||
+ | print(" | ||
+ | } | ||
+ | LoadLibraries | ||
+ | LoadLibraries() | ||
+ | |||
+ | |||
+ | # Chapter 4 Lab: Logistic Regression, LDA, QDA, and KNN | ||
+ | |||
+ | # The Stock Market Data | ||
+ | |||
+ | library(ISLR) | ||
+ | names(Smarket) | ||
+ | dim(Smarket) | ||
+ | summary(Smarket) | ||
+ | pairs(Smarket) | ||
+ | cor(Smarket) | ||
+ | cor(Smarket[, | ||
+ | attach(Smarket) | ||
+ | plot(Volume) | ||
+ | |||
+ | # Logistic Regression | ||
+ | |||
+ | glm.fit=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume, | ||
+ | summary(glm.fit) | ||
+ | coef(glm.fit) | ||
+ | summary(glm.fit)$coef | ||
+ | summary(glm.fit)$coef[, | ||
+ | glm.probs=predict(glm.fit, | ||
+ | glm.probs[1: | ||
+ | contrasts(Direction) | ||
+ | glm.pred=rep(" | ||
+ | glm.pred[glm.probs> | ||
+ | table(glm.pred, | ||
+ | (507+145)/ | ||
+ | mean(glm.pred==Direction) | ||
+ | train=(Year< | ||
+ | Smarket.2005=Smarket[!train, | ||
+ | dim(Smarket.2005) | ||
+ | Direction.2005=Direction[!train] | ||
+ | glm.fit=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume, | ||
+ | glm.probs=predict(glm.fit, | ||
+ | glm.pred=rep(" | ||
+ | glm.pred[glm.probs> | ||
+ | table(glm.pred, | ||
+ | mean(glm.pred==Direction.2005) | ||
+ | mean(glm.pred!=Direction.2005) | ||
+ | glm.fit=glm(Direction~Lag1+Lag2, | ||
+ | glm.probs=predict(glm.fit, | ||
+ | glm.pred=rep(" | ||
+ | glm.pred[glm.probs> | ||
+ | table(glm.pred, | ||
+ | mean(glm.pred==Direction.2005) | ||
+ | 106/ | ||
+ | predict(glm.fit, | ||
+ | |||
+ | # Linear Discriminant Analysis | ||
+ | |||
+ | library(MASS) | ||
+ | lda.fit=lda(Direction~Lag1+Lag2, | ||
+ | lda.fit | ||
+ | plot(lda.fit) | ||
+ | lda.pred=predict(lda.fit, | ||
+ | names(lda.pred) | ||
+ | lda.class=lda.pred$class | ||
+ | table(lda.class, | ||
+ | mean(lda.class==Direction.2005) | ||
+ | sum(lda.pred$posterior[, | ||
+ | sum(lda.pred$posterior[, | ||
+ | lda.pred$posterior[1: | ||
+ | lda.class[1: | ||
+ | sum(lda.pred$posterior[, | ||
+ | |||
+ | # Quadratic Discriminant Analysis | ||
+ | |||
+ | qda.fit=qda(Direction~Lag1+Lag2, | ||
+ | qda.fit | ||
+ | qda.class=predict(qda.fit, | ||
+ | table(qda.class, | ||
+ | mean(qda.class==Direction.2005) | ||
+ | |||
+ | # K-Nearest Neighbors | ||
+ | |||
+ | library(class) | ||
+ | train.X=cbind(Lag1, | ||
+ | test.X=cbind(Lag1, | ||
+ | train.Direction=Direction[train] | ||
+ | set.seed(1) | ||
+ | knn.pred=knn(train.X, | ||
+ | table(knn.pred, | ||
+ | (83+43)/252 | ||
+ | knn.pred=knn(train.X, | ||
+ | table(knn.pred, | ||
+ | mean(knn.pred==Direction.2005) | ||
+ | |||
+ | # An Application to Caravan Insurance Data | ||
+ | |||
+ | dim(Caravan) | ||
+ | attach(Caravan) | ||
+ | summary(Purchase) | ||
+ | 348/5822 | ||
+ | standardized.X=scale(Caravan[, | ||
+ | var(Caravan[, | ||
+ | var(Caravan[, | ||
+ | var(standardized.X[, | ||
+ | var(standardized.X[, | ||
+ | test=1:1000 | ||
+ | train.X=standardized.X[-test, | ||
+ | test.X=standardized.X[test, | ||
+ | train.Y=Purchase[-test] | ||
+ | test.Y=Purchase[test] | ||
+ | set.seed(1) | ||
+ | knn.pred=knn(train.X, | ||
+ | mean(test.Y!=knn.pred) | ||
+ | mean(test.Y!=" | ||
+ | table(knn.pred, | ||
+ | 9/(68+9) | ||
+ | knn.pred=knn(train.X, | ||
+ | table(knn.pred, | ||
+ | 5/26 | ||
+ | knn.pred=knn(train.X, | ||
+ | table(knn.pred, | ||
+ | 4/15 | ||
+ | glm.fit=glm(Purchase~., | ||
+ | glm.probs=predict(glm.fit, | ||
+ | glm.pred=rep(" | ||
+ | glm.pred[glm.probs> | ||
+ | table(glm.pred, | ||
+ | glm.pred=rep(" | ||
+ | glm.pred[glm.probs> | ||
+ | table(glm.pred, | ||
+ | 11/(22+11) | ||
+ | |||
+ | |||
+ | |||
+ | # Chapter 5 Lab: Cross-Validation and the Bootstrap | ||
+ | |||
+ | # The Validation Set Approach | ||
+ | |||
+ | library(ISLR) | ||
+ | set.seed(1) | ||
+ | train=sample(392, | ||
+ | lm.fit=lm(mpg~horsepower, | ||
+ | attach(Auto) | ||
+ | mean((mpg-predict(lm.fit, | ||
+ | lm.fit2=lm(mpg~poly(horsepower, | ||
+ | mean((mpg-predict(lm.fit2, | ||
+ | lm.fit3=lm(mpg~poly(horsepower, | ||
+ | mean((mpg-predict(lm.fit3, | ||
+ | set.seed(2) | ||
+ | train=sample(392, | ||
+ | lm.fit=lm(mpg~horsepower, | ||
+ | mean((mpg-predict(lm.fit, | ||
+ | lm.fit2=lm(mpg~poly(horsepower, | ||
+ | mean((mpg-predict(lm.fit2, | ||
+ | lm.fit3=lm(mpg~poly(horsepower, | ||
+ | mean((mpg-predict(lm.fit3, | ||
+ | |||
+ | # Leave-One-Out Cross-Validation | ||
+ | |||
+ | glm.fit=glm(mpg~horsepower, | ||
+ | coef(glm.fit) | ||
+ | lm.fit=lm(mpg~horsepower, | ||
+ | coef(lm.fit) | ||
+ | library(boot) | ||
+ | glm.fit=glm(mpg~horsepower, | ||
+ | cv.err=cv.glm(Auto, | ||
+ | cv.err$delta | ||
+ | cv.error=rep(0, | ||
+ | for (i in 1:5){ | ||
+ | glm.fit=glm(mpg~poly(horsepower, | ||
+ | cv.error[i]=cv.glm(Auto, | ||
+ | } | ||
+ | cv.error | ||
+ | |||
+ | # k-Fold Cross-Validation | ||
+ | |||
+ | set.seed(17) | ||
+ | cv.error.10=rep(0, | ||
+ | for (i in 1:10){ | ||
+ | glm.fit=glm(mpg~poly(horsepower, | ||
+ | cv.error.10[i]=cv.glm(Auto, | ||
+ | } | ||
+ | cv.error.10 | ||
+ | |||
+ | # The Bootstrap | ||
+ | |||
+ | alpha.fn=function(data, | ||
+ | X=data$X[index] | ||
+ | Y=data$Y[index] | ||
+ | return((var(Y)-cov(X, | ||
+ | } | ||
+ | alpha.fn(Portfolio, | ||
+ | set.seed(1) | ||
+ | alpha.fn(Portfolio, | ||
+ | boot(Portfolio, | ||
+ | |||
+ | # Estimating the Accuracy of a Linear Regression Model | ||
+ | |||
+ | boot.fn=function(data, | ||
+ | return(coef(lm(mpg~horsepower, | ||
+ | boot.fn(Auto, | ||
+ | set.seed(1) | ||
+ | boot.fn(Auto, | ||
+ | boot.fn(Auto, | ||
+ | boot(Auto, | ||
+ | summary(lm(mpg~horsepower, | ||
+ | boot.fn=function(data, | ||
+ | coefficients(lm(mpg~horsepower+I(horsepower^2), | ||
+ | set.seed(1) | ||
+ | boot(Auto, | ||
+ | summary(lm(mpg~horsepower+I(horsepower^2), | ||
+ | |||
+ | |||
+ | |||
+ | # Chapter 6 Lab 1: Subset Selection Methods | ||
+ | |||
+ | # Best Subset Selection | ||
+ | |||
+ | library(ISLR) | ||
+ | fix(Hitters) | ||
+ | names(Hitters) | ||
+ | dim(Hitters) | ||
+ | sum(is.na(Hitters$Salary)) | ||
+ | Hitters=na.omit(Hitters) | ||
+ | dim(Hitters) | ||
+ | sum(is.na(Hitters)) | ||
+ | |||
+ | install.packages(" | ||
+ | library(leaps) | ||
+ | regfit.full=regsubsets(Salary~., | ||
+ | summary(regfit.full) | ||
+ | regfit.full=regsubsets(Salary~., | ||
+ | reg.summary=summary(regfit.full) | ||
+ | names(reg.summary) | ||
+ | reg.summary$rsq | ||
+ | par(mfrow=c(2, | ||
+ | plot(reg.summary$rss, | ||
+ | plot(reg.summary$adjr2, | ||
+ | which.max(reg.summary$adjr2) | ||
+ | points(11, | ||
+ | plot(reg.summary$cp, | ||
+ | which.min(reg.summary$cp) | ||
+ | points(10, | ||
+ | which.min(reg.summary$bic) | ||
+ | plot(reg.summary$bic, | ||
+ | points(6, | ||
+ | plot(regfit.full, | ||
+ | plot(regfit.full, | ||
+ | plot(regfit.full, | ||
+ | plot(regfit.full, | ||
+ | coef(regfit.full, | ||
+ | |||
+ | # Forward and Backward Stepwise Selection | ||
+ | |||
+ | regfit.fwd=regsubsets(Salary~., | ||
+ | summary(regfit.fwd) | ||
+ | regfit.bwd=regsubsets(Salary~., | ||
+ | summary(regfit.bwd) | ||
+ | coef(regfit.full, | ||
+ | coef(regfit.fwd, | ||
+ | coef(regfit.bwd, | ||
+ | |||
+ | # Choosing Among Models | ||
+ | |||
+ | set.seed(1) | ||
+ | train=sample(c(TRUE, | ||
+ | test=(!train) | ||
+ | regfit.best=regsubsets(Salary~., | ||
+ | test.mat=model.matrix(Salary~., | ||
+ | val.errors=rep(NA, | ||
+ | for(i in 1:19){ | ||
+ | coefi=coef(regfit.best, | ||
+ | pred=test.mat[, | ||
+ | val.errors[i]=mean((Hitters$Salary[test]-pred)^2) | ||
+ | } | ||
+ | val.errors | ||
+ | which.min(val.errors) | ||
+ | coef(regfit.best, | ||
+ | predict.regsubsets=function(object, | ||
+ | form=as.formula(object$call[[2]]) | ||
+ | mat=model.matrix(form, | ||
+ | coefi=coef(object, | ||
+ | xvars=names(coefi) | ||
+ | mat[, | ||
+ | } | ||
+ | regfit.best=regsubsets(Salary~., | ||
+ | coef(regfit.best, | ||
+ | k=10 | ||
+ | set.seed(1) | ||
+ | folds=sample(1: | ||
+ | cv.errors=matrix(NA, | ||
+ | for(j in 1:k){ | ||
+ | best.fit=regsubsets(Salary~., | ||
+ | for(i in 1:19){ | ||
+ | pred=predict(best.fit, | ||
+ | cv.errors[j, | ||
+ | } | ||
+ | } | ||
+ | mean.cv.errors=apply(cv.errors, | ||
+ | mean.cv.errors | ||
+ | par(mfrow=c(1, | ||
+ | plot(mean.cv.errors, | ||
+ | reg.best=regsubsets(Salary~., | ||
+ | coef(reg.best, | ||
+ | |||
+ | |||
+ | # Chapter 6 Lab 2: Ridge Regression and the Lasso | ||
+ | |||
+ | x=model.matrix(Salary~., | ||
+ | y=Hitters$Salary | ||
+ | |||
+ | # Ridge Regression | ||
+ | |||
+ | install.packages(" | ||
+ | library(glmnet) | ||
+ | grid=10^seq(10, | ||
+ | ridge.mod=glmnet(x, | ||
+ | dim(coef(ridge.mod)) | ||
+ | ridge.mod$lambda[50] | ||
+ | coef(ridge.mod)[, | ||
+ | sqrt(sum(coef(ridge.mod)[-1, | ||
+ | ridge.mod$lambda[60] | ||
+ | coef(ridge.mod)[, | ||
+ | sqrt(sum(coef(ridge.mod)[-1, | ||
+ | predict(ridge.mod, | ||
+ | set.seed(1) | ||
+ | train=sample(1: | ||
+ | test=(-train) | ||
+ | y.test=y[test] | ||
+ | ridge.mod=glmnet(x[train, | ||
+ | ridge.pred=predict(ridge.mod, | ||
+ | mean((ridge.pred-y.test)^2) | ||
+ | mean((mean(y[train])-y.test)^2) | ||
+ | ridge.pred=predict(ridge.mod, | ||
+ | mean((ridge.pred-y.test)^2) | ||
+ | ridge.pred=predict(ridge.mod, | ||
+ | mean((ridge.pred-y.test)^2) | ||
+ | lm(y~x, subset=train) | ||
+ | predict(ridge.mod, | ||
+ | set.seed(1) | ||
+ | cv.out=cv.glmnet(x[train, | ||
+ | plot(cv.out) | ||
+ | bestlam=cv.out$lambda.min | ||
+ | bestlam | ||
+ | ridge.pred=predict(ridge.mod, | ||
+ | mean((ridge.pred-y.test)^2) | ||
+ | out=glmnet(x, | ||
+ | predict(out, | ||
+ | |||
+ | # The Lasso | ||
+ | |||
+ | lasso.mod=glmnet(x[train, | ||
+ | plot(lasso.mod) | ||
+ | set.seed(1) | ||
+ | cv.out=cv.glmnet(x[train, | ||
+ | plot(cv.out) | ||
+ | bestlam=cv.out$lambda.min | ||
+ | lasso.pred=predict(lasso.mod, | ||
+ | mean((lasso.pred-y.test)^2) | ||
+ | out=glmnet(x, | ||
+ | lasso.coef=predict(out, | ||
+ | lasso.coef | ||
+ | lasso.coef[lasso.coef!=0] | ||
+ | |||
+ | |||
+ | # Chapter 6 Lab 3: PCR and PLS Regression | ||
+ | |||
+ | # Principal Components Regression | ||
+ | |||
+ | library(pls) | ||
+ | set.seed(2) | ||
+ | pcr.fit=pcr(Salary~., | ||
+ | summary(pcr.fit) | ||
+ | validationplot(pcr.fit, | ||
+ | set.seed(1) | ||
+ | pcr.fit=pcr(Salary~., | ||
+ | validationplot(pcr.fit, | ||
+ | pcr.pred=predict(pcr.fit, | ||
+ | mean((pcr.pred-y.test)^2) | ||
+ | pcr.fit=pcr(y~x, | ||
+ | summary(pcr.fit) | ||
+ | |||
+ | # Partial Least Squares | ||
+ | |||
+ | set.seed(1) | ||
+ | pls.fit=plsr(Salary~., | ||
+ | summary(pls.fit) | ||
+ | validationplot(pls.fit, | ||
+ | pls.pred=predict(pls.fit, | ||
+ | mean((pls.pred-y.test)^2) | ||
+ | pls.fit=plsr(Salary~., | ||
+ | summary(pls.fit) | ||
+ | |||
+ | |||
+ | |||
+ | # Chapter 7 Lab: Non-linear Modeling | ||
+ | |||
+ | library(ISLR) | ||
+ | attach(Wage) | ||
+ | |||
+ | # Polynomial Regression and Step Functions | ||
+ | |||
+ | fit=lm(wage~poly(age, | ||
+ | coef(summary(fit)) | ||
+ | fit2=lm(wage~poly(age, | ||
+ | coef(summary(fit2)) | ||
+ | fit2a=lm(wage~age+I(age^2)+I(age^3)+I(age^4), | ||
+ | coef(fit2a) | ||
+ | fit2b=lm(wage~cbind(age, | ||
+ | agelims=range(age) | ||
+ | age.grid=seq(from=agelims[1], | ||
+ | preds=predict(fit, | ||
+ | se.bands=cbind(preds$fit+2*preds$se.fit, | ||
+ | par(mfrow=c(1, | ||
+ | plot(age, | ||
+ | title(" | ||
+ | lines(age.grid, | ||
+ | matlines(age.grid, | ||
+ | preds2=predict(fit2, | ||
+ | max(abs(preds$fit-preds2$fit)) | ||
+ | fit.1=lm(wage~age, | ||
+ | fit.2=lm(wage~poly(age, | ||
+ | fit.3=lm(wage~poly(age, | ||
+ | fit.4=lm(wage~poly(age, | ||
+ | fit.5=lm(wage~poly(age, | ||
+ | anova(fit.1, | ||
+ | coef(summary(fit.5)) | ||
+ | (-11.983)^2 | ||
+ | fit.1=lm(wage~education+age, | ||
+ | fit.2=lm(wage~education+poly(age, | ||
+ | fit.3=lm(wage~education+poly(age, | ||
+ | anova(fit.1, | ||
+ | fit=glm(I(wage> | ||
+ | preds=predict(fit, | ||
+ | pfit=exp(preds$fit)/ | ||
+ | se.bands.logit = cbind(preds$fit+2*preds$se.fit, | ||
+ | se.bands = exp(se.bands.logit)/ | ||
+ | preds=predict(fit, | ||
+ | plot(age, | ||
+ | points(jitter(age), | ||
+ | lines(age.grid, | ||
+ | matlines(age.grid, | ||
+ | table(cut(age, | ||
+ | fit=lm(wage~cut(age, | ||
+ | coef(summary(fit)) | ||
+ | |||
+ | # Splines | ||
+ | |||
+ | library(splines) | ||
+ | fit=lm(wage~bs(age, | ||
+ | pred=predict(fit, | ||
+ | plot(age, | ||
+ | lines(age.grid, | ||
+ | lines(age.grid, | ||
+ | lines(age.grid, | ||
+ | dim(bs(age, | ||
+ | dim(bs(age, | ||
+ | attr(bs(age, | ||
+ | fit2=lm(wage~ns(age, | ||
+ | pred2=predict(fit2, | ||
+ | lines(age.grid, | ||
+ | plot(age, | ||
+ | title(" | ||
+ | fit=smooth.spline(age, | ||
+ | fit2=smooth.spline(age, | ||
+ | fit2$df | ||
+ | lines(fit, | ||
+ | lines(fit2, | ||
+ | legend(" | ||
+ | plot(age, | ||
+ | title(" | ||
+ | fit=loess(wage~age, | ||
+ | fit2=loess(wage~age, | ||
+ | lines(age.grid, | ||
+ | lines(age.grid, | ||
+ | legend(" | ||
+ | |||
+ | # GAMs | ||
+ | |||
+ | gam1=lm(wage~ns(year, | ||
+ | library(gam) | ||
+ | gam.m3=gam(wage~s(year, | ||
+ | par(mfrow=c(1, | ||
+ | plot(gam.m3, | ||
+ | plot.gam(gam1, | ||
+ | gam.m1=gam(wage~s(age, | ||
+ | gam.m2=gam(wage~year+s(age, | ||
+ | anova(gam.m1, | ||
+ | summary(gam.m3) | ||
+ | preds=predict(gam.m2, | ||
+ | gam.lo=gam(wage~s(year, | ||
+ | plot.gam(gam.lo, | ||
+ | gam.lo.i=gam(wage~lo(year, | ||
+ | library(akima) | ||
+ | plot(gam.lo.i) | ||
+ | gam.lr=gam(I(wage> | ||
+ | par(mfrow=c(1, | ||
+ | plot(gam.lr, | ||
+ | table(education, | ||
+ | gam.lr.s=gam(I(wage> | ||
+ | plot(gam.lr.s, | ||
+ | |||
+ | |||
+ | |||
+ | # Chapter 8 Lab: Decision Trees | ||
+ | |||
+ | # Fitting Classification Trees | ||
+ | |||
+ | library(tree) | ||
+ | library(ISLR) | ||
+ | attach(Carseats) | ||
+ | High=ifelse(Sales< | ||
+ | Carseats=data.frame(Carseats, | ||
+ | tree.carseats=tree(High~.-Sales, | ||
+ | summary(tree.carseats) | ||
+ | plot(tree.carseats) | ||
+ | text(tree.carseats, | ||
+ | tree.carseats | ||
+ | set.seed(2) | ||
+ | train=sample(1: | ||
+ | Carseats.test=Carseats[-train, | ||
+ | High.test=High[-train] | ||
+ | tree.carseats=tree(High~.-Sales, | ||
+ | tree.pred=predict(tree.carseats, | ||
+ | table(tree.pred, | ||
+ | (86+57)/200 | ||
+ | set.seed(3) | ||
+ | cv.carseats=cv.tree(tree.carseats, | ||
+ | names(cv.carseats) | ||
+ | cv.carseats | ||
+ | par(mfrow=c(1, | ||
+ | plot(cv.carseats$size, | ||
+ | plot(cv.carseats$k, | ||
+ | prune.carseats=prune.misclass(tree.carseats, | ||
+ | plot(prune.carseats) | ||
+ | text(prune.carseats, | ||
+ | tree.pred=predict(prune.carseats, | ||
+ | table(tree.pred, | ||
+ | (94+60)/200 | ||
+ | prune.carseats=prune.misclass(tree.carseats, | ||
+ | plot(prune.carseats) | ||
+ | text(prune.carseats, | ||
+ | tree.pred=predict(prune.carseats, | ||
+ | table(tree.pred, | ||
+ | (86+62)/200 | ||
+ | |||
+ | # Fitting Regression Trees | ||
+ | |||
+ | library(MASS) | ||
+ | set.seed(1) | ||
+ | train = sample(1: | ||
+ | tree.boston=tree(medv~., | ||
+ | summary(tree.boston) | ||
+ | plot(tree.boston) | ||
+ | text(tree.boston, | ||
+ | cv.boston=cv.tree(tree.boston) | ||
+ | plot(cv.boston$size, | ||
+ | prune.boston=prune.tree(tree.boston, | ||
+ | plot(prune.boston) | ||
+ | text(prune.boston, | ||
+ | yhat=predict(tree.boston, | ||
+ | boston.test=Boston[-train," | ||
+ | plot(yhat, | ||
+ | abline(0,1) | ||
+ | mean((yhat-boston.test)^2) | ||
+ | |||
+ | # Bagging and Random Forests | ||
+ | |||
+ | install.packages(" | ||
+ | library(randomForest) | ||
+ | set.seed(1) | ||
+ | bag.boston=randomForest(medv~., | ||
+ | bag.boston | ||
+ | yhat.bag = predict(bag.boston, | ||
+ | plot(yhat.bag, | ||
+ | abline(0,1) | ||
+ | mean((yhat.bag-boston.test)^2) | ||
+ | bag.boston=randomForest(medv~., | ||
+ | yhat.bag = predict(bag.boston, | ||
+ | mean((yhat.bag-boston.test)^2) | ||
+ | set.seed(1) | ||
+ | rf.boston=randomForest(medv~., | ||
+ | yhat.rf = predict(rf.boston, | ||
+ | mean((yhat.rf-boston.test)^2) | ||
+ | importance(rf.boston) | ||
+ | varImpPlot(rf.boston) | ||
+ | |||
+ | # Boosting | ||
+ | |||
+ | install.packages(" | ||
+ | library(gbm) | ||
+ | set.seed(1) | ||
+ | boost.boston=gbm(medv~., | ||
+ | summary(boost.boston) | ||
+ | par(mfrow=c(1, | ||
+ | plot(boost.boston, | ||
+ | plot(boost.boston, | ||
+ | yhat.boost=predict(boost.boston, | ||
+ | mean((yhat.boost-boston.test)^2) | ||
+ | boost.boston=gbm(medv~., | ||
+ | yhat.boost=predict(boost.boston, | ||
+ | mean((yhat.boost-boston.test)^2) | ||
+ | |||
+ | |||
+ | |||
+ | # Chapter 9 Lab: Support Vector Machines | ||
+ | |||
+ | # Support Vector Classifier | ||
+ | |||
+ | set.seed(1) | ||
+ | x=matrix(rnorm(20*2), | ||
+ | y=c(rep(-1, | ||
+ | x[y==1, | ||
+ | plot(x, col=(3-y)) | ||
+ | dat=data.frame(x=x, | ||
+ | library(e1071) | ||
+ | svmfit=svm(y~., | ||
+ | plot(svmfit, | ||
+ | svmfit$index | ||
+ | summary(svmfit) | ||
+ | svmfit=svm(y~., | ||
+ | plot(svmfit, | ||
+ | svmfit$index | ||
+ | set.seed(1) | ||
+ | tune.out=tune(svm, | ||
+ | summary(tune.out) | ||
+ | bestmod=tune.out$best.model | ||
+ | summary(bestmod) | ||
+ | xtest=matrix(rnorm(20*2), | ||
+ | ytest=sample(c(-1, | ||
+ | xtest[ytest==1, | ||
+ | testdat=data.frame(x=xtest, | ||
+ | ypred=predict(bestmod, | ||
+ | table(predict=ypred, | ||
+ | svmfit=svm(y~., | ||
+ | ypred=predict(svmfit, | ||
+ | table(predict=ypred, | ||
+ | x[y==1, | ||
+ | plot(x, col=(y+5)/ | ||
+ | dat=data.frame(x=x, | ||
+ | svmfit=svm(y~., | ||
+ | summary(svmfit) | ||
+ | plot(svmfit, | ||
+ | svmfit=svm(y~., | ||
+ | summary(svmfit) | ||
+ | plot(svmfit, | ||
+ | |||
+ | # Support Vector Machine | ||
+ | |||
+ | set.seed(1) | ||
+ | x=matrix(rnorm(200*2), | ||
+ | x[1: | ||
+ | x[101: | ||
+ | y=c(rep(1, | ||
+ | dat=data.frame(x=x, | ||
+ | plot(x, col=y) | ||
+ | train=sample(200, | ||
+ | svmfit=svm(y~., | ||
+ | plot(svmfit, | ||
+ | summary(svmfit) | ||
+ | svmfit=svm(y~., | ||
+ | plot(svmfit, | ||
+ | set.seed(1) | ||
+ | tune.out=tune(svm, | ||
+ | summary(tune.out) | ||
+ | table(true=dat[-train," | ||
+ | |||
+ | # ROC Curves | ||
+ | |||
+ | install.packages(" | ||
+ | library(ROCR) | ||
+ | rocplot=function(pred, | ||
+ | predob = prediction(pred, | ||
+ | perf = performance(predob, | ||
+ | plot(perf, | ||
+ | svmfit.opt=svm(y~., | ||
+ | fitted=attributes(predict(svmfit.opt, | ||
+ | par(mfrow=c(1, | ||
+ | rocplot(fitted, | ||
+ | svmfit.flex=svm(y~., | ||
+ | fitted=attributes(predict(svmfit.flex, | ||
+ | rocplot(fitted, | ||
+ | fitted=attributes(predict(svmfit.opt, | ||
+ | rocplot(fitted, | ||
+ | fitted=attributes(predict(svmfit.flex, | ||
+ | rocplot(fitted, | ||
+ | |||
+ | # SVM with Multiple Classes | ||
+ | |||
+ | set.seed(1) | ||
+ | x=rbind(x, matrix(rnorm(50*2), | ||
+ | y=c(y, rep(0,50)) | ||
+ | x[y==0, | ||
+ | dat=data.frame(x=x, | ||
+ | par(mfrow=c(1, | ||
+ | plot(x, | ||
+ | svmfit=svm(y~., | ||
+ | plot(svmfit, | ||
+ | |||
+ | # Application to Gene Expression Data | ||
+ | |||
+ | library(ISLR) | ||
+ | names(Khan) | ||
+ | dim(Khan$xtrain) | ||
+ | dim(Khan$xtest) | ||
+ | length(Khan$ytrain) | ||
+ | length(Khan$ytest) | ||
+ | table(Khan$ytrain) | ||
+ | table(Khan$ytest) | ||
+ | dat=data.frame(x=Khan$xtrain, | ||
+ | out=svm(y~., | ||
+ | summary(out) | ||
+ | table(out$fitted, | ||
+ | dat.te=data.frame(x=Khan$xtest, | ||
+ | pred.te=predict(out, | ||
+ | table(pred.te, | ||
+ | |||
+ | |||
+ | |||
+ | # Chapter 10 Lab 1: Principal Components Analysis | ||
+ | |||
+ | states=row.names(USArrests) | ||
+ | states | ||
+ | names(USArrests) | ||
+ | apply(USArrests, | ||
+ | apply(USArrests, | ||
+ | pr.out=prcomp(USArrests, | ||
+ | names(pr.out) | ||
+ | pr.out$center | ||
+ | pr.out$scale | ||
+ | pr.out$rotation | ||
+ | dim(pr.out$x) | ||
+ | biplot(pr.out, | ||
+ | pr.out$rotation=-pr.out$rotation | ||
+ | pr.out$x=-pr.out$x | ||
+ | biplot(pr.out, | ||
+ | pr.out$sdev | ||
+ | pr.var=pr.out$sdev^2 | ||
+ | pr.var | ||
+ | pve=pr.var/ | ||
+ | pve | ||
+ | plot(pve, xlab=" | ||
+ | plot(cumsum(pve), | ||
+ | a=c(1, | ||
+ | cumsum(a) | ||
+ | |||
+ | |||
+ | # Chapter 10 Lab 2: Clustering | ||
+ | |||
+ | # K-Means Clustering | ||
+ | |||
+ | set.seed(2) | ||
+ | x=matrix(rnorm(50*2), | ||
+ | x[1: | ||
+ | x[1: | ||
+ | km.out=kmeans(x, | ||
+ | km.out$cluster | ||
+ | plot(x, col=(km.out$cluster+1), | ||
+ | set.seed(4) | ||
+ | km.out=kmeans(x, | ||
+ | km.out | ||
+ | plot(x, col=(km.out$cluster+1), | ||
+ | set.seed(3) | ||
+ | km.out=kmeans(x, | ||
+ | km.out$tot.withinss | ||
+ | km.out=kmeans(x, | ||
+ | km.out$tot.withinss | ||
+ | |||
+ | # Hierarchical Clustering | ||
+ | |||
+ | hc.complete=hclust(dist(x), | ||
+ | hc.average=hclust(dist(x), | ||
+ | hc.single=hclust(dist(x), | ||
+ | par(mfrow=c(1, | ||
+ | plot(hc.complete, | ||
+ | plot(hc.average, | ||
+ | plot(hc.single, | ||
+ | cutree(hc.complete, | ||
+ | cutree(hc.average, | ||
+ | cutree(hc.single, | ||
+ | cutree(hc.single, | ||
+ | xsc=scale(x) | ||
+ | plot(hclust(dist(xsc), | ||
+ | x=matrix(rnorm(30*3), | ||
+ | dd=as.dist(1-cor(t(x))) | ||
+ | plot(hclust(dd, | ||
+ | |||
+ | |||
+ | # Chapter 10 Lab 3: NCI60 Data Example | ||
+ | |||
+ | # The NCI60 data | ||
+ | |||
+ | library(ISLR) | ||
+ | nci.labs=NCI60$labs | ||
+ | nci.data=NCI60$data | ||
+ | dim(nci.data) | ||
+ | nci.labs[1: | ||
+ | table(nci.labs) | ||
+ | |||
+ | # PCA on the NCI60 Data | ||
+ | |||
+ | pr.out=prcomp(nci.data, | ||
+ | Cols=function(vec){ | ||
+ | cols=rainbow(length(unique(vec))) | ||
+ | return(cols[as.numeric(as.factor(vec))]) | ||
+ | } | ||
+ | par(mfrow=c(1, | ||
+ | plot(pr.out$x[, | ||
+ | plot(pr.out$x[, | ||
+ | summary(pr.out) | ||
+ | plot(pr.out) | ||
+ | pve=100*pr.out$sdev^2/ | ||
+ | par(mfrow=c(1, | ||
+ | plot(pve, | ||
+ | plot(cumsum(pve), | ||
+ | |||
+ | # Clustering the Observations of the NCI60 Data | ||
+ | |||
+ | sd.data=scale(nci.data) | ||
+ | par(mfrow=c(1, | ||
+ | data.dist=dist(sd.data) | ||
+ | plot(hclust(data.dist), | ||
+ | plot(hclust(data.dist, | ||
+ | plot(hclust(data.dist, | ||
+ | hc.out=hclust(dist(sd.data)) | ||
+ | hc.clusters=cutree(hc.out, | ||
+ | table(hc.clusters, | ||
+ | par(mfrow=c(1, | ||
+ | plot(hc.out, | ||
+ | abline(h=139, | ||
+ | hc.out | ||
+ | set.seed(2) | ||
+ | km.out=kmeans(sd.data, | ||
+ | km.clusters=km.out$cluster | ||
+ | table(km.clusters, | ||
+ | hc.out=hclust(dist(pr.out$x[, | ||
+ | plot(hc.out, | ||
+ | table(cutree(hc.out, | ||
+ | </ |