文書の過去の版を表示しています。
学習誤差と予測誤差
課題
今回は比較的単純な課題です.
+ 線形学習機械の最小二乗学習とk最近接機械の誤判別率を,シミュレーションを用いて比較しなさい.
+ 今回のシミュレーション設定に対して,最適なkを決めてみなさい.
+ 学習用データによる誤判別率の推定と検証用データによる誤判別率の推定を比較して考察しなさい.
〆切は来週の月曜日の2限が始まる時刻まで,とします.
コードは,解説付きのコードの一番下に「貼り付け用」を別に用意したので,そちらを使うと良いです
コード
# Generate a simulated two-class data set.
#
# Arguments:
#   n        total number of records; ceiling(n/2) records are drawn for
#            class 0 and floor(n/2) records for class 1
#   p        dimension of the explanatory variables (accepted but not used;
#            the generated data always have two explanatory variables)
#   k        number of classes (accepted but not used; always two classes)
#   setting  simulation scenario; only setting 1 is implemented
#
# Value:
#   A numeric matrix with n rows in random order and the columns
#   "X.1", "X.2" (explanatory variables) and "y" (class label, 0 or 1).
generate.data <- function(n, p, k, setting) {
  # Fail loudly for an unknown scenario instead of silently returning NULL.
  if (!isTRUE(setting == 1)) {
    stop("unsupported setting: ", setting, call. = FALSE)
  }

  n.class0 <- ceiling(n / 2)
  n.class1 <- floor(n / 2)

  # Class 0: both coordinates are independent N(0, 1).
  # Class 1: both coordinates are independent N(2, 1).
  X <- rbind(
    cbind(rnorm(n.class0, mean = 0, sd = 1),
          rnorm(n.class0, mean = 0, sd = 1)),
    cbind(rnorm(n.class1, mean = 2, sd = 1),
          rnorm(n.class1, mean = 2, sd = 1))
  )
  y <- c(rep(0, n.class0), rep(1, n.class1))

  Data <- cbind(X, y)
  colnames(Data) <- c("X.1", "X.2", "y")

  # Finally shuffle the records. drop = FALSE keeps the result a matrix even
  # when it has a single row.
  Data[sample.int(n), , drop = FALSE]
}
# Split a data set into a learning part and an evaluation part.
#
# Arguments:
#   dataset  matrix or data frame with one record per row
#   n.learn  number of leading rows that form the learning part
#
# Value:
#   A list with the components
#     learn  the first n.learn rows of dataset
#     eval   all remaining rows of dataset
#   Both components keep their two-dimensional shape, also when they hold
#   zero rows or a single row.
split.data <- function(dataset, n.learn) {
  # seq_len() yields an empty index for n.learn == 0, where 1:n.learn would
  # give c(1, 0) and wrongly move the first row into the learning part.
  learn.rows <- seq_len(n.learn)
  # Positive indices are used for the remainder because a negative empty
  # index would select no rows at all.
  eval.rows <- setdiff(seq_len(nrow(dataset)), learn.rows)

  list(learn = dataset[learn.rows, , drop = FALSE],
       eval = dataset[eval.rows, , drop = FALSE])
}
# Simulation settings.
# Each assignment sits on its own line: when they share a line with a leading
# comment, R treats the whole line as a comment and assigns nothing.

# Number of simulation replications
m <- 1000
# Dimension of the explanatory variables (not used in this exercise)
p <- 2
# Number of classes (also not used in this exercise)
k <- 2
# Number of records in the learning data
n.learn <- 50
# Number of records in the evaluation data
n.eval <- 20
# Total sample size
n <- n.learn + n.eval
# Run the simulation m times and record, for every learning machine, the
# misclassification rate on the learning data and on the evaluation data.
#
# Uses generate.data(), split.data() and the settings m, n and n.learn that
# are defined earlier in this script.
#
# Results:
#   error.rate.learn  m x 14 matrix of error rates on the learning data
#   error.rate.eval   m x 14 matrix of error rates on the evaluation data
#   Rows are replications "1".."m"; columns are "lm" followed by one
#   "knn.<k>" column per neighbourhood size in knn.k.

# Neighbourhood sizes for the k-nearest-neighbour machine. The column labels
# are derived from this vector so that a label can never disagree with the
# value of k that was actually run (the former "knn.25" column was computed
# with k = 31 and is therefore labelled "knn.31" now).
# NOTE(review): class::knn() appears to warn and fall back to the number of
# learning records when k is larger than that; if so, every k above n.learn
# describes the same classifier -- confirm against the knn documentation.
knn.k <- c(1, 3, 5, 7, 9, 15, 21, 31, 51, 75, 101, 201, 301)
method.names <- c("lm", paste0("knn.", knn.k))

# Misclassification rate: share of records whose predicted class differs from
# the true class.
misclass.rate <- function(predicted, truth) {
  mean(predicted != truth)
}

# Preallocate one row per replication instead of growing with rbind().
error.rate.learn <- matrix(NA_real_, nrow = m, ncol = length(method.names),
                           dimnames = list(as.character(seq_len(m)),
                                           method.names))
error.rate.eval <- error.rate.learn

for (i in seq_len(m)) {
  # Generate and split the data
  data.gen <- generate.data(n, 2, 2, setting = 1)
  data.split <- split.data(data.gen, n.learn)
  data.learn <- data.frame(data.split$learn)
  data.eval <- data.frame(data.split$eval)
  x.learn <- data.learn[, c(1:2)]
  x.eval <- data.eval[, c(1:2)]

  # Linear learning machine fitted by least squares. A record is assigned to
  # class 1 when its fitted value exceeds 0.5. (Counting |y - fit| < 0.5 as
  # "correct" would wrongly treat fitted values above 1.5 or below -0.5 as
  # errors.)
  data.lm <- lm(y ~ X.1 + X.2, data = data.learn)
  class.fit <- as.numeric(fitted(data.lm) > 0.5)
  class.pred <- as.numeric(predict(data.lm, newdata = data.eval) > 0.5)
  error.rate.learn[i, "lm"] <- misclass.rate(class.fit, data.learn$y)
  error.rate.eval[i, "lm"] <- misclass.rate(class.pred, data.eval$y)

  # k-nearest-neighbour machine for every neighbourhood size.
  # class::knn() is called with its namespace so that no library(class) call
  # is required before this section.
  for (j in seq_along(knn.k)) {
    data.fit <- class::knn(x.learn, x.learn, data.learn[, c(3)],
                           k = knn.k[j], prob = FALSE)
    data.pred <- class::knn(x.learn, x.eval, data.learn[, c(3)],
                            k = knn.k[j], prob = FALSE)
    # knn() returns a factor; convert through its labels so that the result
    # does not depend on which classes occur in the learning data.
    class.fit <- as.numeric(as.character(data.fit))
    class.pred <- as.numeric(as.character(data.pred))
    error.rate.learn[i, j + 1] <- misclass.rate(class.fit, data.learn$y)
    error.rate.eval[i, j + 1] <- misclass.rate(class.pred, data.eval$y)
  }
}
箱ひげ図の描画.
# Box plots of the simulated error rates, one box per column (learning machine).
# The two calls must be separate statements; on a single line they do not parse.
# Error rates measured on the learning data
boxplot(error.rate.learn)
# Error rates measured on the evaluation data
boxplot(error.rate.eval)
貼り付け用
準備
library(class)
# Generate a simulated two-class data set.
#
# Arguments:
#   n        total number of records; ceiling(n/2) records are drawn for
#            class 0 and floor(n/2) records for class 1
#   p        dimension of the explanatory variables (accepted but not used;
#            the generated data always have two explanatory variables)
#   k        number of classes (accepted but not used; always two classes)
#   setting  simulation scenario; only setting 1 is implemented
#
# Value:
#   A numeric matrix with n rows in random order and the columns
#   "X.1", "X.2" (explanatory variables) and "y" (class label, 0 or 1).
generate.data <- function(n, p, k, setting) {
  # Fail loudly for an unknown scenario instead of silently returning NULL.
  if (!isTRUE(setting == 1)) {
    stop("unsupported setting: ", setting, call. = FALSE)
  }

  n.class0 <- ceiling(n / 2)
  n.class1 <- floor(n / 2)

  # Class 0: both coordinates are independent N(0, 1).
  # Class 1: both coordinates are independent N(2, 1).
  X <- rbind(
    cbind(rnorm(n.class0, mean = 0, sd = 1),
          rnorm(n.class0, mean = 0, sd = 1)),
    cbind(rnorm(n.class1, mean = 2, sd = 1),
          rnorm(n.class1, mean = 2, sd = 1))
  )
  y <- c(rep(0, n.class0), rep(1, n.class1))

  Data <- cbind(X, y)
  colnames(Data) <- c("X.1", "X.2", "y")

  # Shuffle the records. drop = FALSE keeps the result a matrix even when it
  # has a single row.
  Data[sample.int(n), , drop = FALSE]
}
# Split a data set into a learning part and an evaluation part.
#
# Arguments:
#   dataset  matrix or data frame with one record per row
#   n.learn  number of leading rows that form the learning part
#
# Value:
#   A list with the components
#     learn  the first n.learn rows of dataset
#     eval   all remaining rows of dataset
#   Both components keep their two-dimensional shape, also when they hold
#   zero rows or a single row.
split.data <- function(dataset, n.learn) {
  # seq_len() yields an empty index for n.learn == 0, where 1:n.learn would
  # give c(1, 0) and wrongly move the first row into the learning part.
  learn.rows <- seq_len(n.learn)
  # Positive indices are used for the remainder because a negative empty
  # index would select no rows at all.
  eval.rows <- setdiff(seq_len(nrow(dataset)), learn.rows)

  list(learn = dataset[learn.rows, , drop = FALSE],
       eval = dataset[eval.rows, , drop = FALSE])
}
設定
# Simulation settings, one assignment per line (several assignments on one
# line without separators do not parse in R).
m <- 1000                # number of simulation replications
p <- 2                   # dimension of the explanatory variables (unused)
k <- 2                   # number of classes (unused)
n.learn <- 500           # number of records in the learning data
n.eval <- 200            # number of records in the evaluation data
n <- n.learn + n.eval    # total sample size
シミュレーション実験の実施
# Run the simulation m times and record, for every learning machine, the
# misclassification rate on the learning data and on the evaluation data.
#
# Uses generate.data(), split.data() and the settings m, n and n.learn that
# are defined earlier in this script.
#
# Results:
#   error.rate.learn  m x 14 matrix of error rates on the learning data
#   error.rate.eval   m x 14 matrix of error rates on the evaluation data
#   Rows are replications "1".."m"; columns are "lm" followed by one
#   "knn.<k>" column per neighbourhood size in knn.k.

# Neighbourhood sizes for the k-nearest-neighbour machine. The column labels
# are derived from this vector so that a label can never disagree with the
# value of k that was actually run (the former "knn.25" column was computed
# with k = 31 and is therefore labelled "knn.31" now).
knn.k <- c(1, 3, 5, 7, 9, 15, 21, 31, 51, 75, 101, 201, 301)
method.names <- c("lm", paste0("knn.", knn.k))

# Misclassification rate: share of records whose predicted class differs from
# the true class.
misclass.rate <- function(predicted, truth) {
  mean(predicted != truth)
}

# Preallocate one row per replication instead of growing with rbind().
error.rate.learn <- matrix(NA_real_, nrow = m, ncol = length(method.names),
                           dimnames = list(as.character(seq_len(m)),
                                           method.names))
error.rate.eval <- error.rate.learn

for (i in seq_len(m)) {
  # Generate and split the data
  data.gen <- generate.data(n, 2, 2, setting = 1)
  data.split <- split.data(data.gen, n.learn)
  data.learn <- data.frame(data.split$learn)
  data.eval <- data.frame(data.split$eval)
  x.learn <- data.learn[, c(1:2)]
  x.eval <- data.eval[, c(1:2)]

  # lm: a record is assigned to class 1 when its fitted value exceeds 0.5.
  # (Counting |y - fit| < 0.5 as "correct" would wrongly treat fitted values
  # above 1.5 or below -0.5 as errors.)
  data.lm <- lm(y ~ X.1 + X.2, data = data.learn)
  class.fit <- as.numeric(fitted(data.lm) > 0.5)
  class.pred <- as.numeric(predict(data.lm, newdata = data.eval) > 0.5)
  error.rate.learn[i, "lm"] <- misclass.rate(class.fit, data.learn$y)
  error.rate.eval[i, "lm"] <- misclass.rate(class.pred, data.eval$y)

  # knn for every neighbourhood size; class::knn() is namespace-qualified so
  # the loop does not depend on the package being attached.
  for (j in seq_along(knn.k)) {
    data.fit <- class::knn(x.learn, x.learn, data.learn[, c(3)],
                           k = knn.k[j], prob = FALSE)
    data.pred <- class::knn(x.learn, x.eval, data.learn[, c(3)],
                            k = knn.k[j], prob = FALSE)
    # knn() returns a factor; convert through its labels so that the result
    # does not depend on which classes occur in the learning data.
    class.fit <- as.numeric(as.character(data.fit))
    class.pred <- as.numeric(as.character(data.pred))
    error.rate.learn[i, j + 1] <- misclass.rate(class.fit, data.learn$y)
    error.rate.eval[i, j + 1] <- misclass.rate(class.pred, data.eval$y)
  }
}
結果のグラフ
# Box plot of the error rates measured on the learning data, one box per
# column of error.rate.learn.
boxplot(error.rate.learn)
結果のグラフ
# Box plot of the error rates measured on the evaluation data, one box per
# column of error.rate.eval.
boxplot(error.rate.eval)