# Text regression with regularization ("Machine Learning for Hackers", ch. 6).
#
# Builds a document-term matrix from the book descriptions in oreilly.csv and
# then:
#   1. tries to predict a numeric rank-based response with regularized linear
#      regression, scoring several lambda values by hold-out RMSE;
#   2. recasts the task as binary classification (first 50 rows vs. last 50
#      rows) with regularized logistic regression, scoring several lambda
#      values by hold-out error rate.
#
# Requires the packages tm, glmnet, ggplot2 and boot. The "mean_cl_boot"
# summary used in the plots is provided through ggplot2 but needs the Hmisc
# package to be installed.

library("tm")
library("glmnet")
library("ggplot2")
library("boot")

# Directory holding oreilly.csv. No trailing separator: file.path() adds one.
data <- "f:\\learning\\ml_for_hackers\\ml_for_hackers-master\\06-regularization\\data"
ranks <- read.csv(file.path(data, "oreilly.csv"), stringsAsFactors = FALSE)

# NOTE(review): the description column is assumed to be read in as
# `Long.Desc.` (read.csv's syntactic form of a "Long Desc." header) -- confirm
# against the CSV.
documents <- data.frame(Text = ranks$Long.Desc.)
row.names(documents) <- seq_len(nrow(documents))

# Build the corpus.
# VectorSource is used instead of DataframeSource because tm >= 0.7 only
# accepts data frames that carry `doc_id` and `text` columns there.
corpus <- Corpus(VectorSource(documents$Text))

# Plain R functions such as tolower must be wrapped in content_transformer()
# (tm >= 0.6); very old tm versions used tm_map(corpus, tolower) directly.
corpus <- tm_map(corpus, content_transformer(tolower))
# stripWhitespace and removeWords are tm transformations and are applied as is.
corpus <- tm_map(corpus, stripWhitespace)
# Remove English stop words.
corpus <- tm_map(corpus, removeWords, stopwords("english"))

# Document-term matrix: one row per document, one column per term.
dtm <- DocumentTermMatrix(corpus)
x <- as.matrix(dtm)

# The rest of the script hard-codes 100 documents (80 train / 20 test).
n.docs <- 100L
n.train <- 80L
stopifnot(nrow(x) == n.docs)

# Reverse 1:100, giving 100, 99, ..., 1: row 1 gets the largest response.
# Presumably the rows of the CSV are ordered by sales rank -- TODO confirm.
y <- rev(seq_len(n.docs))

# ---- Regularized linear regression: RMSE per lambda ----
set.seed(1)

lambdas <- c(0.1, 0.25, 0.5, 1, 2, 5)
n.iterations <- 50L

# Collect one single-row data frame per (lambda, iteration) and bind them once
# at the end instead of growing `performance` with rbind() inside the loop.
results <- vector("list", length(lambdas) * n.iterations)
k <- 0L

for (lambda in lambdas) {
  for (i in seq_len(n.iterations)) {
    # A fresh random 80/20 split for every lambda/iteration pair.
    indices <- sample(seq_len(n.docs), n.train)

    training.x <- x[indices, ]
    training.y <- y[indices]
    test.x <- x[-indices, ]
    test.y <- y[-indices]

    glm.fit <- glmnet(training.x, training.y)
    predicted.y <- predict(glm.fit, newx = test.x, s = lambda)
    rmse <- sqrt(mean((predicted.y - test.y)^2))

    k <- k + 1L
    results[[k]] <- data.frame(Lambda = lambda, Iteration = i, RMSE = rmse)
  }
}

performance <- do.call(rbind, results)

ggplot(performance, aes(x = Lambda, y = RMSE)) +
  stat_summary(fun.data = "mean_cl_boot", geom = "errorbar") +
  stat_summary(fun.data = "mean_cl_boot", geom = "point")

# Judging from the plot, the regression approach fails.
# So switch to classification: decide whether a book is in the top 50.
y <- rep(c(1, 0), each = 50)

# Logistic regression.
regularized.fit <- glmnet(x, y, family = "binomial")

# Predict on the training data.
predict(regularized.fit, newx = x, s = 0.001)

# The output is not a class label but a set of raw scores (log-odds), so
# threshold them at 0 to get 0/1 labels.
ifelse(predict(regularized.fit, newx = x, s = 0.001) > 0, 1, 0)

# Second approach: turn the predicted scores into probabilities.
inv.logit(predict(regularized.fit, newx = x, s = 0.001))

# ---- Regularized logistic regression: error rate per lambda ----
set.seed(1)

lambdas <- c(0.0001, 0.001, 0.0025, 0.005, 0.01, 0.025, 0.5, 0.1)
n.iterations <- 250L

results <- vector("list", length(lambdas) * n.iterations)
k <- 0L

for (i in seq_len(n.iterations)) {
  indices <- sample(seq_len(n.docs), n.train)

  training.x <- x[indices, ]
  training.y <- y[indices]
  test.x <- x[-indices, ]
  test.y <- y[-indices]

  # The fit does not depend on lambda (glmnet computes a whole path), so it is
  # done once per split and only the prediction varies with lambda.
  glm.fit <- glmnet(training.x, training.y, family = "binomial")

  for (lambda in lambdas) {
    predicted.y <- ifelse(predict(glm.fit, newx = test.x, s = lambda) > 0, 1, 0)
    error.rate <- mean(predicted.y != test.y)

    k <- k + 1L
    results[[k]] <- data.frame(
      Lambda = lambda,
      Iteration = i,
      ErrorRate = error.rate
    )
  }
}

performance <- do.call(rbind, results)

# Plot the error rate against lambda on a log scale.
ggplot(performance, aes(x = Lambda, y = ErrorRate)) +
  stat_summary(fun.data = "mean_cl_boot", geom = "errorbar") +
  stat_summary(fun.data = "mean_cl_boot", geom = "point") +
  scale_x_log10()
# Machine Learning for Hackers reading notes (VI) -- regularization: text regression