library('ggplot2')
df <- read.csv('g:\\dataguru\\ml_for_hackers\\ml_for_hackers-master\\12-model_comparison\\data\\df.csv')
# Fit a logistic regression with glm
logit.fit <- glm(Label ~ X + Y, family = binomial(link = 'logit'), data = df)
logit.predictions <- ifelse(predict(logit.fit) > 0, 1, 0)
mean(with(df, logit.predictions == Label))
# Accuracy is 0.5156, no better than random guessing.
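# A quick sanity check (not in the original notes): the majority-class baseline.
# If this is also about 0.5, the logistic regression really is no better than guessing.
with(df, max(prop.table(table(Label))))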
library('e1071')
svm.fit <- svm(Label ~ X + Y, data = df)
svm.predictions <- ifelse(predict(svm.fit) > 0, 1, 0)
mean(with(df, svm.predictions == Label))
# Switching to an SVM raises the accuracy to about 72%
Library ("reshape")
#df中的字段, X,Y,LABEL,LOGIT,SVM
DF <-Cbind (df,data.frame (Logit = IfElse (Predict (Logit.fit) > 0, 1, 0), SVM = IfElse (Predict (Svm.fit) > 0, 1, 0)))
#melt的结果, increase the field variable, where the value is LABEL,LOGIT,SVM, increment the field value, and take the corresponding value according to variable
#melt函数: Specify the variable, and the other remaining fields as a column, listing the corresponding values. Melt and cast, as if the opposite function
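# A minimal toy illustration (not from the book; 'toy' is a made-up data frame)
# of what melt does: it yields one row per (X, Y, variable) combination, with
# variable in {Label, Logit} and value holding the corresponding entry.
toy <- data.frame(X = 1:2, Y = 3:4, Label = c(0, 1), Logit = c(1, 1))
melt(toy, id.vars = c('X', 'Y'))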
predictions <- melt(df, id.vars = c('X', 'Y'))
ggplot(predictions, aes(x = X, y = Y, color = factor(value))) +
  geom_point() +
  facet_grid(variable ~ .)
# From the plot: Label is the true pattern, the glm predictions are useless,
# and the SVM recovers the pattern except near the edges.
# The svm function has a kernel argument with four options: linear, polynomial,
# radial and sigmoid.
df <- df[, c('X', 'Y', 'Label')]
linear.svm.fit <- svm(Label ~ X + Y, data = df, kernel = 'linear')
with(df, mean(Label == ifelse(predict(linear.svm.fit) > 0, 1, 0)))
polynomial.svm.fit <- svm(Label ~ X + Y, data = df, kernel = 'polynomial')
with(df, mean(Label == ifelse(predict(polynomial.svm.fit) > 0, 1, 0)))
radial.svm.fit <- svm(Label ~ X + Y, data = df, kernel = 'radial')
with(df, mean(Label == ifelse(predict(radial.svm.fit) > 0, 1, 0)))
sigmoid.svm.fit <- svm(Label ~ X + Y, data = df, kernel = 'sigmoid')
with(df, mean(Label == ifelse(predict(sigmoid.svm.fit) > 0, 1, 0)))
df <- cbind(df,
            data.frame(LinearSVM = ifelse(predict(linear.svm.fit) > 0, 1, 0),
                       PolynomialSVM = ifelse(predict(polynomial.svm.fit) > 0, 1, 0),
                       RadialSVM = ifelse(predict(radial.svm.fit) > 0, 1, 0),
                       SigmoidSVM = ifelse(predict(sigmoid.svm.fit) > 0, 1, 0)))
predictions <- melt(df, id.vars = c('X', 'Y'))
ggplot(predictions, aes(x = X, y = Y, color = factor(value))) +
  geom_point() +
  facet_grid(variable ~ .)
# From the plot: the linear and polynomial kernels are useless here, the radial
# kernel does reasonably well, and the sigmoid kernel produces a strange boundary.
# svm also has a degree argument (used by the polynomial kernel); look at its effect.
polynomial.degree3.svm.fit <- svm(Label ~ X + Y, data = df, kernel = 'polynomial', degree = 3)
with(df, mean(Label != ifelse(predict(polynomial.degree3.svm.fit) > 0, 1, 0)))
polynomial.degree5.svm.fit <- svm(Label ~ X + Y, data = df, kernel = 'polynomial', degree = 5)
with(df, mean(Label != ifelse(predict(polynomial.degree5.svm.fit) > 0, 1, 0)))
polynomial.degree10.svm.fit <- svm(Label ~ X + Y, data = df, kernel = 'polynomial', degree = 10)
with(df, mean(Label != ifelse(predict(polynomial.degree10.svm.fit) > 0, 1, 0)))
polynomial.degree12.svm.fit <- svm(Label ~ X + Y, data = df, kernel = 'polynomial', degree = 12)
with(df, mean(Label != ifelse(predict(polynomial.degree12.svm.fit) > 0, 1, 0)))
df <- df[, c('X', 'Y', 'Label')]
df <- cbind(df,
            data.frame(Degree3SVM = ifelse(predict(polynomial.degree3.svm.fit) > 0, 1, 0),
                       Degree5SVM = ifelse(predict(polynomial.degree5.svm.fit) > 0, 1, 0),
                       Degree10SVM = ifelse(predict(polynomial.degree10.svm.fit) > 0, 1, 0),
                       Degree12SVM = ifelse(predict(polynomial.degree12.svm.fit) > 0, 1, 0)))
predictions <- melt(df, id.vars = c('X', 'Y'))
ggplot(predictions, aes(x = X, y = Y, color = factor(value))) +
  geom_point() +
  facet_grid(variable ~ .)
# From the plots, accuracy increases with degree, but so does overfitting, so when
# using the polynomial kernel the degree should be chosen by cross-validation.
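# A minimal sketch (not from the book) of cross-validating degree with e1071's tune();
# it runs 10-fold CV over the listed degrees and reports the best one. Since Label is
# numeric here, the error being minimized is the regression MSE, which is good enough
# for picking a degree.
degree.tune <- tune(svm, Label ~ X + Y, data = df,
                    kernel = 'polynomial',
                    ranges = list(degree = c(3, 5, 10, 12)))
summary(degree.tune)
degree.tune$best.parameters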
# Next, look at the cost argument of svm
radial.cost1.svm.fit <- svm(Label ~ X + Y, data = df, kernel = 'radial', cost = 1)
with(df, mean(Label == ifelse(predict(radial.cost1.svm.fit) > 0, 1, 0)))
radial.cost2.svm.fit <- svm(Label ~ X + Y, data = df, kernel = 'radial', cost = 2)
with(df, mean(Label == ifelse(predict(radial.cost2.svm.fit) > 0, 1, 0)))
radial.cost3.svm.fit <- svm(Label ~ X + Y, data = df, kernel = 'radial', cost = 3)
with(df, mean(Label == ifelse(predict(radial.cost3.svm.fit) > 0, 1, 0)))
radial.cost4.svm.fit <- svm(Label ~ X + Y, data = df, kernel = 'radial', cost = 4)
with(df, mean(Label == ifelse(predict(radial.cost4.svm.fit) > 0, 1, 0)))
df <- df[, c('X', 'Y', 'Label')]
df <- cbind(df,
            data.frame(Cost1SVM = ifelse(predict(radial.cost1.svm.fit) > 0, 1, 0),
                       Cost2SVM = ifelse(predict(radial.cost2.svm.fit) > 0, 1, 0),
                       Cost3SVM = ifelse(predict(radial.cost3.svm.fit) > 0, 1, 0),
                       Cost4SVM = ifelse(predict(radial.cost4.svm.fit) > 0, 1, 0)))
predictions <- melt(df, id.vars = c('X', 'Y'))
ggplot(predictions, aes(x = X, y = Y, color = factor(value))) +
  geom_point() +
  facet_grid(variable ~ .)
# Increasing the cost makes the fit slightly worse; the change is small and only
# visible in the points near the boundary.
# Finally, look at the gamma argument of svm
sigmoid.gamma1.svm.fit <- svm(Label ~ X + Y, data = df, kernel = 'sigmoid', gamma = 1)
with(df, mean(Label == ifelse(predict(sigmoid.gamma1.svm.fit) > 0, 1, 0)))
sigmoid.gamma2.svm.fit <- svm(Label ~ X + Y, data = df, kernel = 'sigmoid', gamma = 2)
with(df, mean(Label == ifelse(predict(sigmoid.gamma2.svm.fit) > 0, 1, 0)))
sigmoid.gamma3.svm.fit <- svm(Label ~ X + Y, data = df, kernel = 'sigmoid', gamma = 3)
with(df, mean(Label == ifelse(predict(sigmoid.gamma3.svm.fit) > 0, 1, 0)))
sigmoid.gamma4.svm.fit <- svm(Label ~ X + Y, data = df, kernel = 'sigmoid', gamma = 4)
with(df, mean(Label == ifelse(predict(sigmoid.gamma4.svm.fit) > 0, 1, 0)))
df <- df[, c('X', 'Y', 'Label')]
df <- cbind(df,
            data.frame(Gamma1SVM = ifelse(predict(sigmoid.gamma1.svm.fit) > 0, 1, 0),
                       Gamma2SVM = ifelse(predict(sigmoid.gamma2.svm.fit) > 0, 1, 0),
                       Gamma3SVM = ifelse(predict(sigmoid.gamma3.svm.fit) > 0, 1, 0),
                       Gamma4SVM = ifelse(predict(sigmoid.gamma4.svm.fit) > 0, 1, 0)))
predictions <- melt(df, id.vars = c('X', 'Y'))
ggplot(predictions, aes(x = X, y = Y, color = factor(value))) +
  geom_point() +
  facet_grid(variable ~ .)
# The decision boundary becomes more curved as gamma increases.
# That wraps up the SVM tour: its hyperparameters have to be tuned to the data set.
# Next, compare the performance of SVM, glm and kNN on a real problem.
load('G:\\dataguru\\ml_for_hackers\\ml_for_hackers-master\\12-model_comparison\\data\\dtm.RData')
set.seed(1)
# Use half the rows for training and half for testing
training.indices <- sort(sample(1:nrow(dtm), round(0.5 * nrow(dtm))))
test.indices <- which(! 1:nrow(dtm) %in% training.indices)
train.x <- dtm[training.indices, 3:ncol(dtm)]
train.y <- dtm[training.indices, 1]
test.x <- dtm[test.indices, 3:ncol(dtm)]
test.y <- dtm[test.indices, 1]
rm(dtm)
library('glmnet')
regularized.logit.fit <- glmnet(train.x, train.y, family = c('binomial'))
lambdas <- regularized.logit.fit$lambda
performance <- data.frame()
for (lambda in lambdas)
{
  predictions <- predict(regularized.logit.fit, test.x, s = lambda)
  predictions <- as.numeric(predictions > 0)
  mse <- mean(predictions != test.y)
  performance <- rbind(performance, data.frame(Lambda = lambda, MSE = mse))
}
ggplot(performance, aes(x = Lambda, y = MSE)) +
  geom_point() +
  scale_x_log10()
# Two values of lambda give the minimum error rate; choose the larger one,
# since it corresponds to stronger regularization.
best.lambda <- with(performance, max(Lambda[which(MSE == min(MSE))]))
# The error rate at the best lambda is 0.068
mse <- with(subset(performance, Lambda == best.lambda), MSE)
# Now try an SVM
library('e1071')
# This step takes a long time: the data set is large, and fitting the linear kernel is slow
linear.svm.fit <- svm(train.x, train.y, kernel = 'linear')
predictions <- predict(linear.svm.fit, test.x)
predictions <- as.numeric(predictions > 0)
mse <- mean(predictions != test.y)
mse
# The error rate is 0.128, higher than the regularized glm. To get the best
# result from the SVM you would need to try different cost hyperparameters.
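# A minimal sketch (not from the book) of searching over a few cost values;
# the grid c(0.1, 1, 10) is an arbitrary choice, and each fit is slow on this data set.
cost.performance <- data.frame()
for (cost in c(0.1, 1, 10))
{
  fit <- svm(train.x, train.y, kernel = 'linear', cost = cost)
  preds <- as.numeric(predict(fit, test.x) > 0)
  cost.performance <- rbind(cost.performance,
                            data.frame(Cost = cost, ErrorRate = mean(preds != test.y)))
}
cost.performance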
radial.svm.fit <- svm(train.x, train.y, kernel = 'radial')
predictions <- predict(radial.svm.fit, test.x)
predictions <- as.numeric(predictions > 0)
mse <- mean(predictions != test.y)
mse
# The error rate, 0.1421538, is even higher, so the radial kernel does not help;
# the decision boundary is probably close to linear, which is why the glm does better.
# Now try kNN, which tends to do well when the boundary is nonlinear
library('class')
knn.fit <- knn(train.x, test.x, train.y, k = 50)
predictions <- as.numeric(as.character(knn.fit))
mse <- mean(predictions != test.y)
mse
# The error rate is 0.1396923, which again points to a roughly linear boundary;
# try several values of k to see which works best.
performance <- data.frame()
for (k in seq(5, 50, by = 5))
{
  knn.fit <- knn(train.x, test.x, train.y, k = k)
  predictions <- as.numeric(as.character(knn.fit))
  mse <- mean(predictions != test.y)
  performance <- rbind(performance, data.frame(k = k, MSE = mse))
}
best.k <- with(performance, k[which(MSE == min(MSE))])
best.mse <- with(subset(performance, k == best.k), MSE)
best.mse
# The error rate drops to 0.09169231, so kNN lands between the regularized glm and the SVM.
# For this problem, then, the best choice is the regularized glm.
Machine Learning for Hackers reading notes (12): model comparison