Machine Learning for Hackers reading notes (12): model comparison

Tags: svm, vars, ggplot

library('ggplot2')
df <- read.csv('g:\\dataguru\\ml_for_hackers\\ml_for_hackers-master\\12-model_comparison\\data\\df.csv')

# Fit a logistic regression with glm.

logit.fit <- glm(Label ~ X + Y, family = binomial(link = 'logit'), data = df)

logit.predictions <- ifelse(predict(logit.fit) > 0, 1, 0)

mean(with(df, logit.predictions == Label))
# Accuracy is about 0.5156, no better than random guessing.
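
# Side note (my addition, not in the original notes): predict() on a glm fit returns
# values on the link scale, i.e. log-odds, so thresholding at 0 is equivalent to
# thresholding the predicted probability at 0.5:

prob.predictions <- ifelse(predict(logit.fit, type = 'response') > 0.5, 1, 0)
mean(prob.predictions == logit.predictions)  # should be 1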

library('e1071')

svm.fit <- svm(Label ~ X + Y, data = df)

svm.predictions <- ifelse(predict(svm.fit) > 0, 1, 0)

mean(with(df, svm.predictions == Label))

# Switching to an SVM raises the accuracy to about 72%.

Library ("reshape")

#df中的字段, X,Y,LABEL,LOGIT,SVM

DF <-Cbind (df,data.frame (Logit = IfElse (Predict (Logit.fit) > 0, 1, 0), SVM = IfElse (Predict (Svm.fit) > 0, 1, 0)))

#melt的结果, increase the field variable, where the value is LABEL,LOGIT,SVM, increment the field value, and take the corresponding value according to variable

#melt函数: Specify the variable, and the other remaining fields as a column, listing the corresponding values. Melt and cast, as if the opposite function
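
# A toy illustration of melt() (my addition), using made-up columns A and B:

toy <- data.frame(X = 1:2, Y = 3:4, A = c(10, 20), B = c(30, 40))
melt(toy, id.vars = c('X', 'Y'))
#   X Y variable value
# 1 1 3        A    10
# 2 2 4        A    20
# 3 1 3        B    30
# 4 2 4        B    40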

predictions <- melt(df, id.vars = c('X', 'Y'))

ggplot(predictions, aes(x = X, y = Y, color = factor(value))) +
  geom_point() +
  facet_grid(variable ~ .)

# In the plot, Label shows the true classes; the logistic model is essentially useless,
# while the SVM recovers the pattern except near the edges.

# svm() has a kernel argument with four possible values: linear, polynomial, radial and sigmoid.

df <- df[, c('X', 'Y', 'Label')]

linear.svm.fit <- svm(Label ~ X + Y, data = df, kernel = 'linear')

with(df, mean(Label == ifelse(predict(linear.svm.fit) > 0, 1, 0)))

polynomial.svm.fit <- svm(Label ~ X + Y, data = df, kernel = 'polynomial')

with(df, mean(Label == ifelse(predict(polynomial.svm.fit) > 0, 1, 0)))

radial.svm.fit <- svm(Label ~ X + Y, data = df, kernel = 'radial')

with(df, mean(Label == ifelse(predict(radial.svm.fit) > 0, 1, 0)))

sigmoid.svm.fit <- svm(Label ~ X + Y, data = df, kernel = 'sigmoid')

with(df, mean(Label == ifelse(predict(sigmoid.svm.fit) > 0, 1, 0)))

df <- cbind(df,
            data.frame(LinearSVM = ifelse(predict(linear.svm.fit) > 0, 1, 0),
                       PolynomialSVM = ifelse(predict(polynomial.svm.fit) > 0, 1, 0),
                       RadialSVM = ifelse(predict(radial.svm.fit) > 0, 1, 0),
                       SigmoidSVM = ifelse(predict(sigmoid.svm.fit) > 0, 1, 0)))

predictions <- melt(df, id.vars = c('X', 'Y'))

ggplot(predictions, aes(x = X, y = Y, color = factor(value))) +
  geom_point() +
  facet_grid(variable ~ .)

# From the plot, the linear and polynomial kernels are useless here, the radial kernel
# does well, and the sigmoid kernel produces a strange boundary.

# svm() also has a degree argument; let's look at its effect.

polynomial.degree3.svm.fit <- svm(Label ~ X + Y, data = df, kernel = 'polynomial', degree = 3)

with(df, mean(Label != ifelse(predict(polynomial.degree3.svm.fit) > 0, 1, 0)))

polynomial.degree5.svm.fit <- svm(Label ~ X + Y, data = df, kernel = 'polynomial', degree = 5)

with(df, mean(Label != ifelse(predict(polynomial.degree5.svm.fit) > 0, 1, 0)))

polynomial.degree10.svm.fit <- svm(Label ~ X + Y, data = df, kernel = 'polynomial', degree = 10)

with(df, mean(Label != ifelse(predict(polynomial.degree10.svm.fit) > 0, 1, 0)))

polynomial.degree12.svm.fit <- svm(Label ~ X + Y, data = df, kernel = 'polynomial', degree = 12)

with(df, mean(Label != ifelse(predict(polynomial.degree12.svm.fit) > 0, 1, 0)))

df <- df[, c('X', 'Y', 'Label')]

df <- cbind(df,
            data.frame(Degree3SVM = ifelse(predict(polynomial.degree3.svm.fit) > 0, 1, 0),
                       Degree5SVM = ifelse(predict(polynomial.degree5.svm.fit) > 0, 1, 0),
                       Degree10SVM = ifelse(predict(polynomial.degree10.svm.fit) > 0, 1, 0),
                       Degree12SVM = ifelse(predict(polynomial.degree12.svm.fit) > 0, 1, 0)))

predictions <- melt(df, id.vars = c('X', 'Y'))

ggplot(predictions, aes(x = X, y = Y, color = factor(value))) +
  geom_point() +
  facet_grid(variable ~ .)

# The plot shows that raising the degree improves the training accuracy, but it also
# introduces overfitting, so when using the polynomial kernel the degree should be
# chosen by cross-validation, as sketched below.
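
# A minimal sketch of that cross-validation (my addition), using tune() from e1071,
# which by default runs 10-fold cross-validation over the supplied degree values:

degree.tune <- tune(svm, Label ~ X + Y, data = df,
                    kernel = 'polynomial',
                    ranges = list(degree = c(3, 5, 10, 12)))
summary(degree.tune)
degree.tune$best.parameters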

# Next, look at the cost parameter of svm().

radial.cost1.svm.fit <- svm(Label ~ X + Y, data = df, kernel = 'radial', cost = 1)

with(df, mean(Label == ifelse(predict(radial.cost1.svm.fit) > 0, 1, 0)))

radial.cost2.svm.fit <- svm(Label ~ X + Y, data = df, kernel = 'radial', cost = 2)

with(df, mean(Label == ifelse(predict(radial.cost2.svm.fit) > 0, 1, 0)))

radial.cost3.svm.fit <- svm(Label ~ X + Y, data = df, kernel = 'radial', cost = 3)

with(df, mean(Label == ifelse(predict(radial.cost3.svm.fit) > 0, 1, 0)))

radial.cost4.svm.fit <- svm(Label ~ X + Y, data = df, kernel = 'radial', cost = 4)

with(df, mean(Label == ifelse(predict(radial.cost4.svm.fit) > 0, 1, 0)))

df <- df[, c('X', 'Y', 'Label')]

df <- cbind(df,
            data.frame(Cost1SVM = ifelse(predict(radial.cost1.svm.fit) > 0, 1, 0),
                       Cost2SVM = ifelse(predict(radial.cost2.svm.fit) > 0, 1, 0),
                       Cost3SVM = ifelse(predict(radial.cost3.svm.fit) > 0, 1, 0),
                       Cost4SVM = ifelse(predict(radial.cost4.svm.fit) > 0, 1, 0)))

predictions <- melt(df, id.vars = c('X', 'Y'))

ggplot(predictions, aes(x = X, y = Y, color = factor(value))) +
  geom_point() +
  facet_grid(variable ~ .)

# Increasing the cost parameter makes the fit slightly worse here; the change is small
# and visible only in the points near the boundary.

# Now look at the gamma parameter of svm().

sigmoid.gamma1.svm.fit <- svm(Label ~ X + Y, data = df, kernel = 'sigmoid', gamma = 1)

with(df, mean(Label == ifelse(predict(sigmoid.gamma1.svm.fit) > 0, 1, 0)))

sigmoid.gamma2.svm.fit <- svm(Label ~ X + Y, data = df, kernel = 'sigmoid', gamma = 2)

with(df, mean(Label == ifelse(predict(sigmoid.gamma2.svm.fit) > 0, 1, 0)))

sigmoid.gamma3.svm.fit <- svm(Label ~ X + Y, data = df, kernel = 'sigmoid', gamma = 3)

with(df, mean(Label == ifelse(predict(sigmoid.gamma3.svm.fit) > 0, 1, 0)))

sigmoid.gamma4.svm.fit <- svm(Label ~ X + Y, data = df, kernel = 'sigmoid', gamma = 4)

with(df, mean(Label == ifelse(predict(sigmoid.gamma4.svm.fit) > 0, 1, 0)))

df <- df[, c('X', 'Y', 'Label')]

df <- cbind(df,
            data.frame(Gamma1SVM = ifelse(predict(sigmoid.gamma1.svm.fit) > 0, 1, 0),
                       Gamma2SVM = ifelse(predict(sigmoid.gamma2.svm.fit) > 0, 1, 0),
                       Gamma3SVM = ifelse(predict(sigmoid.gamma3.svm.fit) > 0, 1, 0),
                       Gamma4SVM = ifelse(predict(sigmoid.gamma4.svm.fit) > 0, 1, 0)))

predictions <- melt(df, id.vars = c('X', 'Y'))

ggplot(predictions, aes(x = X, y = Y, color = factor(value))) +
  geom_point() +
  facet_grid(variable ~ .)

# As gamma increases, the decision boundary becomes more and more curved.

# That wraps up the SVM tour: the hyperparameters have to be tuned to the data set at
# hand (a sketch follows). Next, compare the performance of SVM, GLM and KNN.
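
# A minimal sketch of that tuning (my addition), again with tune() from e1071,
# cross-validating over a small illustrative grid of gamma and cost values:

svm.tune <- tune(svm, Label ~ X + Y, data = df,
                 kernel = 'radial',
                 ranges = list(gamma = c(0.5, 1, 2), cost = c(1, 2, 4)))
svm.tune$best.parameters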

load('G:\\dataguru\\ml_for_hackers\\ml_for_hackers-master\\12-model_comparison\\data\\dtm.RData')

set.seed(1)

# Use half of the rows for training and half for testing.

training.indices <- sort(sample(1:nrow(dtm), round(0.5 * nrow(dtm))))

test.indices <- which(!(1:nrow(dtm) %in% training.indices))
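
# A quick sanity check (my addition) that the two index sets really partition the rows:

stopifnot(length(intersect(training.indices, test.indices)) == 0,
          length(training.indices) + length(test.indices) == nrow(dtm))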

train.x <- dtm[training.indices, 3:ncol(dtm)]

train.y <- dtm[training.indices, 1]

test.x <- dtm[test.indices, 3:ncol(dtm)]

test.y <- dtm[test.indices, 1]

rm(dtm)

library('glmnet')

regularized.logit.fit <- glmnet(train.x, train.y, family = 'binomial')

lambdas <- regularized.logit.fit$lambda

performance <- data.frame()

for (lambda in lambdas)
{
  predictions <- predict(regularized.logit.fit, test.x, s = lambda)
  predictions <- as.numeric(predictions > 0)
  mse <- mean(predictions != test.y)
  performance <- rbind(performance, data.frame(Lambda = lambda, MSE = mse))
}

ggplot(performance, aes(x = Lambda, y = MSE)) +
  geom_point() +
  scale_x_log10()

# Two lambda values tie for the lowest error rate; choose the larger one, since it means stronger regularization.

best.lambda <- with(performance, max(Lambda[which(MSE == min(MSE))]))

# Compute the error rate at that lambda; it is about 0.068.

mse <- with(subset(performance, Lambda == best.lambda), MSE)

# Now try an SVM.

library('e1071')

# This step takes a long time: the data set is large, and fitting the linear kernel is slow.

linear.svm.fit <- svm(train.x, train.y, kernel = 'linear')

predictions <- predict(linear.svm.fit, test.x)

predictions <- as.numeric(predictions > 0)

mse <- mean(predictions != test.y)

mse

# The error rate is about 0.128, roughly 12%, worse than the regularized GLM. To get
# the best out of the SVM we should try different values of the cost hyperparameter,
# as sketched below.
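
# A minimal sketch of that cost sweep (my addition), over an illustrative grid; note
# that each fit on this data set is slow, so the loop takes a while:

cost.performance <- data.frame()
for (cost in c(0.1, 1, 10))
{
  fit <- svm(train.x, train.y, kernel = 'linear', cost = cost)
  pred <- as.numeric(predict(fit, test.x) > 0)
  cost.performance <- rbind(cost.performance,
                            data.frame(Cost = cost, ErrorRate = mean(pred != test.y)))
}
cost.performance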

radial.svm.fit <- svm(train.x, train.y, kernel = 'radial')

predictions <- predict(radial.svm.fit, test.x)

predictions <- as.numeric(predictions > 0)

mse <- mean(predictions != test.y)

mse

# The error rate is 0.1421538, higher still, so the radial kernel does not help here.
# That suggests the decision boundary is close to linear, which is why the GLM does better.

# Next, try KNN, which tends to do well when the boundary is nonlinear.

library('class')

knn.fit <- knn(train.x, test.x, train.y, k = 50)

predictions <- as.numeric(as.character(knn.fit))

mse <- mean(predictions != test.y)

mse

# The error rate is 0.1396923, which again points to a roughly linear problem.
# Try different values of k to find the one that works best.

performance <- data.frame()

for (k in seq(5, 50, by = 5))
{
  knn.fit <- knn(train.x, test.x, train.y, k = k)
  predictions <- as.numeric(as.character(knn.fit))
  mse <- mean(predictions != test.y)
  performance <- rbind(performance, data.frame(K = k, MSE = mse))
}

best.k <- with(performance, K[which(MSE == min(MSE))])

best.mse <- with(subset(performance, K == best.k), MSE)

best.mse

# The error rate drops to 0.09169231, so KNN sits between the regularized GLM and the SVM.

# Overall, then, the regularized GLM is the best choice for this data set.
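
# For reference (my addition), the test error rates reported above, gathered into one
# data frame:

results <- data.frame(Model = c('Regularized logistic regression (glmnet)',
                                'Linear-kernel SVM',
                                'Radial-kernel SVM',
                                'KNN (best k)'),
                      TestErrorRate = c(0.068, 0.128, 0.1421538, 0.09169231))
results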

