This post covers the implementation of logistic regression in R, model checking, evaluation metrics (accuracy, precision, recall, F-measure), ROC curves, and 10-fold cross-validation.
Reference blogs: http://blog.csdn.net/tiaaaaa/article/details/58116346 ; http://blog.csdn.net/ai_vivi/article/details/43836641
1. Training and test sets (7:3 split). Data source: http://archive.ics.uci.edu/ml/datasets/statlog+(australian+credit+approval)
austra <- read.table("australian.dat")
head(austra)                # preview the first 6 rows
N <- length(austra$V15)     # 690 rows, 15 columns
# ind == 1 -> training set, ind == 2 -> test set, with probabilities 0.7 and 0.3
ind <- sample(2, N, replace = TRUE, prob = c(0.7, 0.3))
aus_train <- austra[ind == 1, ]
aus_test  <- austra[ind == 2, ]
2. Fitting and prediction with logistic regression
pre <- glm(V15 ~ ., data = aus_train, family = binomial(link = "logit"))
summary(pre)
real <- aus_test$V15
predict_ <- predict.glm(pre, type = "response", newdata = aus_test)
predict <- ifelse(predict_ > 0.5, 1, 0)
aus_test$predict <- predict
head(aus_test)
# write.csv(aus_test, "aus_test.csv")
3. Model Checking
res <- data.frame(real, predict)
n <- nrow(aus_train)
# Cox-Snell goodness of fit
R2 <- 1 - exp((pre$deviance - pre$null.deviance) / n)
cat("Cox-Snell R2 =", R2, "\n")
# Nagelkerke goodness of fit
R2 <- R2 / (1 - exp(-pre$null.deviance / n))
cat("Nagelkerke R2 =", R2, "\n")   # Nagelkerke R2 = 0.7379711
# other model diagnostics:
# residuals(pre)      # residuals
# coefficients(pre)   # coefficients
# anova(pre)          # analysis of deviance
4. Accuracy and precision
true_value <- aus_test[, 15]
predict_value <- aus_test[, 16]
# model accuracy: the proportion of correct predictions among all test samples
error <- predict_value - true_value
accuracy <- (nrow(aus_test) - sum(abs(error))) / nrow(aus_test)
# quantities from the confusion matrix (the confusion matrix is explained below)
# precision: of the samples predicted positive, how many are truly positive
precision <- sum(true_value & predict_value) / sum(predict_value)
# recall: of the truly positive samples, how many are predicted positive
recall <- sum(predict_value & true_value) / sum(true_value)
# precision and recall can pull in opposite directions, so they are usually
# combined in the F-measure (F-score), their weighted harmonic mean, which
# serves as a single comprehensive evaluation metric
f_measure <- 2 * precision * recall / (precision + recall)
# print the results above
print(accuracy)
print(precision)
print(recall)
print(f_measure)
5. ROC Curve
# ROC curve (explained in detail below)

# Method 1: ROCR
# install.packages("ROCR")
library(ROCR)
# predicted probabilities (before thresholding at 0.5) and true values
pred <- prediction(predict_, true_value)
performance(pred, 'auc')@y.values   # AUC value
perf <- performance(pred, 'tpr', 'fpr')
plot(perf)

# Method 2: pROC
# install.packages("pROC")
library(pROC)
modelroc <- roc(true_value, predict_)
# draw the ROC curve, mark the best threshold, and print the AUC value
plot(modelroc, print.auc = TRUE, auc.polygon = TRUE, legacy.axes = TRUE,
     grid = c(0.1, 0.2), grid.col = c("green", "red"),
     max.auc.polygon = TRUE, auc.polygon.col = "skyblue", print.thres = TRUE)

# Method 3: build the ROC curve by hand from its definition
tpr <- rep(0, 1000)
fpr <- rep(0, 1000)
p <- predict_
for (i in 1:1000) {
  p0 <- i / 1000
  ypred <- 1 * (p > p0)
  tpr[i] <- sum(ypred * true_value) / sum(true_value)
  fpr[i] <- sum(ypred * (1 - true_value)) / sum(1 - true_value)
}
plot(fpr, tpr, type = "l", col = 2)
points(c(0, 1), c(0, 1), type = "l", lty = 2)
6. An alternative way to choose the training and test sets: 10-fold cross-validation
australian <- read.table("australian.dat")
# split the australian data into ten random folds
# install.packages("caret")
set.seed(7)   # fix the grouping produced by createFolds
library(caret)
folds <- createFolds(y = australian$V15, k = 10)
# for loop: test-set and training-set accuracy for each of the 10 folds
max <- 0
num <- 0
for (i in 1:10) {
  fold_test <- australian[folds[[i]], ]    # folds[[i]] as the test set
  fold_train <- australian[-folds[[i]], ]  # the remaining data as the training set
  print("**********")
  fold_pre <- glm(V15 ~ ., family = binomial(link = 'logit'), data = fold_train)
  fold_predict <- predict(fold_pre, type = 'response', newdata = fold_test)
  fold_predict <- ifelse(fold_predict > 0.5, 1, 0)
  fold_test$predict <- fold_predict
  fold_error <- fold_test[, 16] - fold_test[, 15]
  fold_accuracy <- (nrow(fold_test) - sum(abs(fold_error))) / nrow(fold_test)
  print(i)
  print("***test-set accuracy***")
  print(fold_accuracy)
  print("***training-set accuracy***")
  fold_predict2 <- predict(fold_pre, type = 'response', newdata = fold_train)
  fold_predict2 <- ifelse(fold_predict2 > 0.5, 1, 0)
  fold_train$predict <- fold_predict2
  fold_error2 <- fold_train[, 16] - fold_train[, 15]
  fold_accuracy2 <- (nrow(fold_train) - sum(abs(fold_error2))) / nrow(fold_train)
  print(fold_accuracy2)
  if (fold_accuracy > max) {
    max <- fold_accuracy
    num <- i
  }
}
print(max)
print(num)
# the test-set accuracy is maximal (max) when folds[[num]] is the test set
# and the rest is the training set
7. Accuracy of the best fold from 10-fold cross-validation
# the fold with the highest test-set accuracy among the ten folds
testi <- australian[folds[[num]], ]
traini <- australian[-folds[[num]], ]   # the remaining folds as the training set
prei <- glm(V15 ~ ., family = binomial(link = 'logit'), data = traini)
predicti <- predict.glm(prei, type = 'response', newdata = testi)
predicti <- ifelse(predicti > 0.5, 1, 0)
testi$predict <- predicti
# write.csv(testi, "ausfold_test.csv")
errori <- testi[, 16] - testi[, 15]
accuracyi <- (nrow(testi) - sum(abs(errori))) / nrow(testi)
# training-set accuracy for the same fold
predicti2 <- predict.glm(prei, type = 'response', newdata = traini)
predicti2 <- ifelse(predicti2 > 0.5, 1, 0)
traini$predict <- predicti2
errori2 <- traini[, 16] - traini[, 15]
accuracyi2 <- (nrow(traini) - sum(abs(errori2))) / nrow(traini)
# test-set accuracy (fold num) and training-set accuracy
accuracyi; num; accuracyi2
# write.csv(traini, "ausfold_train.csv")
Confusion matrix

                      Predicted 1             Predicted 0
  Real 1 (positive)   True Positive (TP)      False Negative (FN)     Actual positive: TP + FN
  Real 0 (negative)   False Positive (FP)     True Negative (TN)      Actual negative: FP + TN
                      Predicted positive:     Predicted negative:     Total: TP + FN + FP + TN
                      TP + FP                 FN + TN
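The four cells of the table above can be obtained directly with R's table() function. A minimal sketch on toy 0/1 vectors (the values here are illustrative, not from the Australian data):

```r
# toy label vectors, purely illustrative
true_value    <- c(1, 1, 1, 0, 0, 0, 1, 0)
predict_value <- c(1, 0, 1, 0, 1, 0, 1, 0)

# cross-tabulate real vs. predicted labels
cm <- table(Real = true_value, Predicted = predict_value)
tp <- cm["1", "1"]   # real 1, predicted 1
fn <- cm["1", "0"]   # real 1, predicted 0
fp <- cm["0", "1"]   # real 0, predicted 1
tn <- cm["0", "0"]   # real 0, predicted 0
print(cm)
```

For these toy vectors the cells are TP = 3, FN = 1, FP = 1, TN = 3.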
accuracy rate: (TP + TN) / (TP + TN + FN + FP)
error rate: (FN + FP) / (TP + TN + FN + FP)
Recall (recall rate, hit rate): TP / (TP + FN), the fraction of ground-truth positive samples that are identified as positive;
Precision: TP / (TP + FP), the fraction of samples identified as positive that are true positives;
TPR (true positive rate): TP / (TP + FN), which is exactly Recall;
FAR (false acceptance rate), also called FPR (false positive rate): FP / (FP + TN), the false-alarm rate: the fraction of ground-truth negative samples that are identified as positive;
FRR (false rejection rate): FN / (TP + FN), the false-rejection rate: the fraction of ground-truth positive samples that are identified as negative; it equals 1 - Recall.
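To make the formulas concrete, here is a small worked example with made-up confusion-matrix counts (the numbers are illustrative, not results from the Australian data):

```r
# toy confusion-matrix counts, purely illustrative
TP <- 40; FN <- 10; FP <- 5; TN <- 45

accuracy   <- (TP + TN) / (TP + TN + FN + FP)   # 85 / 100 = 0.85
error_rate <- (FN + FP) / (TP + TN + FN + FP)   # 15 / 100 = 0.15
recall     <- TP / (TP + FN)                    # 40 / 50  = 0.8  (= TPR)
precision  <- TP / (TP + FP)                    # 40 / 45
fpr        <- FP / (FP + TN)                    # 5 / 50   = 0.1  (= FAR)
frr        <- FN / (TP + FN)                    # 10 / 50  = 0.2  (= 1 - recall)
f_measure  <- 2 * precision * recall / (precision + recall)
```

Note that frr comes out to exactly 1 - recall, as stated above.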
ROC Curve (receiver operating characteristic curve)
The horizontal axis is FAR (FPR) and the vertical axis is Recall (TPR).
The classification result at each threshold corresponds to one point (FPR, TPR). When the threshold is maximal, all samples are classified as negative, corresponding to the lower-left corner point (0, 0); when the threshold is minimal, all samples are classified as positive, corresponding to the upper-right corner point (1, 1). As the threshold moves from maximal to minimal, both TP and FP increase gradually.
A good classification model should lie as close as possible to the upper-left corner of the plot, while a random-guessing model lies on the main diagonal connecting (TPR = 0, FPR = 0) and (TPR = 1, FPR = 1).
An algorithm can be measured by the AUC (area under the ROC curve): a perfect model has AUC = 1, a pure random-guessing model has AUC = 0.5, and if one model outperforms another it has the larger area under the curve.
EER (equal error rate): FAR and FRR are two parameters of the same algorithm, plotted in the same coordinate system as functions of the threshold. FAR decreases as the threshold increases, while FRR increases with the threshold, so the two curves must intersect. The intersection is the point where FAR and FRR are equal at some threshold, and this value is customarily used to summarize the overall performance of the algorithm. For a good algorithm (e.g. a fingerprint matcher), both FAR and FRR should be as small as possible at the same threshold.
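The EER can be located with the same threshold sweep used in ROC "method 3" above. A sketch on simulated scores (the rbeta-generated data here is an assumption standing in for the model's predicted probabilities predict_ and labels true_value):

```r
# simulated scores: positives tend to score high, negatives low (illustrative)
set.seed(1)
true_value <- rbinom(200, 1, 0.5)
predict_ <- ifelse(true_value == 1, rbeta(200, 4, 2), rbeta(200, 2, 4))

# sweep the threshold; FAR falls and FRR rises as the threshold grows
far <- rep(0, 1000)
frr <- rep(0, 1000)
for (i in 1:1000) {
  p0 <- i / 1000
  ypred <- 1 * (predict_ > p0)
  far[i] <- sum(ypred * (1 - true_value)) / sum(1 - true_value)  # FP / (FP + TN)
  frr[i] <- sum((1 - ypred) * true_value) / sum(true_value)      # FN / (TP + FN)
}
i_eer <- which.min(abs(far - frr))   # index where the two curves cross
c(threshold = i_eer / 1000, EER = (far[i_eer] + frr[i_eer]) / 2)
```

Plotting far and frr against the threshold shows the two curves and their intersection directly.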