# Data preparation: download the Wisconsin breast-cancer data set from the
# UCI machine-learning repository and split it 70/30 into train/validate.
loc <- "https://archive.ics.uci.edu/ml/machine-learning-databases/"
ds <- "breast-cancer-wisconsin/breast-cancer-wisconsin.data"
url <- paste(loc, ds, sep = "")
# Missing values are coded as "?" in the raw file.
breast <- read.table(url, sep = ",", header = FALSE, na.strings = "?")
names(breast) <- c("ID", "clumpThickness", "sizeUniformity",
                   "shapeUniformity", "maginalAdhesion",
                   "singleEpithelialCellSize", "bareNuclei",
                   "blandChromatin", "normalNucleoli", "mitosis", "class")
df <- breast[-1]  # drop the ID column; it carries no predictive information
# 2 = benign, 4 = malignant in the raw coding
df$class <- factor(df$class, levels = c(2, 4),
                   labels = c("benign", "malignant"))
set.seed(1234)  # reproducible train/validate split
train <- sample(nrow(df), 0.7 * nrow(df))
df.train <- df[train, ]      # 70% of rows for training
df.validate <- df[-train, ]  # remaining 30% for validation
table(df.train$class)
Benign malignant
329 160
table(df.validate$class)
Benign malignant
129 81
# Logistic regression: model the probability of malignancy from all
# nine cytology predictors in the training set.
fit.logit <- glm(class ~ ., data = df.train, family = binomial())
summary(fit.logit)
Call:
glm(formula = class ~ ., family = binomial(), data = df.train)
Deviance residuals:
Min 1Q Median 3Q Max
-2.75813  -0.10602  -0.05679   0.01237   2.64317
Coefficients:
                          Estimate Std. Error z value Pr(>|z|)
(Intercept)              -10.42758    1.47602  -7.065 1.61e-12 ***
clumpThickness             0.52434    0.15950   3.287  0.00101 **
sizeUniformity            -0.04805    0.25706  -0.187  0.85171
shapeUniformity            0.42309    0.26775   1.580  0.11407
maginalAdhesion            0.29245    0.14690   1.991  0.04650 *
singleEpithelialCellSize   0.11053    0.17980   0.615  0.53871
bareNuclei                 0.33570    0.10715   3.133  0.00173 **
blandChromatin             0.42353    0.20673   2.049  0.04049 *
normalNucleoli             0.28888    0.13995   2.064  0.03900 *
mitosis                    0.69057    0.39829   1.734  0.08295 .
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance:612.063 on 482 degrees of freedom
Residual deviance:71.346 on 473 degrees of freedom
(6 observations deleted due to missingness)
aic:91.346
Number of Fisher Scoring iterations: 8
# Classify the validation cases: predicted probability > 0.5 => malignant.
prob <- predict(fit.logit, df.validate, type = "response")
# NOTE: the original had the typo label "Begin"; corrected to "benign" so the
# confusion-matrix rows and columns use the same level names.
logit.pred <- factor(prob > .5, levels = c(FALSE, TRUE),
                     labels = c("benign", "malignant"))
# Cross-tabulate actual vs. predicted class (the confusion matrix).
logit.perf <- table(df.validate$class, logit.pred,
                    dnn = c("Actual", "Predicted"))
logit.perf
Predicted
Actual      benign malignant
Benign 118 2
Malignant 4 76
# Classical decision tree grown with rpart, using the information
# (entropy) splitting criterion.
library(rpart)
set.seed(1234)  # cross-validation error (xerror) depends on the RNG state
dtree <- rpart(class ~ ., data = df.train, method = "class",
               parms = list(split = "information"))
# Complexity-parameter table: used below to pick the pruning threshold.
dtree$cptable
CP (complexity parameter)  nsplit (number of splits)  rel error (relative error)  xerror (10-fold cross-validation error)  xstd (standard deviation of the cross-validation error)
1 0.800000 0 1.00000 1.00000 0.06484605
2 0.046875 1 0.20000 0.30625 0.04150018
3 0.012500 3 0.10625 0.20625 0.03467089
4 0.010000 4 0.09375 0.18125 0.03264401
# Prune back to the subtree whose cp sits in the one-SE band of the
# cross-validation error (cp = 0.0125 from the cptable above).
dtree.pruned <- prune(dtree, cp = .0125)
library(rpart.plot)
# type = 2 labels splits below the node; extra = 104 shows class
# probabilities and observation percentages at each node.
prp(dtree.pruned, type = 2, extra = 104,
    fallen.leaves = TRUE, main = "Decision Tree")
# Classify the validation set with the pruned tree and tabulate
# the confusion matrix.
dtree.pred <- predict(dtree.pruned, df.validate, type = "class")
dtree.perf <- table(df.validate$class, dtree.pred,
                    dnn = c("Actual", "Predicted"))
dtree.perf
Predicted
Actual Benign malignant
Benign 122 7
Malignant 2 79
# Conditional inference tree (party package): splits are chosen by
# permutation significance tests rather than impurity, so no pruning
# step is needed.
# install.packages("party")  # run once, not on every execution
library(party)
fit.ctree <- ctree(class ~ ., data = df.train)
plot(fit.ctree, main = "Conditional Inference Tree")
# Validation-set predictions and confusion matrix for the
# conditional inference tree.
ctree.pred <- predict(fit.ctree, df.validate, type = "response")
ctree.perf <- table(df.validate$class, ctree.pred,
                    dnn = c("Actual", "Predicted"))
ctree.perf
Predicted
Actual Benign malignant
Benign 122 7
Malignant 3 78
# Random forest: 500 trees by default; na.roughfix imputes missing
# predictor values (median/mode); importance = TRUE records variable
# importance measures for later inspection.
# install.packages("randomForest")  # run once, not on every execution
library(randomForest)
set.seed(1234)  # forest growth is stochastic; fix the seed for reproducibility
fit.forest <- randomForest(class ~ ., data = df.train,
                           na.action = na.roughfix, importance = TRUE)
fit.forest
Call:
randomForest(formula = class ~ ., data = df.train, importance = TRUE, na.action = na.roughfix)
Type of Random Forest:classification
Number of trees:500
No. of variables tried at each split:3
OOB estimate of error rate:3.68%
Confusion Matrix:
Benign malignant class.error
Benign 319 10 0.03039514
Malignant 8 152 0.05000000
# Variable importance (type = 2: mean decrease in node impurity / Gini).
importance(fit.forest, type = 2)
# Validation-set predictions and confusion matrix for the random forest.
forest.pred <- predict(fit.forest, df.validate)
forest.perf <- table(df.validate$class, forest.pred,
                     dnn = c("Actual", "Predicted"))
forest.perf
Predicted
Actual Benign malignant
Benign 117 3
Malignant 1 79
# Support vector machine with default radial-basis kernel
# (cost = 1, gamma = 1/ncol defaults from e1071).
# install.packages("e1071")  # run once, not on every execution
library(e1071)
set.seed(1234)
fit.svm <- svm(class ~ ., data = df.train)
fit.svm
Call:
SVM (Formula = Class ~., data = Df.train)
Parameters:
Svm-type:c-classification
Svm-kernel:radial
Cost:1
gamma:0.1111111
Number of support vectors:76
# svm() cannot score rows with missing predictors, so drop incomplete
# validation cases consistently on both sides of the table.
svm.pred <- predict(fit.svm, na.omit(df.validate))
svm.perf <- table(na.omit(df.validate)$class, svm.pred,
                  dnn = c("Actual", "Predicted"))
svm.perf
Predicted
Actual Benign malignant
Benign 116 4
Malignant 3 77
# Tune the RBF-kernel SVM: grid-search gamma and cost via 10-fold
# cross-validation on the training set.
set.seed(1234)  # the CV folds are random
tuned <- tune.svm(class ~ ., data = df.train,
                  gamma = 10^(-6:1), cost = 10^(-10:10))
tuned
Parameter tuning of 'svm':
- sampling method: 10-fold cross validation
-Best Parameters:
Gamma cost
0.01 1
-Best performance:0.02904092
# Refit the SVM with the best parameters found by tune.svm
# (gamma = 0.01, cost = 1) and evaluate on the complete validation cases.
fit.svm <- svm(class ~ ., data = df.train, gamma = .01, cost = 1)
svm.pred <- predict(fit.svm, na.omit(df.validate))
svm.perf <- table(na.omit(df.validate)$class, svm.pred,
                  dnn = c("Actual", "Predicted"))
svm.perf
Predicted
Actual Benign malignant
Benign 117 3
Malignant 3 77
# Evaluate binary-classification accuracy from a 2 x 2 confusion matrix
# laid out as rows = Actual (negative, positive), cols = Predicted.
#
# Args:
#   table: a 2 x 2 confusion matrix (class "table" or matrix).
#   n:     number of decimal places to round the statistics to.
#
# Prints sensitivity, specificity, positive/negative predictive value,
# and overall accuracy. Stops with an error for non-2x2 input.
performance <- function(table, n = 2) {
  # Bug fix: the original tested all(table) == c(2, 2); the intended
  # check is on the table's dimensions.
  if (!all(dim(table) == c(2, 2))) {
    stop("Must be a 2 x 2 table")
  }
  tn <- table[1, 1]  # true negatives
  fp <- table[1, 2]  # false positives
  fn <- table[2, 1]  # false negatives
  tp <- table[2, 2]  # true positives
  sensitivity <- tp / (tp + fn)
  specificity <- tn / (tn + fp)
  ppp <- tp / (tp + fp)  # positive predictive value
  npp <- tn / (tn + fn)  # negative predictive value
  hitrate <- (tp + tn) / (tp + tn + fp + fn)
  result <- paste0(
    "Sensitivity = ", round(sensitivity, n),
    "\nSpecificity = ", round(specificity, n),
    "\nPositive Predictive Value = ", round(ppp, n),
    "\nNegative Predictive Value = ", round(npp, n),
    "\nAccuracy = ", round(hitrate, n), "\n"
  )
  cat(result)
}
performance(logit.perf)
sensitivity=0.95
specificity = 0.98
Positive Predictive value=0.97
Negative predictive value=0.97
accuracy=0.97
performance(dtree.perf)
sensitivity=0.98
specificity = 0.95
Positive Predictive value=0.92
Negative predictive value=0.98
accuracy=0.96
performance(ctree.perf)
sensitivity=0.96
specificity = 0.95
Positive Predictive value=0.92
Negative predictive value=0.98
accuracy=0.95
performance(ctree.perf)
sensitivity=0.96
specificity = 0.95
Positive Predictive value=0.92
Negative predictive value=0.98
accuracy=0.95
performance(forest.perf)
sensitivity=0.99
specificity = 0.98
Positive Predictive value=0.96
Negative predictive value=0.99
accuracy=0.98
performance(svm.perf)
sensitivity=0.96
specificity = 0.98
Positive Predictive value=0.96
Negative predictive value=0.98
accuracy=0.97
# Rattle package demo: load the Pima Indians diabetes data and launch
# the Rattle GUI for interactive model building.
library(rattle)
loc <- "https://archive.ics.uci.edu/ml/machine-learning-databases/"
ds <- "pima-indians-diabetes/pima-indians-diabetes.data"
url <- paste(loc, ds, sep = "")
diabetes <- read.table(url, sep = ",", header = FALSE)
names(diabetes) <- c("npregant", "plasma", "bp", "triceps", "insulin",
                     "bmi", "pedigree", "age", "class")
# 0 = normal, 1 = diabetic in the raw coding
diabetes$class <- factor(diabetes$class, levels = c(0, 1),
                         labels = c("normal", "diabetic"))
rattle()  # opens the Rattle GUI window
# Score a confusion matrix produced in the Rattle session:
# matrix() fills column-wise, so this is rows = Actual, cols = Predicted.
cv <- matrix(c(145, 50, 8, 27), nrow = 2)
performance(as.table(cv))
R Language Learning Note (15): Classification