With Anaconda's Spyder, create a new train_test.py:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import os
import time
from sklearn import metrics
import numpy as np
import cPickle as pickle

reload(sys)
sys.setdefaultencoding('utf8')

# Multinomial Naive Bayes Classifier
def naive_bayes_classifier(train_x, train_y):
    from sklearn.naive_bayes import MultinomialNB
    model = MultinomialNB(alpha=0.01)
    model.fit(train_x, train_y)
    return model

# KNN Classifier
def knn_classifier(train_x, train_y):
    from sklearn.neighbors import KNeighborsClassifier
    model = KNeighborsClassifier()
    model.fit(train_x, train_y)
    return model

# Logistic Regression Classifier
def logistic_regression_classifier(train_x, train_y):
    from sklearn.linear_model import LogisticRegression
    model = LogisticRegression(penalty='l2')
    model.fit(train_x, train_y)
    return model

# Random Forest Classifier
def random_forest_classifier(train_x, train_y):
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier(n_estimators=8)
    model.fit(train_x, train_y)
    return model

# Decision Tree Classifier
def decision_tree_classifier(train_x, train_y):
    from sklearn import tree
    model = tree.DecisionTreeClassifier()
    model.fit(train_x, train_y)
    return model

# GBDT (Gradient Boosting Decision Tree) Classifier
def gradient_boosting_classifier(train_x, train_y):
    from sklearn.ensemble import GradientBoostingClassifier
    model = GradientBoostingClassifier(n_estimators=200)
    model.fit(train_x, train_y)
    return model

# SVM Classifier
def svm_classifier(train_x, train_y):
    from sklearn.svm import SVC
    model = SVC(kernel='rbf', probability=True)
    model.fit(train_x, train_y)
    return model

# SVM Classifier using cross validation
def svm_cross_validation(train_x, train_y):
    from sklearn.grid_search import GridSearchCV
    from sklearn.svm import SVC
    model = SVC(kernel='rbf', probability=True)
    param_grid = {'C': [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000], 'gamma': [0.001, 0.0001]}
    grid_search = GridSearchCV(model, param_grid, n_jobs=1, verbose=1)
    grid_search.fit(train_x, train_y)
    best_parameters = grid_search.best_estimator_.get_params()
    for para, val in best_parameters.items():
        print para, val
    # Refit with the best hyperparameters found by the grid search
    model = SVC(kernel='rbf', C=best_parameters['C'], gamma=best_parameters['gamma'], probability=True)
    model.fit(train_x, train_y)
    return model

def read_data(data_file):
    # mnist.pkl.gz holds a (train, validation, test) tuple of (x, y) pairs
    import gzip
    f = gzip.open(data_file, "rb")
    train, val, test = pickle.load(f)
    f.close()
    train_x = train[0]
    train_y = train[1]
    test_x = test[0]
    test_y = test[1]
    return train_x, train_y, test_x, test_y

if __name__ == '__main__':
    data_file = "mnist.pkl.gz"
    thresh = 0.5
    model_save_file = None
    model_save = {}

    test_classifiers = ['NB', 'KNN', 'LR', 'RF', 'DT', 'SVM', 'GBDT']
    classifiers = {'NB': naive_bayes_classifier,
                   'KNN': knn_classifier,
                   'LR': logistic_regression_classifier,
                   'RF': random_forest_classifier,
                   'DT': decision_tree_classifier,
                   'SVM': svm_classifier,
                   'SVMCV': svm_cross_validation,
                   'GBDT': gradient_boosting_classifier}

    print 'reading training and testing data...'
    train_x, train_y, test_x, test_y = read_data(data_file)
    num_train, num_feat = train_x.shape
    num_test, num_feat = test_x.shape
    is_binary_class = (len(np.unique(train_y)) == 2)
    print '******************** Data Info *********************'
    print '#training data: %d, #testing_data: %d, dimension: %d' % (num_train, num_test, num_feat)

    for classifier in test_classifiers:
        print '******************* %s ********************' % classifier
        start_time = time.time()
        model = classifiers[classifier](train_x, train_y)
        print 'training took %fs!' % (time.time() - start_time)
        predict = model.predict(test_x)
        if model_save_file != None:
            model_save[classifier] = model
        # Precision/recall are only reported for binary problems
        if is_binary_class:
            precision = metrics.precision_score(test_y, predict)
            recall = metrics.recall_score(test_y, predict)
            print 'precision: %.2f%%, recall: %.2f%%' % (100 * precision, 100 * recall)
        accuracy = metrics.accuracy_score(test_y, predict)
        print 'accuracy: %.2f%%' % (100 * accuracy)

    if model_save_file != None:
        pickle.dump(model_save, open(model_save_file, 'wb'))
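Note that the script above is Python 2 code written against an old scikit-learn (print statements, cPickle, reload(sys), and sklearn.grid_search, which later moved to sklearn.model_selection). As a rough guide only, a minimal Python 3 sketch of the same benchmark loop might look like the following; fetching MNIST via fetch_openml instead of the mnist.pkl.gz pickle, and the 50000/10000 split, are my assumptions, not part of the original post.

# Python 3 sketch (assumptions: modern scikit-learn, MNIST from OpenML
# instead of mnist.pkl.gz; not the original script).
import time
from sklearn import metrics
from sklearn.datasets import fetch_openml
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)
train_x, train_y = X[:50000], y[:50000]   # assumed split, mirroring the post
test_x, test_y = X[60000:], y[60000:]     # last 10000 samples held out

classifiers = {'KNN': KNeighborsClassifier(),
               'RF': RandomForestClassifier(n_estimators=8)}

for name, model in classifiers.items():
    start_time = time.time()
    model.fit(train_x, train_y)
    print('%s training took %fs!' % (name, time.time() - start_time))
    predict = model.predict(test_x)
    print('accuracy: %.2f%%' % (100 * metrics.accuracy_score(test_y, predict)))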
Results:
reading training and testing data...
******************** Data Info *********************
#training data: 50000, #testing_data: 10000, dimension: 784
******************* NB ********************
training took 0.558000s!
accuracy: 83.69%
******************* KNN ********************
training took 29.467000s!
accuracy: 96.64%
******************* LR ********************
training took 104.605000s!
accuracy: 91.98%
******************* RF ********************
training took 4.401000s!
accuracy: 93.91%
******************* DT ********************
training took 26.923000s!
accuracy: 87.07%
******************* SVM ********************
training took 3831.564000s!
accuracy: 94.35%
******************* GBDT ********************
On this dataset the classes form well-separated clusters (if you know this database, you can see this in its t-SNE map), and the task is simple enough that MNIST is regarded as a toy dataset in the deep learning community; this is why KNN performs so well here. GBDT is also a very strong algorithm: in Kaggle and other big-data competitions, it frequently appears in the top-ranked solutions.
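For reference, the t-SNE map mentioned above can be produced with scikit-learn itself. The sketch below is a minimal example (not from the original post) that embeds a subsample of MNIST into 2-D; the use of fetch_openml and the subsample size of 2000 are my assumptions.

# Minimal t-SNE sketch (assumption: modern scikit-learn with
# fetch_openml; not part of the original Python 2 script).
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.manifold import TSNE

# Load MNIST and take a small subsample -- t-SNE is slow on 70k points.
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)
idx = np.random.RandomState(0).choice(len(X), 2000, replace=False)

# Embed the 784-dimensional images into 2-D.
emb = TSNE(n_components=2, random_state=0).fit_transform(X[idx])

# Color each point by its digit label to show the class clusters.
plt.scatter(emb[:, 0], emb[:, 1], c=y[idx].astype(int), cmap='tab10', s=5)
plt.colorbar()
plt.show()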
Python Machine Learning Library Scikit-learn Practice