Python Machine Learning Library scikit-learn practice

Source: Internet
Author: User
Tags: svm

Using Anaconda's Spyder, create train_test.py:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import time
import gzip
import pickle

import numpy as np
from sklearn import metrics


# Multinomial Naive Bayes Classifier
def naive_bayes_classifier(train_x, train_y):
    from sklearn.naive_bayes import MultinomialNB
    model = MultinomialNB(alpha=0.01)
    model.fit(train_x, train_y)
    return model


# KNN Classifier
def knn_classifier(train_x, train_y):
    from sklearn.neighbors import KNeighborsClassifier
    model = KNeighborsClassifier()
    model.fit(train_x, train_y)
    return model


# Logistic Regression Classifier
def logistic_regression_classifier(train_x, train_y):
    from sklearn.linear_model import LogisticRegression
    model = LogisticRegression(penalty='l2')
    model.fit(train_x, train_y)
    return model


# Random Forest Classifier
def random_forest_classifier(train_x, train_y):
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier(n_estimators=8)
    model.fit(train_x, train_y)
    return model


# Decision Tree Classifier
def decision_tree_classifier(train_x, train_y):
    from sklearn import tree
    model = tree.DecisionTreeClassifier()
    model.fit(train_x, train_y)
    return model


# GBDT (Gradient Boosting Decision Tree) Classifier
def gradient_boosting_classifier(train_x, train_y):
    from sklearn.ensemble import GradientBoostingClassifier
    model = GradientBoostingClassifier(n_estimators=200)
    model.fit(train_x, train_y)
    return model


# SVM Classifier
def svm_classifier(train_x, train_y):
    from sklearn.svm import SVC
    model = SVC(kernel='rbf', probability=True)
    model.fit(train_x, train_y)
    return model


# SVM Classifier using cross validation
def svm_cross_validation(train_x, train_y):
    # GridSearchCV moved from the removed sklearn.grid_search module
    # to sklearn.model_selection
    from sklearn.model_selection import GridSearchCV
    from sklearn.svm import SVC
    model = SVC(kernel='rbf', probability=True)
    param_grid = {'C': [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000],
                  'gamma': [0.001, 0.0001]}
    grid_search = GridSearchCV(model, param_grid, n_jobs=1, verbose=1)
    grid_search.fit(train_x, train_y)
    best_parameters = grid_search.best_estimator_.get_params()
    for para, val in best_parameters.items():
        print(para, val)
    model = SVC(kernel='rbf', C=best_parameters['C'],
                gamma=best_parameters['gamma'], probability=True)
    model.fit(train_x, train_y)
    return model


def read_data(data_file):
    # mnist.pkl.gz is a Python 2 pickle; encoding='latin1' lets
    # Python 3 unpickle it
    with gzip.open(data_file, 'rb') as f:
        train, val, test = pickle.load(f, encoding='latin1')
    train_x, train_y = train
    test_x, test_y = test
    return train_x, train_y, test_x, test_y


if __name__ == '__main__':
    data_file = 'mnist.pkl.gz'
    thresh = 0.5
    model_save_file = None
    model_save = {}

    test_classifiers = ['NB', 'KNN', 'LR', 'RF', 'DT', 'SVM', 'GBDT']
    classifiers = {'NB': naive_bayes_classifier,
                   'KNN': knn_classifier,
                   'LR': logistic_regression_classifier,
                   'RF': random_forest_classifier,
                   'DT': decision_tree_classifier,
                   'SVM': svm_classifier,
                   'SVMCV': svm_cross_validation,
                   'GBDT': gradient_boosting_classifier}

    print('reading training and testing data...')
    train_x, train_y, test_x, test_y = read_data(data_file)
    num_train, num_feat = train_x.shape
    num_test, num_feat = test_x.shape
    is_binary_class = (len(np.unique(train_y)) == 2)
    print('******************** Data Info *********************')
    print('#training data: %d, #testing_data: %d, dimension: %d'
          % (num_train, num_test, num_feat))

    for classifier in test_classifiers:
        print('******************* %s ********************' % classifier)
        start_time = time.time()
        model = classifiers[classifier](train_x, train_y)
        print('training took %fs!' % (time.time() - start_time))
        predict = model.predict(test_x)
        if model_save_file is not None:
            model_save[classifier] = model
        if is_binary_class:
            precision = metrics.precision_score(test_y, predict)
            recall = metrics.recall_score(test_y, predict)
            print('precision: %.2f%%, recall: %.2f%%'
                  % (100 * precision, 100 * recall))
        accuracy = metrics.accuracy_score(test_y, predict)
        print('accuracy: %.2f%%' % (100 * accuracy))

    # saving the trained models happens once, after the loop
    if model_save_file is not None:
        pickle.dump(model_save, open(model_save_file, 'wb'))
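The script expects mnist.pkl.gz, the gzipped 50000/10000/10000 train/validation/test pickle distributed with the deeplearning.net tutorials, in the working directory. If that file is unavailable, the sketch below is one hedged alternative: it pulls MNIST from OpenML via scikit-learn's fetch_openml and rebuilds a comparable split. The function name read_data_openml, the [0, 1] pixel scaling, and the 50000/10000 split sizes are assumptions chosen to mimic the original file, not part of the original script.

import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

def read_data_openml():
    # hypothetical stand-in for read_data(): fetch the full 70000-sample
    # MNIST from OpenML instead of reading mnist.pkl.gz
    X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)
    X = X / 255.0              # scale pixels to [0, 1], like mnist.pkl.gz
    y = y.astype(np.int64)     # OpenML returns the labels as strings
    # rebuild a 50000-train / 10000-test split comparable to the pickle
    train_x, test_x, train_y, test_y = train_test_split(
        X, y, train_size=50000, test_size=10000, random_state=0)
    return train_x, train_y, test_x, test_y

With this in place, the read_data(data_file) call in __main__ can be swapped for read_data_openml().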

Result:

reading training and testing data...
******************** Data Info *********************
#training data: 50000, #testing_data: 10000, dimension: 784
******************* NB ********************
training took 0.558000s!
accuracy: 83.69%
******************* KNN ********************
training took 29.467000s!
accuracy: 96.64%
******************* LR ********************
training took 104.605000s!
accuracy: 91.98%
******************* RF ********************
training took 4.401000s!
accuracy: 93.91%
******************* DT ********************
training took 26.923000s!
accuracy: 87.07%
******************* SVM ********************
training took 3831.564000s!
accuracy: 94.35%
******************* GBDT ********************

On this dataset the classes form well-separated clusters (if you know this database, you can see it from its t-SNE embedding diagram; because the task is simple, it has come to be regarded as a toy dataset in the deep learning community), so even plain KNN performs quite well. GBDT is a very good algorithm; in data-mining competitions such as Kaggle, it often appears at the top of the leaderboard.
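To visualize the cluster structure just mentioned, here is a minimal t-SNE sketch (an addition, not from the original post); it assumes matplotlib is available and runs scikit-learn's TSNE on a small subsample, since t-SNE is slow on the full 50000-point training set. The function name plot_tsne and the 2000-sample size are illustrative choices.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

def plot_tsne(train_x, train_y, n_samples=2000):
    # subsample for speed: t-SNE scales poorly with the number of points
    rng = np.random.RandomState(0)
    idx = rng.choice(len(train_x), n_samples, replace=False)
    # project the 784-dimensional digit vectors down to 2-D
    emb = TSNE(n_components=2, random_state=0).fit_transform(train_x[idx])
    plt.scatter(emb[:, 0], emb[:, 1], c=train_y[idx], cmap='tab10', s=5)
    plt.colorbar(label='digit class')
    plt.title('t-SNE embedding of an MNIST subset')
    plt.show()

Calling plot_tsne(train_x, train_y) after read_data in the script above should show the ten digit classes as fairly distinct clusters.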

 
