Machine Learning: Wine Classification
Data source: http://archive.ics.uci.edu/ml/datasets/Wine
Reference: "Machine Learning: Python Practice" by Wei Zhenyuan
Purpose of this post: review
Tool: Geany
# Import libraries
from pandas import read_csv  # read the data
from pandas.plotting import scatter_matrix  # scatter-plot matrix
from pandas import set_option  # set display precision
import numpy as np
import matplotlib.pyplot as plt  # plotting
from sklearn.preprocessing import Normalizer  # preprocessing: normalization
from sklearn.preprocessing import StandardScaler  # preprocessing: standardization (zero mean, unit variance)
from sklearn.preprocessing import MinMaxScaler  # preprocessing: rescale to a given range
from sklearn.model_selection import train_test_split  # split the dataset
from sklearn.model_selection import cross_val_score  # estimate algorithm accuracy
from sklearn.model_selection import KFold  # cross-validation
from sklearn.model_selection import GridSearchCV  # hyperparameter tuning by grid search
from sklearn.linear_model import LinearRegression  # linear regression
from sklearn.linear_model import Lasso  # lasso regression
from sklearn.linear_model import ElasticNet  # elastic-net regression
from sklearn.linear_model import LogisticRegression  # logistic regression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis  # linear discriminant analysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis  # quadratic discriminant analysis
from sklearn.tree import DecisionTreeRegressor  # decision-tree regression
from sklearn.tree import DecisionTreeClassifier  # decision-tree classification
from sklearn.neighbors import KNeighborsRegressor  # KNN regression
from sklearn.neighbors import KNeighborsClassifier  # KNN classification
from sklearn.naive_bayes import GaussianNB  # Gaussian naive Bayes classifier
from sklearn.svm import SVR  # support vector machine regression
from sklearn.svm import SVC  # support vector machine classification
from sklearn.pipeline import Pipeline  # automates the workflow from data transformation to model evaluation (see the sketch after the model-evaluation loops)
from sklearn.ensemble import RandomForestRegressor  # random-forest regression
from sklearn.ensemble import RandomForestClassifier  # random-forest classification
from sklearn.ensemble import GradientBoostingRegressor  # stochastic gradient boosting regression
from sklearn.ensemble import GradientBoostingClassifier  # stochastic gradient boosting classification
from sklearn.ensemble import ExtraTreesRegressor  # extra-trees regression
from sklearn.ensemble import ExtraTreesClassifier  # extra-trees classification
from sklearn.ensemble import AdaBoostRegressor  # AdaBoost regression
from sklearn.ensemble import AdaBoostClassifier  # AdaBoost classification
from sklearn.metrics import mean_squared_error  # mean squared error
from sklearn.metrics import accuracy_score  # classification accuracy
from sklearn.metrics import confusion_matrix  # confusion matrix
from sklearn.metrics import classification_report  # classification report
# Load the data
filename = 'wine.csv'
data = read_csv(filename, header=None, delimiter=',')
# Understand the data
print(data.shape)
#print(data.dtypes)
#print(data.corr(method='pearson'))
#print(data.describe())
#print(data.groupby(0).size())
# Visualize the data: histograms, density plots, scatter-plot matrix, correlation-matrix plot
# Histograms
#data.hist()
#plt.show()
# Density plots (layout=(4, 4) is assumed here; the original value was garbled)
#data.plot(kind='density', subplots=True, layout=(4, 4), sharex=False, sharey=False)
#plt.show()
# Scatter-plot matrix
#scatter_matrix(data)
#plt.show()
# Correlation-matrix plot
#fig = plt.figure()
#ax = fig.add_subplot(111)
#cax = ax.matshow(data.corr(), vmin=-1, vmax=1)
#fig.colorbar(cax)
#plt.show()
# Preprocess the data: rescaling, normalization, standardization (binarization is a fourth option, not used here)
array = data.values
X = array[:, 1:14].astype(float)  # the 13 chemical features
Y = array[:, 0]                   # class label in column 0
scaler = MinMaxScaler(feature_range=(0, 1)).fit(X)
X_m = scaler.transform(X)  # rescaled to [0, 1]
scaler = Normalizer().fit(X)
X_n = scaler.transform(X)  # normalized (unit-norm rows)
scaler = StandardScaler().fit(X)
X_s = scaler.transform(X)  # standardized (zero mean, unit variance)
# Split out a hold-out test set for each version of the data
validation_size = 0.2
seed = 7
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=validation_size, random_state=seed)
X_m_train, X_m_test, y_m_train, y_m_test = train_test_split(X_m, Y, test_size=validation_size, random_state=seed)
X_n_train, X_n_test, y_n_train, y_n_test = train_test_split(X_n, Y, test_size=validation_size, random_state=seed)
X_s_train, X_s_test, y_s_train, y_s_test = train_test_split(X_s, Y, test_size=validation_size, random_state=seed)
# Choose models (this example is a classification problem)
# Nonlinear: KNN, SVC, CART, GaussianNB, QDA
# Linear: LDA (regression models such as LinearRegression, Lasso and ElasticNet do not apply to classification, so they stay commented out)
models = {}
models['KNN'] = KNeighborsClassifier()
models['SVM'] = SVC()
models['CART'] = DecisionTreeClassifier()
models['GN'] = GaussianNB()
#models['LR'] = LinearRegression()
#models['Lasso'] = Lasso()
#models['EN'] = ElasticNet()
models['LDA'] = LinearDiscriminantAnalysis()
models['QDA'] = QuadraticDiscriminantAnalysis()
# Evaluate each model with 10-fold cross-validation on the raw features
scoring = 'accuracy'
num_folds = 10
seed = 7
results = []
for key in models:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_results = cross_val_score(models[key], X_train, y_train, scoring=scoring, cv=kfold)
    results.append(cv_results)
    print('%s: %f (%f)' % (key, cv_results.mean(), cv_results.std()))
# Repeat on the rescaled data
results_m = []
for key in models:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_results_m = cross_val_score(models[key], X_m_train, y_m_train, scoring=scoring, cv=kfold)
    results_m.append(cv_results_m)
    print('Rescaled: %s %f (%f)' % (key, cv_results_m.mean(), cv_results_m.std()))
# Repeat on the normalized data
results_n = []
for key in models:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_results_n = cross_val_score(models[key], X_n_train, y_n_train, scoring=scoring, cv=kfold)
    results_n.append(cv_results_n)
    print('Normalized: %s %f (%f)' % (key, cv_results_n.mean(), cv_results_n.std()))
# Repeat on the standardized data
results_s = []
for key in models:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_results_s = cross_val_score(models[key], X_s_train, y_s_train, scoring=scoring, cv=kfold)
    results_s.append(cv_results_s)
    print('Standardized: %s %f (%f)' % (key, cv_results_s.mean(), cv_results_s.std()))
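# The Pipeline imported above is unused in the original script; the sketch
# below is an assumption about how it could be applied here, not part of the
# original post. Chaining the scaler and the model means the scaler is re-fit
# on the training folds inside every cross-validation split, instead of being
# fit once on the full dataset as was done above.
pipeline = Pipeline([('Scaler', StandardScaler()), ('LDA', LinearDiscriminantAnalysis())])
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
print('Pipeline (standardize + LDA): %f' % cross_val_score(pipeline, X_train, y_train, scoring=scoring, cv=kfold).mean())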
# Boxplot comparison of the algorithms
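# The original post left only the comment above; the plotting code below is a
# reconstruction of the usual boxplot comparison, not taken from the post.
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
ax.boxplot(results)  # one box per model, from the raw-feature CV results
ax.set_xticklabels(models.keys())
plt.show()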
# Tune the best-performing algorithm: LDA
param_grid = {'solver': ['svd', 'lsqr', 'eigen']}
model = LinearDiscriminantAnalysis()
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X_train, y_train)
print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))
cv_results = zip(grid_result.cv_results_['mean_test_score'],
                 grid_result.cv_results_['std_test_score'],
                 grid_result.cv_results_['params'])
for mean, std, params in cv_results:
    print('%f (%f) with %r' % (mean, std, params))
# Ensemble methods
# bagging: random forest, extra trees
# boosting: AdaBoost, stochastic gradient boosting
ensembles = {}
ensembles['RF'] = RandomForestClassifier()
ensembles['ET'] = ExtraTreesClassifier()
ensembles['ADA'] = AdaBoostClassifier()
ensembles['GBM'] = GradientBoostingClassifier()
results = []
for key in ensembles:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_results = cross_val_score(ensembles[key], X_train, y_train, scoring=scoring, cv=kfold)
    results.append(cv_results)
    print('%s: %f (%f)' % (key, cv_results.mean(), cv_results.std()))
# Tune the ensemble: GBM
param_grid = {'n_estimators': [10, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900]}
model = GradientBoostingClassifier()
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=kfold, scoring=scoring)
grid_result = grid.fit(X_train, y_train)
print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))
cv_results = zip(grid_result.cv_results_['mean_test_score'],
                 grid_result.cv_results_['std_test_score'],
                 grid_result.cv_results_['params'])
for mean, std, params in cv_results:
    print('%f (%f) with %r' % (mean, std, params))
# Train the final model
model = LinearDiscriminantAnalysis(solver='svd')
model.fit(X_train, y_train)
# Evaluate the final model on the hold-out test set
predictions = model.predict(X_test)
print(accuracy_score(y_test, predictions))
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))