Machine Learning: wine Classification


Data source: http://archive.ics.uci.edu/ml/datasets/Wine

Reference: 《機器學習Python實戰》, by 魏貞原

Purpose of this post: review

Tool: Geany

# Import libraries

from pandas import read_csv                   # read the data
from pandas.plotting import scatter_matrix    # scatter-plot matrix
from pandas import set_option                 # control printed precision

import numpy as np

import matplotlib.pyplot as plt    # plotting

from sklearn.preprocessing import Normalizer        # preprocessing: normalization (unit norm per sample)
from sklearn.preprocessing import StandardScaler    # preprocessing: standardization (zero mean, unit variance)
from sklearn.preprocessing import MinMaxScaler      # preprocessing: rescaling to a fixed range

from sklearn.model_selection import train_test_split    # split off a validation set
from sklearn.model_selection import cross_val_score     # score an algorithm by cross-validation
from sklearn.model_selection import KFold               # k-fold cross-validation
from sklearn.model_selection import GridSearchCV        # hyperparameter tuning by grid search

from sklearn.linear_model import LinearRegression      # linear regression
from sklearn.linear_model import Lasso                 # lasso regression
from sklearn.linear_model import ElasticNet            # elastic-net regression
from sklearn.linear_model import LogisticRegression    # logistic regression

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis       # linear discriminant analysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis    # quadratic discriminant analysis
from sklearn.tree import DecisionTreeRegressor     # decision-tree regression
from sklearn.tree import DecisionTreeClassifier    # decision-tree classification

from sklearn.neighbors import KNeighborsRegressor     # KNN regression
from sklearn.neighbors import KNeighborsClassifier    # KNN classification

from sklearn.naive_bayes import GaussianNB    # Gaussian naive Bayes classifier

from sklearn.svm import SVR    # support vector machine: regression
from sklearn.svm import SVC    # support vector machine: classification

from sklearn.pipeline import Pipeline    # a Pipeline automates the whole flow from data transformation to model evaluation (a sketch follows the imports)

from sklearn.ensemble import RandomForestRegressor         # random forest regression
from sklearn.ensemble import RandomForestClassifier        # random forest classification
from sklearn.ensemble import GradientBoostingRegressor     # stochastic gradient boosting regression
from sklearn.ensemble import GradientBoostingClassifier    # stochastic gradient boosting classification
from sklearn.ensemble import ExtraTreesRegressor           # extra-trees regression
from sklearn.ensemble import ExtraTreesClassifier          # extra-trees classification
from sklearn.ensemble import AdaBoostRegressor             # AdaBoost regression
from sklearn.ensemble import AdaBoostClassifier            # AdaBoost classification

from sklearn.metrics import mean_squared_error       # mean squared error (regression metric)
from sklearn.metrics import accuracy_score           # classification accuracy
from sklearn.metrics import confusion_matrix         # confusion matrix
from sklearn.metrics import classification_report    # classification report
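
Pipeline is imported above but never used in the script. As a minimal, hypothetical sketch (pipe is an illustrative name, not part of the original post), it could chain standardization and LDA so the scaler is re-fit on the training folds only during cross-validation, avoiding leakage; it refers to the training split defined later and is left commented out like the other optional code:

#pipe = Pipeline([('scaler', StandardScaler()), ('lda', LinearDiscriminantAnalysis())])
#cross_val_score(pipe, X_train, Y_train, scoring='accuracy', cv=10)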


# Load the data
filename = 'wine.csv'
data = read_csv(filename, header=None, delimiter=',')

# Understand the data
print(data.shape)
#print(data.dtypes)
#print(data.corr(method='pearson'))
#print(data.describe())
#print(data.groupby(0).size())


# Data visualization: histograms, density plots, scatter-plot matrix, correlation matrix plot

# Histograms

#data.hist()
#plt.show()


# Density plots

#data.plot(kind='density', subplots=True, layout=(4,4), sharex=False, sharey=False)
#plt.show()


# Scatter-plot matrix

#scatter_matrix(data)
#plt.show()


# Correlation matrix plot

#fig = plt.figure()
#ax = fig.add_subplot(111)
#cax = ax.matshow(data.corr(), vmin=-1, vmax=1)
#fig.colorbar(cax)
#plt.show()



# Data preprocessing: rescaling, normalization, standardization, binarization
# (the script applies the first three; a binarization sketch follows below)
array = data.values
X = array[:, 1:14].astype(float)    # columns 1-13: the 13 chemical features
Y = array[:, 0]                     # column 0: the class label (1, 2 or 3)

scaler = MinMaxScaler(feature_range=(0, 1)).fit(X)    # rescale each feature to [0, 1]
X_m = scaler.transform(X)

scaler = Normalizer().fit(X)                          # scale each sample (row) to unit norm
X_n = scaler.transform(X)

scaler = StandardScaler().fit(X)                      # zero mean, unit variance per feature
X_s = scaler.transform(X)
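
The heading above also lists binarization, which the script never applies. A minimal sketch, assuming sklearn's Binarizer and an arbitrary threshold (X_b is an illustrative name, unused below), left commented out like the other optional steps:

#from sklearn.preprocessing import Binarizer
#X_b = Binarizer(threshold=0.0).fit_transform(X)    # map each value to 0/1 around the threshold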

# Split out a validation set
validation_size = 0.2
seed = 7

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=validation_size, random_state=seed)

# split each transformed copy with the same seed so the folds stay comparable
X_m_train, X_m_test, Y_m_train, Y_m_test = train_test_split(X_m, Y, test_size=validation_size, random_state=seed)

X_n_train, X_n_test, Y_n_train, Y_n_test = train_test_split(X_n, Y, test_size=validation_size, random_state=seed)

X_s_train, X_s_test, Y_s_train, Y_s_test = train_test_split(X_s, Y, test_size=validation_size, random_state=seed)

# Model selection (this is a classification problem)
# Nonlinear: KNN, SVC, CART, GaussianNB, QDA
# Linear: LDA
# (LinearRegression, Lasso and ElasticNet are regression models, so they stay commented out.)
models = {}
models['KNN'] = KNeighborsClassifier()
models['SVM'] = SVC()
models['CART'] = DecisionTreeClassifier()
models['GN'] = GaussianNB()
#models['LR'] = LinearRegression()
#models['Lasso'] = Lasso()
#models['EN'] = ElasticNet()
models['LDA'] = LinearDiscriminantAnalysis()
models['QDA'] = QuadraticDiscriminantAnalysis()

# Evaluate the models with 10-fold cross-validation
scoring = 'accuracy'
num_folds = 10
seed = 7

results = []
for key in models:
    # shuffle=True is required when random_state is set (newer scikit-learn enforces this)
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_results = cross_val_score(models[key], X_train, Y_train, scoring=scoring, cv=kfold)
    results.append(cv_results)
    print('%s %f (%f)' % (key, cv_results.mean(), cv_results.std()))

results_m = []
for key in models:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_results_m = cross_val_score(models[key], X_m_train, Y_m_train, scoring=scoring, cv=kfold)
    results_m.append(cv_results_m)
    print('Rescaled: %s %f (%f)' % (key, cv_results_m.mean(), cv_results_m.std()))

results_n = []
for key in models:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_results_n = cross_val_score(models[key], X_n_train, Y_n_train, scoring=scoring, cv=kfold)
    results_n.append(cv_results_n)
    print('Normalized: %s %f (%f)' % (key, cv_results_n.mean(), cv_results_n.std()))

results_s = []
for key in models:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_results_s = cross_val_score(models[key], X_s_train, Y_s_train, scoring=scoring, cv=kfold)
    results_s.append(cv_results_s)
    print('Standardized: %s %f (%f)' % (key, cv_results_s.mean(), cv_results_s.std()))
# Box plot
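
The original post leaves the box plot as a bare comment. A minimal sketch in the same style as the other (commented-out) plots, comparing the cross-validation accuracy distributions on the raw data, could be:

#fig = plt.figure()
#fig.suptitle('Algorithm Comparison')
#ax = fig.add_subplot(111)
#ax.boxplot(results)
#ax.set_xticklabels(list(models.keys()))
#plt.show()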

# Algorithm tuning: LDA
param_grid = {'solver': ['svd', 'lsqr', 'eigen']}
model = LinearDiscriminantAnalysis()
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X=X_train, y=Y_train)
print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))
cv_results = zip(grid_result.cv_results_['mean_test_score'], grid_result.cv_results_['std_test_score'], grid_result.cv_results_['params'])
for mean, std, params in cv_results:
    print('%f (%f) with %r' % (mean, std, params))


# Ensemble methods
# bagging: random forest, extra trees
# boosting: AdaBoost, gradient boosting
ensembles = {}
ensembles['RF'] = RandomForestClassifier()
ensembles['ET'] = ExtraTreesClassifier()
ensembles['ADA'] = AdaBoostClassifier()
ensembles['GBM'] = GradientBoostingClassifier()

results = []
for key in ensembles:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_results = cross_val_score(ensembles[key], X_train, Y_train, scoring=scoring, cv=kfold)
    results.append(cv_results)
    print('%s %f (%f)' % (key, cv_results.mean(), cv_results.std()))

# Tune the ensemble: GBM
param_grid = {'n_estimators': [10, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900]}
model = GradientBoostingClassifier()
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=kfold, scoring=scoring)
grid_result = grid.fit(X=X_train, y=Y_train)
print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))
cv_results = zip(grid_result.cv_results_['mean_test_score'], grid_result.cv_results_['std_test_score'], grid_result.cv_results_['params'])
for mean, std, params in cv_results:
    print('%f (%f) with %r' % (mean, std, params))

# Train the final model
model = LinearDiscriminantAnalysis(solver='svd')
model.fit(X=X_train, y=Y_train)

# Evaluate the final model on the held-out validation set
predictions = model.predict(X_test)
print(accuracy_score(Y_test, predictions))
print(confusion_matrix(Y_test, predictions))
print(classification_report(Y_test, predictions))
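
As a quick usage check (not part of the original script), the fitted model can classify a new feature vector; a held-out row stands in for a fresh sample here, since inventing plausible wine-chemistry values would be arbitrary:

new_sample = X_test[0].reshape(1, -1)    # one sample, shaped (1, 13)
print(model.predict(new_sample))         # predicted class label (1, 2 or 3)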


