"""Compare KNN, decision-tree and Gaussian naive-Bayes classifiers.

Feature/label files A-D are used for training and file E for testing; a
classification report is printed for each classifier.
"""
import numpy as np
import pandas as pd
# SimpleImputer replaces sklearn.preprocessing.Imputer, which was removed
# from scikit-learn.
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
# Retained from the original import list; no longer used by this script.
from sklearn.model_selection import train_test_split  # noqa: F401
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import shuffle


def loadDataSet(feature_paths, label_paths):
    """Load feature and label files and stack them into NumPy arrays.

    Args:
        feature_paths: Paths of comma-separated, header-less feature files.
            Each file must have 41 columns; ``?`` marks a missing value.
        label_paths: Paths of header-less, single-column label files.

    Returns:
        A ``(feature, label)`` tuple: ``feature`` is a 2-D array holding the
        rows of every feature file, with missing values replaced by the mean
        of their column within the same file; ``label`` is the 1-D array of
        all labels in file order.
    """
    # Empty seed arrays fix the expected column counts (41 features, 1 label).
    feature = np.empty(shape=(0, 41))
    label = np.empty(shape=(0, 1))

    for path in feature_paths:
        # '?' entries are parsed as NaN so the imputer can find them.
        df = pd.read_csv(path, na_values='?', header=None)
        # fit() learns each column's mean; transform() writes that mean into
        # the column's NaN cells. fit_transform() does both in one call.
        imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
        feature = np.concatenate((feature, imputer.fit_transform(df)))

    for path in label_paths:
        df = pd.read_table(path, header=None)
        label = np.concatenate((label, df))

    # Flatten the (n, 1) label column into the 1-D vector estimators expect.
    return feature, np.ravel(label)


def main():
    """Train the three classifiers on sets A-D and report results on set E."""
    feature_paths = ['A.feature', 'B.feature', 'C.feature', 'D.feature',
                     'E.feature']
    label_paths = ['A.label', 'B.label', 'C.label', 'D.label', 'E.label']

    # First four data sets are for training, the last one for testing.
    x_train, y_train = loadDataSet(feature_paths[:4], label_paths[:4])
    x_test, y_test = loadDataSet(feature_paths[4:], label_paths[4:])

    # Shuffle the training rows while keeping features and labels aligned.
    # The original did this with train_test_split(test_size=0.0), which
    # current scikit-learn rejects with a ValueError.
    x_train, y_train = shuffle(x_train, y_train)

    # (name used in the training banner, name used in the report, estimator)
    classifiers = [
        ('knn', 'knn', KNeighborsClassifier()),
        ('DecisionTreeClassifier', 'DT', DecisionTreeClassifier()),
        ('Bayes', 'Bayes', GaussianNB()),
    ]

    predictions = []
    for train_name, report_name, estimator in classifiers:
        print('start trainning ' + train_name + '……')
        estimator.fit(x_train, y_train)
        print('I have already trainning the data you just gave me!')
        predictions.append((report_name, estimator.predict(x_test)))
        print('Prediction done!')

    # classification_report summarises precision, recall, f1-score and
    # support for every class.
    for report_name, answer in predictions:
        print('\n\nThe classification report for ' + report_name + ':')
        print(classification_report(y_test, answer))


if __name__ == '__main__':
    main()
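# Note: the original author was unsure how the imputer's transform() works.
# In scikit-learn, fit() learns a statistic from the data (here, each column's
# mean) and transform() applies it (here, filling missing values with that
# mean). It is unrelated to pandas' group-wise transform().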