# Decision Tree
```python
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
import zipfile

# Reading straight from the zip archive saves disk space:
# z = zipfile.ZipFile('ad-dataset.zip')
# df = pd.read_csv(z.open(z.namelist()[0]), header=None, low_memory=False)
df = pd.read_csv('.\\tree_data\\ad.data', header=None)

explanatory_variable_columns = set(df.columns.values)
response_variable_column = df[len(df.columns.values) - 1]
# The last column holds the class label
explanatory_variable_columns.remove(len(df.columns.values) - 1)

y = [1 if e == 'ad.' else 0 for e in response_variable_column]
X = df.loc[:, list(explanatory_variable_columns)]
# Match '?' characters (with optional leading spaces) and replace them with -1
X.replace(to_replace=r' *\?', value=-1, regex=True, inplace=True)

X_train, X_test, y_train, y_test = train_test_split(X, y)
# Grow the decision tree with the information-gain (entropy) heuristic
pipeline = Pipeline([('clf', DecisionTreeClassifier(criterion='entropy'))])
parameters = {
    'clf__max_depth': (150, 155, 160),
    'clf__min_samples_split': (1, 2, 3),
    'clf__min_samples_leaf': (1, 2, 3)
}
# F1, the harmonic mean of precision and recall, is the scoring metric
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='f1')
grid_search.fit(X_train, y_train)
print('Best score: %0.3f' % grid_search.best_score_)
print('Best parameters:')
best_parameters = grid_search.best_estimator_.get_params()
best_parameters  # displayed as Out[123] below
```
Output:

```
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done ... tasks      | elapsed: 21.0s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed: 34.7s finished
Best score: 0.888
Best parameters:

Out[123]:
{'clf': DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=160,
             max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
             min_samples_split=3, min_weight_fraction_leaf=0.0, presort=False,
             random_state=None, splitter='best'),
 'clf__class_weight': None,
 'clf__criterion': 'entropy',
 'clf__max_depth': 160,
 'clf__max_features': None,
 'clf__max_leaf_nodes': None,
 'clf__min_samples_leaf': 1,
 'clf__min_samples_split': 3,
 'clf__min_weight_fraction_leaf': 0.0,
 'clf__presort': False,
 'clf__random_state': None,
 'clf__splitter': 'best',
 'steps': [('clf', DecisionTreeClassifier(class_weight=None, criterion='entropy',
             max_depth=160, max_features=None, max_leaf_nodes=None,
             min_samples_leaf=1, min_samples_split=3,
             min_weight_fraction_leaf=0.0, presort=False, random_state=None,
             splitter='best'))]}
```
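The `criterion='entropy'` setting visible in these parameters means each split is chosen to maximize information gain, i.e. the reduction in Shannon entropy H = -Σ p_i log2(p_i) of the class labels at a node. A minimal sketch of that computation (the `entropy` helper below is illustrative, not part of scikit-learn's public API):

```python
import math

def entropy(counts):
    """Shannon entropy, in bits, of a vector of class counts."""
    total = float(sum(counts))
    return -sum((c / total) * math.log(c / total, 2) for c in counts if c)

# For example, a node containing 703 negative and 117 positive samples
# (the class balance of the test split reported below) is fairly impure:
print(entropy([703, 117]))  # ~0.59 bits; a pure node would score 0.0
```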
```python
for param_name in sorted(parameters.keys()):
    print('\t%s: %r' % (param_name, best_parameters[param_name]))

predictions = grid_search.predict(X_test)
print(classification_report(y_test, predictions))
```
Output:

```
	clf__max_depth: 150
	clf__min_samples_leaf: 1
	clf__min_samples_split: 1

             precision    recall  f1-score   support

          0       0.97      0.99      0.98       703
          1       0.91      0.84      0.87       117

avg / total       0.96      0.96      0.96       820
```
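The report lists precision, recall, and F1 per class. Since the grid search optimizes `scoring='f1'`, it is worth recalling that F1 is the harmonic mean 2PR/(P+R) of precision P and recall R; a quick self-contained check with toy labels (made up for illustration, not drawn from the ad dataset):

```python
from sklearn.metrics import precision_score, recall_score, f1_score

# Toy labels purely to illustrate the relationship
y_true = [1, 1, 1, 0, 0, 0]
y_pred = [1, 1, 0, 1, 0, 0]

p = precision_score(y_true, y_pred)  # 2 of 3 predicted positives are correct: 0.667
r = recall_score(y_true, y_pred)     # 2 of 3 actual positives are found: 0.667
print(f1_score(y_true, y_pred))      # 0.667
print(2 * p * r / (p + r))           # same value, computed by hand
```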
```python
df.head()
```
Output:

|   | 0   | 1   | 2      | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 1549 | 1550 | 1551 | 1552 | 1553 | 1554 | 1555 | 1556 | 1557 | 1558 |
|---|-----|-----|--------|---|---|---|---|---|---|---|-----|------|------|------|------|------|------|------|------|------|------|
| 0 | 125 | 125 | 1.0    | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0    | 0    | 0    | 0    | 0    | 0    | 0    | 0    | 0    | ad.  |
| 1 | 57  | 468 | 8.2105 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0    | 0    | 0    | 0    | 0    | 0    | 0    | 0    | 0    | ad.  |
| 2 | 33  | 230 | 6.9696 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0    | 0    | 0    | 0    | 0    | 0    | 0    | 0    | 0    | ad.  |
| 3 | 60  | 468 | 7.8    | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0    | 0    | 0    | 0    | 0    | 0    | 0    | 0    | 0    | ad.  |
| 4 | 60  | 468 | 7.8    | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0    | 0    | 0    | 0    | 0    | 0    | 0    | 0    | 0    | ad.  |
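As the table shows, most of the 1559 features are 0/1 indicators; in the raw file, missing values in the leading continuous columns (height, width, aspect ratio) are encoded as `?` strings, often padded with spaces, which is exactly what the regex replacement in the code above converts to -1. A toy demonstration of that step (made-up values, not rows from ad.data):

```python
import pandas as pd

toy = pd.DataFrame({'height': ['125', '   ?', '60'],
                    'ratio':  ['1.0', '8.2105', '?']})
# Cells matching optional spaces followed by '?' are replaced wholesale by -1
toy.replace(to_replace=r' *\?', value=-1, regex=True, inplace=True)
print(toy)
```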
# Decision Tree Ensembles (Random Forest)
```python
# coding: utf-8
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV

df = pd.read_csv('.\\tree_data\\ad.data', header=None, low_memory=False)
explanatory_variable_columns = set(df.columns.values)
response_variable_column = df[len(df.columns.values) - 1]
```
```python
df.head()
```
Output: identical to the `df.head()` table shown in the previous section.
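Before tuning it, recall what `RandomForestClassifier` does: it trains many decision trees, each on a bootstrap sample of the data and with a random subset of features considered at every split, then classifies by majority vote. A hand-rolled sketch of that idea on made-up data (the `forest_predict` helper is hypothetical, purely for illustration):

```python
import numpy as np
from sklearn.tree import DecisionTreeClassifier

def forest_predict(trees, X):
    """Majority vote over the per-tree predictions."""
    votes = np.array([t.predict(X) for t in trees])
    return (votes.mean(axis=0) >= 0.5).astype(int)

rng = np.random.RandomState(0)
X_toy = rng.rand(100, 5)                 # made-up feature matrix
y_toy = (X_toy[:, 0] > 0.5).astype(int)  # made-up labels

trees = []
for _ in range(10):
    rows = rng.randint(0, len(X_toy), len(X_toy))       # bootstrap sample (with replacement)
    tree = DecisionTreeClassifier(max_features='sqrt')  # random feature subset per split
    trees.append(tree.fit(X_toy[rows], y_toy[rows]))

print(forest_predict(trees, X_toy[:5]))
```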
```python
# The last column holds the targets, so drop it from the explanatory variables
explanatory_variable_columns.remove(len(df.columns.values) - 1)
y = [1 if e == 'ad.' else 0 for e in response_variable_column]
X = df.loc[:, list(explanatory_variable_columns)]
# Replace '?' values with -1
X.replace(to_replace=r' *\?', value=-1, regex=True, inplace=True)

X_train, X_test, y_train, y_test = train_test_split(X, y)
pipeline = Pipeline([('clf', RandomForestClassifier(criterion='entropy'))])
parameters = {
    'clf__n_estimators': (5, 10, 20, 50),
    'clf__max_depth': (50, 150, 250),
    'clf__min_samples_split': (1, 2, 3),
    'clf__min_samples_leaf': (1, 2, 3)
}
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='f1')
grid_search.fit(X_train, y_train)
```
```python
print('Best score: %0.3f' % grid_search.best_score_)
print('Best parameters:')
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print('\t%s: %r' % (param_name, best_parameters[param_name]))
```
Output:

```
Best score: 0.929
Best parameters:
	clf__max_depth: 250
	clf__min_samples_leaf: 1
	clf__min_samples_split: 3
	clf__n_estimators: 50
```
```python
predictions = grid_search.predict(X_test)
print(classification_report(y_test, predictions))
```
Output:

```
             precision    recall  f1-score   support

          0       0.98      1.00      0.99       705
          1       0.97      0.90      0.93       115

avg / total       0.98      0.98      0.98       820
```
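Two closing notes. First, the forest's grid-search F1 of 0.929 and test-set average of 0.98 improve on the single tree's 0.888 and 0.96, at the usual cost of interpretability. Second, the `sklearn.cross_validation` and `sklearn.grid_search` modules used throughout were removed in scikit-learn 0.20; under current versions the same workflow starts from `sklearn.model_selection` instead, roughly as sketched below (a sketch, not the original code; note that `min_samples_split=1` is also no longer accepted):

```python
# Equivalent imports and grid for scikit-learn >= 0.20
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

pipeline = Pipeline([('clf', RandomForestClassifier(criterion='entropy'))])
parameters = {
    'clf__n_estimators': (5, 10, 20, 50),
    'clf__max_depth': (50, 150, 250),
    'clf__min_samples_split': (2, 3),   # 1 is invalid in newer releases
    'clf__min_samples_leaf': (1, 2, 3),
}
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='f1')
```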