"Machine Learning Combat" Chapter III decision Tree
-------------------------------------
#1 trees.py: calculate the Shannon entropy of a given data set
-------------------------------------
from math import log

# calculate the Shannon entropy of a given data set
def calcShannonEnt(dataSet):
    numEntries = len(dataSet)
    labelCounts = {}
    # build a dictionary counting every possible class label
    for featVec in dataSet:
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts.keys():
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)  # logarithm taken base 2
    return shannonEnt

# build a simple fish-identification data set
def createDataSet():
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels
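As a quick sanity check, the five-sample fish data set holds two 'yes' and three 'no' labels, so its entropy should be -(2/5)*log2(2/5) - (3/5)*log2(3/5), about 0.971 bits. A minimal interpreter session, assuming the functions above are saved in trees.py:

>>> import trees
>>> myDat, labels = trees.createDataSet()
>>> trees.calcShannonEnt(myDat)
0.9709505944546686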
-------------------------------------
#2 trees.py: split a data set on a given feature; the parameters are the data set to split, the feature (axis) to split on, and the feature value to match
-------------------------------------
# split a data set on a given feature: parameters are the data set to split,
# the feature (axis) to split on, and the feature value to match
def splitDataSet(dataSet, axis, value):
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            # cut out the feature that was used for splitting
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet
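For example, splitting the fish data set on feature 0 keeps only the rows whose first value matches, with that column removed. A quick session, again assuming the code lives in trees.py:

>>> import trees
>>> myDat, labels = trees.createDataSet()
>>> trees.splitDataSet(myDat, 0, 1)
[[1, 'yes'], [1, 'yes'], [0, 'no']]
>>> trees.splitDataSet(myDat, 0, 0)
[[1, 'no'], [1, 'no']]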
-------------------------------------
#3 trees.py: choose the best feature to split the data set on
-------------------------------------
# choose the best feature to split the data set on (highest information gain)
def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1   # the last column holds the class label
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]
        uniqueVals = set(featList)
        newEntropy = 0.0
        # entropy after the split, weighted by the size of each subset
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
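On the fish data set this should pick feature 0 ('no surfacing'): splitting on it gains about 0.42 bits, versus roughly 0.17 bits for 'flippers'. A quick check under the same trees.py assumption:

>>> import trees
>>> myDat, labels = trees.createDataSet()
>>> trees.chooseBestFeatureToSplit(myDat)
0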
-------------------------------------
#4 trees.py: the tree-building function; two parameters: the data set and the list of feature labels
-------------------------------------
import operator

# helper from the book: return the class label that occurs most often in classList
def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount.keys():
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

# build the tree; two parameters: the data set and the list of feature labels
def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]

    # stop splitting when all the classes are identical
    if classList.count(classList[0]) == len(classList):
        return classList[0]

    # when all features have been used up, return the majority class
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)

    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    del(labels[bestFeat])

    # collect every value the chosen feature takes in the data set
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)

    # iterate over the values of the chosen feature, recursively calling
    # createTree() on each partition of the data set
    for value in uniqueVals:
        subLabels = labels[:]   # copy so recursion does not mangle the label list
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)

    return myTree
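Putting it all together, the fish data set should yield the nested-dictionary tree below. Note that createTree() deletes entries from labels as it recurses, so pass in a copy if you need the list afterwards. A sketch, once more assuming trees.py:

>>> import trees
>>> myDat, labels = trees.createDataSet()
>>> trees.createTree(myDat, labels)
{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}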