from math import log
import operator


def calcShannonEnt(dataSet):
    """Return the Shannon entropy of *dataSet*.

    Each record is a list whose LAST element is the class label; entropy
    is computed over the distribution of those labels (log base 2).
    """
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        # Count occurrences of each class label.
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)  # H = -sum(p * log2 p)
    return shannonEnt


def createDataSet():
    """Return the toy fish data set and its feature labels.

    Features: [no surfacing, flippers]; last column is the class ('yes'/'no').
    """
    # Reconstructed from the book's canonical data: the extracted copy had a
    # corrupted literal (empty row, rows of the wrong length).
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels


def splitDataSet(dataSet, axis, value):
    """Return the records whose feature at index *axis* equals *value*,
    with that feature column removed from each returned record."""
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            # Splice out column `axis` without mutating the original record.
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet


def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature giving the largest information gain,
    or -1 if no split improves on the base entropy."""
    numFeatures = len(dataSet[0]) - 1  # last column is the class label
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        uniqueVals = set(example[i] for example in dataSet)  # distinct values of feature i
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            # Weighted entropy of the partition induced by feature i.
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature


def majorityCnt(classList):
    """Return the most frequent class label in *classList* (majority vote)."""
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    # Python 3 fix: dict.iteritems() was removed; dict.items() is equivalent.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def createTree(dataSet, labels):
    """Recursively build an ID3 decision tree.

    Returns either a class label (leaf) or a nested dict
    {featureLabel: {featureValue: subtree, ...}}.
    """
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        return classList[0]  # all labels identical: pure leaf, stop splitting
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)  # no features left: majority-vote leaf
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    # Bug fix: work on a copy so the caller's label list is not mutated
    # (the original `del labels[bestFeat]` destroyed the caller's list).
    labels = labels[:]
    del labels[bestFeat]
    uniqueVals = set(example[bestFeat] for example in dataSet)
    for value in uniqueVals:
        subLabels = labels[:]  # fresh copy per branch for the recursion
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree
# "Machine Learning in Action" — Chapter 3: decision trees (ID3)