ID3 algorithm
from math import log
import operator

# Define a small sample data set
def createDataSet():
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels

# Compute the expected value (Shannon entropy) of the sample set
def calcShannonEnt(dataSet):
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:  # count the occurrences of each class label
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)  # log base 2
    return shannonEnt

# Return the rows whose value in column `axis` equals `value`, with that
# column removed; used when computing a feature's expected entropy
def splitDataSet(dataSet, axis, value):
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]  # chop out the axis used for splitting
            reducedFeatVec.extend(featVec[axis+1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet

# Compute the expected entropy of splitting on each feature and pick the
# feature with the maximum information gain
def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1  # the last column holds the labels
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):  # iterate over all the features
        featList = [example[i] for example in dataSet]  # all values of this feature
        uniqueVals = set(featList)  # the set of unique values
        newEntropy = 0.0
        for value in uniqueVals:  # expected entropy over all values of this feature
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy  # info gain, i.e. reduction in entropy
        if infoGain > bestInfoGain:  # compare to the best gain so far
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature  # returns the index of the best feature

# If a leaf still contains several classes, use a voting mechanism to take
# the most frequent one
def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

# Build the tree recursively
def createTree(dataSet, labels):
    # check whether the data set contains only a single class
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        return classList[0]  # every sample has the same class: return it
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)  # no features left: take the majority
    bestFeat = chooseBestFeatureToSplit(dataSet)  # pick the best feature
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}  # a dict keyed on that feature
    del(labels[bestFeat])  # remove the used feature
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)  # the possible choices under this feature
    for value in uniqueVals:
        subLabels = labels[:]  # copy, so recursion does not mangle the list
        # build a different branch for each choice
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree

# Decision tree classifier: inputTree is the decision tree, featLabels are
# the column names, testVec is the vector to classify
def classify(inputTree, featLabels, testVec):
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    key = testVec[featIndex]
    valueOfFeat = secondDict[key]
    if isinstance(valueOfFeat, dict):
        classLabel = classify(valueOfFeat, featLabels, testVec)
    else:
        classLabel = valueOfFeat
    return classLabel

# Serialize the decision tree to disk
def storeTree(inputTree, filename):
    import pickle
    fw = open(filename, 'wb')  # pickle needs a binary-mode file
    pickle.dump(inputTree, fw)
    fw.close()

# Load a decision tree from disk
def grabTree(filename):
    import pickle
    fr = open(filename, 'rb')
    return pickle.load(fr)
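calcShannonEnt implements the usual Shannon entropy H = -Σ p_i * log2(p_i) over the class labels. As a quick sanity check, the sample set above has 2 'yes' and 3 'no' labels out of 5 rows, so its entropy works out to:

    from math import log
    p_yes, p_no = 2 / 5.0, 3 / 5.0
    print(-p_yes * log(p_yes, 2) - p_no * log(p_no, 2))  # 0.9709505944546686

which is what calcShannonEnt(dataSet) should return.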
The input matrix is a table: each row is a sample, and the last column of each row is the result (the class label).
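Putting the pieces together, here is a minimal usage sketch of the functions above ('tree.pkl' is just an example filename; the printed values are what the sample set should produce):

    dataSet, labels = createDataSet()
    tree = createTree(dataSet, labels[:])  # pass a copy: createTree mutates labels
    print(tree)   # {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
    print(classify(tree, labels, [1, 0]))  # 'no'
    storeTree(tree, 'tree.pkl')
    print(grabTree('tree.pkl') == tree)    # True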
ID3 is a rough algorithm: it can classify symbolic (categorical) variables, but it cannot handle numerical data directly.
In addition, when choosing the feature to split on, information gain is biased toward features with many distinct values, so the selection criterion itself needs to be improved, as sketched below.
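A common remedy, used by C4.5, is to divide the information gain by the "split information" of the feature, which penalizes features with many distinct values. Here is a minimal sketch under that assumption, reusing log, splitDataSet, and calcShannonEnt from above (chooseBestFeatureByGainRatio is my own name, not part of the original code):

    def chooseBestFeatureByGainRatio(dataSet):
        numFeatures = len(dataSet[0]) - 1
        baseEntropy = calcShannonEnt(dataSet)
        bestGainRatio, bestFeature = 0.0, -1
        for i in range(numFeatures):
            uniqueVals = set(example[i] for example in dataSet)
            newEntropy, splitInfo = 0.0, 0.0
            for value in uniqueVals:
                subDataSet = splitDataSet(dataSet, i, value)
                prob = len(subDataSet) / float(len(dataSet))
                newEntropy += prob * calcShannonEnt(subDataSet)
                splitInfo -= prob * log(prob, 2)  # entropy of the split itself
            if splitInfo == 0.0:  # feature has a single value; splitting is useless
                continue
            gainRatio = (baseEntropy - newEntropy) / splitInfo
            if gainRatio > bestGainRatio:
                bestGainRatio, bestFeature = gainRatio, i
        return bestFeature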