After finally working through the decision tree construction and test code, I found it very rewarding, so I would like to summarize what decision trees are about. Put bluntly, a decision tree uses the known attributes of things to make classification decisions about them. How the data is split was described in the previous article, so I will not repeat it here; what the previous section did not show is how to use the tree we built to classify newly added data. Here is the classification code:
def classify(inputTree, featLabels, testVec):
    # walk the tree from the root until a leaf (non-dict) node is reached
    firstStr = list(inputTree.keys())[0]        # feature tested at this node
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)      # position of that feature in testVec
    for key in secondDict.keys():
        if testVec[featIndex] == key:
            if type(secondDict[key]).__name__ == 'dict':
                # internal node: recurse down the matching branch
                classLabel = classify(secondDict[key], featLabels, testVec)
            else:
                # leaf node: the stored value is the class label
                classLabel = secondDict[key]
    return classLabel
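To make the traversal concrete, here is a small sketch of classify at work. The dict literal below is the tree that createTree produces for the sample dataset used throughout this series, written out by hand:

myTree = {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
labels = ['no surfacing', 'flippers']
print(classify(myTree, labels, [1, 0]))  # 'no'  -- surfaces but has no flippers
print(classify(myTree, labels, [1, 1]))  # 'yes' -- surfaces and has flippers

classify reads the feature name at the root ('no surfacing'), looks up its column in testVec via featLabels, follows the branch whose key matches that value, and repeats until the stored value is a plain label rather than another dict.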
So this test code completes the classification decision for the given data: in essence, it traverses the tree from the root until it reaches a leaf node. To be safe, I am also giving the complete source code here, so you can run it, experiment with it, and adapt it into your own code without going back to the previous articles:

import math
import operator


def calcShannonEnt(dataSet):
    # Shannon entropy of the class labels in dataSet
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts.keys():
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * math.log(prob, 2)
    return shannonEnt


def CreateDataSet():
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels


def splitDataSet(dataSet, axis, value):
    # rows whose feature `axis` equals `value`, with that feature removed
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet


def chooseBestFeatureToSplit(dataSet):
    # pick the feature with the highest information gain
    numFeatures = len(dataSet[0]) - 1
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]
        uniqueVals = set(featList)
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature


def majorityCnt(classList):
    # majority vote among the remaining class labels
    classCount = {}
    for vote in classList:
        if vote not in classCount.keys():
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def createTree(dataSet, inputLabels):
    labels = inputLabels[:]  # copy so the caller's labels list is not modified
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        return classList[0]  # all labels identical: leaf node
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)  # no features left: majority vote
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    del labels[bestFeat]
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree


def classify(inputTree, featLabels, testVec):
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    for key in secondDict.keys():
        if testVec[featIndex] == key:
            if type(secondDict[key]).__name__ == 'dict':
                classLabel = classify(secondDict[key], featLabels, testVec)
            else:
                classLabel = secondDict[key]
    return classLabel


myDat, labels = CreateDataSet()
print(calcShannonEnt(myDat))
print(splitDataSet(myDat, 1, 1))
print(chooseBestFeatureToSplit(myDat))
myTree = createTree(myDat, labels)
print(classify(myTree, labels, [1, 0]))
print(classify(myTree, labels, [1, 1]))
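For reference, running the script should print roughly the following (a sketch of the expected output). The first value follows from the entropy formula: the sample set holds 2 'yes' and 3 'no' labels, so H = -(2/5)*log2(2/5) - (3/5)*log2(3/5) ≈ 0.9710:

0.9709505944546686                               # calcShannonEnt(myDat)
[[1, 'yes'], [1, 'yes'], [0, 'no'], [0, 'no']]   # splitDataSet(myDat, 1, 1)
0                                                # best feature: 'no surfacing'
no                                               # classify(myTree, labels, [1, 0])
yes                                              # classify(myTree, labels, [1, 1])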
And with that, our decision tree practice is complete. I wish you a pleasant learning experience.