ID3 Decision Tree Algorithm Implementation (Python)


# -*- coding: utf-8 -*-

import operator
from math import log

import pandas as pd

# Compute the Shannon entropy of a data set
def calcShannonEnt(dataSet):
    numEntries = len(dataSet)
    labelCounts = {}
    # Count the occurrences of every class label
    for featVec in dataSet:
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    # Entropy uses base-2 logarithms
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt

# Split on a discrete feature: keep the samples whose feature at `axis`
# equals `value`, with that feature column removed
def splitDataSet(dataSet, axis, value):
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet

# Split on a continuous feature. `direction` selects the side of the split:
# 0 keeps samples with feature > value, 1 keeps samples with feature <= value
def splitContinuousDataSet(dataSet, axis, value, direction):
    retDataSet = []
    for featVec in dataSet:
        if direction == 0:
            if featVec[axis] > value:
                reducedFeatVec = featVec[:axis]
                reducedFeatVec.extend(featVec[axis + 1:])
                retDataSet.append(reducedFeatVec)
        else:
            if featVec[axis] <= value:
                reducedFeatVec = featVec[:axis]
                reducedFeatVec.extend(featVec[axis + 1:])
                retDataSet.append(reducedFeatVec)
    return retDataSet

# Choose the best feature to split on (maximum information gain)
def chooseBestFeatureToSplit(dataSet, labels):
    numFeatures = len(dataSet[0]) - 1
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    bestSplitDict = {}
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]
        # Continuous feature
        if type(featList[0]).__name__ == 'float' or type(featList[0]).__name__ == 'int':
            # Generate n-1 candidate split points: the midpoints of adjacent sorted values
            sortfeatList = sorted(featList)
            splitList = []
            for j in range(len(sortfeatList) - 1):
                splitList.append((sortfeatList[j] + sortfeatList[j + 1]) / 2.0)
            bestSplitEntropy = 10000
            slen = len(splitList)
            # Evaluate the entropy of each candidate split point and record the best one
            for j in range(slen):
                value = splitList[j]
                newEntropy = 0.0
                subDataSet0 = splitContinuousDataSet(dataSet, i, value, 0)
                subDataSet1 = splitContinuousDataSet(dataSet, i, value, 1)
                prob0 = len(subDataSet0) / float(len(dataSet))
                newEntropy += prob0 * calcShannonEnt(subDataSet0)
                prob1 = len(subDataSet1) / float(len(dataSet))
                newEntropy += prob1 * calcShannonEnt(subDataSet1)
                if newEntropy < bestSplitEntropy:
                    bestSplitEntropy = newEntropy
                    bestSplit = j
            # Record the best split point for the current feature
            bestSplitDict[labels[i]] = splitList[bestSplit]
            infoGain = baseEntropy - bestSplitEntropy
        # Discrete feature
        else:
            uniqueVals = set(featList)
            newEntropy = 0.0
            # Weighted entropy over every partition induced by this feature
            for value in uniqueVals:
                subDataSet = splitDataSet(dataSet, i, value)
                prob = len(subDataSet) / float(len(dataSet))
                newEntropy += prob * calcShannonEnt(subDataSet)
            infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    # If the best feature is continuous, binarize it at the recorded split point,
    # i.e. replace each value with whether it is <= bestSplitValue
    if type(dataSet[0][bestFeature]).__name__ == 'float' or type(dataSet[0][bestFeature]).__name__ == 'int':
        bestSplitValue = bestSplitDict[labels[bestFeature]]
        labels[bestFeature] = labels[bestFeature] + '<=' + str(bestSplitValue)
        for i in range(len(dataSet)):
            if dataSet[i][bestFeature] <= bestSplitValue:
                dataSet[i][bestFeature] = 1
            else:
                dataSet[i][bestFeature] = 0
    return bestFeature
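The quantity being maximized above is the information gain Gain(D, a) = Ent(D) - Σ_v (|D_v| / |D|) · Ent(D_v), with Ent(D) = -Σ_k p_k log2 p_k, in the notation of Zhou Zhihua's book. As a quick sanity check, here is a minimal run on a hypothetical four-sample toy set (the values are invented for illustration):

# Hypothetical toy data: two discrete features, class label in the last column.
toySet = [['distinct', 'hard_smooth', 'good'],
          ['distinct', 'soft_stick', 'good'],
          ['blur', 'hard_smooth', 'bad'],
          ['blur', 'soft_stick', 'bad']]

print(calcShannonEnt(toySet))           # 1.0 -- two classes, evenly split
print(splitDataSet(toySet, 0, 'blur'))  # [['hard_smooth', 'bad'], ['soft_stick', 'bad']]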
# If all features are exhausted but the samples at a node still carry mixed
# class labels, decide by majority vote
def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    # Return the most frequent class label
    return max(classCount.items(), key=operator.itemgetter(1))[0]

# Main routine: recursively grow the decision tree
def createTree(dataSet, labels, data_full, labels_full):
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet, labels)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    if type(dataSet[0][bestFeat]).__name__ == 'str':
        currentlabel = labels_full.index(labels[bestFeat])
        featValuesFull = [example[currentlabel] for example in data_full]
        uniqueValsFull = set(featValuesFull)
    del(labels[bestFeat])
    # Grow one subtree for each observed value of bestFeat
    for value in uniqueVals:
        subLabels = labels[:]
        if type(dataSet[0][bestFeat]).__name__ == 'str':
            uniqueValsFull.remove(value)
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels, data_full, labels_full)
    # Feature values present in the full data set but absent from this subset
    # become majority-vote leaves
    if type(dataSet[0][bestFeat]).__name__ == 'str':
        for value in uniqueValsFull:
            myTree[bestFeatLabel][value] = majorityCnt(classList)
    return myTree
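Continuing the hypothetical toy set from above, a minimal end-to-end run of createTree looks like this (copies are passed because createTree mutates dataSet and labels; key order in the printed dict may vary):

toySet = [['distinct', 'hard_smooth', 'good'], ['distinct', 'soft_stick', 'good'],
          ['blur', 'hard_smooth', 'bad'], ['blur', 'soft_stick', 'bad']]
toyLabels = ['texture', 'touch']
print(createTree([row[:] for row in toySet], toyLabels[:], toySet, toyLabels))
# {'texture': {'distinct': 'good', 'blur': 'bad'}}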
import matplotlib.pyplot as plt

decisionNode = dict(boxstyle="sawtooth", fc="0.8")
leafNode = dict(boxstyle="round4", fc="0.8")
arrow_args = dict(arrowstyle="<-")

# Count the leaf nodes of the tree
def getNumLeafs(myTree):
    numLeafs = 0
    firstSides = list(myTree.keys())
    firstStr = firstSides[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':
            numLeafs += getNumLeafs(secondDict[key])
        else:
            numLeafs += 1
    return numLeafs

# Compute the maximum depth of the tree
def getTreeDepth(myTree):
    maxDepth = 0
    firstSides = list(myTree.keys())
    firstStr = firstSides[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':
            thisDepth = 1 + getTreeDepth(secondDict[key])
        else:
            thisDepth = 1
        if thisDepth > maxDepth:
            maxDepth = thisDepth
    return maxDepth

# Draw a node with an arrow pointing from its parent
def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    createPlot.ax1.annotate(nodeTxt, xy=parentPt, xycoords='axes fraction',
                            xytext=centerPt, textcoords='axes fraction',
                            va="center", ha="center",
                            bbox=nodeType, arrowprops=arrow_args)

# Draw the branch label halfway along the arrow
def plotMidText(cntrPt, parentPt, txtString):
    lens = len(txtString)
    xMid = (parentPt[0] + cntrPt[0]) / 2.0 - lens * 0.002
    yMid = (parentPt[1] + cntrPt[1]) / 2.0
    createPlot.ax1.text(xMid, yMid, txtString)

# Recursively lay out and draw the tree
def plotTree(myTree, parentPt, nodeTxt):
    numLeafs = getNumLeafs(myTree)
    depth = getTreeDepth(myTree)
    firstSides = list(myTree.keys())
    firstStr = firstSides[0]
    cntrPt = (plotTree.x0ff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalW, plotTree.y0ff)
    plotMidText(cntrPt, parentPt, nodeTxt)
    plotNode(firstStr, cntrPt, parentPt, decisionNode)
    secondDict = myTree[firstStr]
    plotTree.y0ff = plotTree.y0ff - 1.0 / plotTree.totalD
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':
            plotTree(secondDict[key], cntrPt, str(key))
        else:
            plotTree.x0ff = plotTree.x0ff + 1.0 / plotTree.totalW
            plotNode(secondDict[key], (plotTree.x0ff, plotTree.y0ff), cntrPt, leafNode)
            plotMidText((plotTree.x0ff, plotTree.y0ff), cntrPt, str(key))
    plotTree.y0ff = plotTree.y0ff + 1.0 / plotTree.totalD

def createPlot(inTree):
    fig = plt.figure(1, facecolor='white')
    fig.clf()
    axprops = dict(xticks=[], yticks=[])
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)
    plotTree.totalW = float(getNumLeafs(inTree))
    plotTree.totalD = float(getTreeDepth(inTree))
    plotTree.x0ff = -0.5 / plotTree.totalW
    plotTree.y0ff = 1.0
    plotTree(inTree, (0.5, 1.0), '')
    plt.show()

# Load the data: drop the index column; the last column is the class label
df = pd.read_csv('watermelon_4_3.csv')
data = df.values[:, 1:].tolist()
data_full = data[:]
labels = df.columns.values[1:-1].tolist()
labels_full = labels[:]
myTree = createTree(data, labels, data_full, labels_full)
print(myTree)
createPlot(myTree)
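The article does not reproduce watermelon_4_3.csv itself. The script expects the first column to be a row index (dropped by df.values[:, 1:]) and the last column to be the class label; a hypothetical fragment consistent with the output printed below might look like this (the column names and values are assumptions, not the actual file):

Idx,texture,touch,density,label
1,distinct,hard_smooth,0.697,1
2,little_blur,soft_stick,0.481,1
3,blur,hard_smooth,0.245,0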
The final output is:

{'texture': {'blur': 0, 'little_blur': {'touch': {'soft_stick': 1, 'hard_smooth': 0}}, 'distinct': {'density<=0.38149999999999995': {0: 1, 1: 0}}}}

[Figure: the decision tree rendered by createPlot; image not reproduced here.]
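The article stops at plotting the tree. As a closing sketch (not part of the original code), here is one hypothetical way to classify a new sample by walking the nested dict; it assumes binarized continuous nodes keep the 'name<=threshold' label produced by chooseBestFeatureToSplit, and the simplified featLabels list below matches only the features appearing in the printed tree:

def classify(inputTree, featLabels, testVec):
    # Each decision node is a one-key dict, each branch a key of the nested
    # dict, and each non-dict value a leaf class label.
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    if '<=' in firstStr:
        # Binarized continuous node: recover the original feature name and
        # threshold from the rewritten label; branches are keyed 1/0.
        featName, thresh = firstStr.split('<=')
        subTree = secondDict[1 if testVec[featLabels.index(featName)] <= float(thresh) else 0]
    else:
        subTree = secondDict[testVec[featLabels.index(firstStr)]]
    if isinstance(subTree, dict):
        return classify(subTree, featLabels, testVec)
    return subTree

tree = {'texture': {'blur': 0,
                    'little_blur': {'touch': {'soft_stick': 1, 'hard_smooth': 0}},
                    'distinct': {'density<=0.38149999999999995': {0: 1, 1: 0}}}}
print(classify(tree, ['texture', 'touch', 'density'], ['distinct', 'hard_smooth', 0.30]))  # 0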

References: "Machine Learning in Action" (Peter Harrington); "Machine Learning" by Zhou Zhihua.