資料採礦-決策樹ID3分類演算法的C++實現

來源:互聯網
上載者:User

資料採礦課上面老師介紹了下決策樹ID3演算法,我抽空餘時間把這個演算法用C++實現了一遍。

決策樹演算法是非常常用的分類演算法,是逼近離散目標函數的方法,學習得到的函數以決策樹的形式表示。其基本思路是不斷選取產生資訊增益最大的屬性來劃分範例集合,構造決策樹。資訊增益定義為結點與其子結點的資訊熵之差。資訊熵是香農提出的,用於描述資訊不純度(不穩定性),其計算公式是 $\mathrm{Entropy}(S) = -\sum_{i} p_i \log_2 p_i$

Pi為子集合中不同類別(如二元分類即正範例和負範例)的範例所佔的比例。這樣資訊增益可以定義為樣本按照某屬性劃分時造成熵減少的期望,衡量了該屬性區分訓練樣本中正負樣本的能力,其計算公式是 $\mathrm{Gain}(S, A) = \mathrm{Entropy}(S) - \sum_{v \in \mathrm{Values}(A)} \frac{|S_v|}{|S|} \mathrm{Entropy}(S_v)$

我實現該演算法針對的範例集合如下

該表記錄了在不同氣候條件下是否去打球的情況,要求根據該表用程式輸出決策樹

C++代碼如下,程式中有詳細注釋

// ID3 decision-tree classifier.
//
// Reads a training table from stdin: the first row holds attribute names,
// the first column is an example id, the last column is the yes/no label,
// and a single token "end" terminates the input.  The tree is built by
// recursively choosing the attribute with maximal information gain
// (entropy of the parent minus the weighted entropy of the children),
// then printed indented by depth.
#include <iostream>
#include <string>
#include <vector>
#include <map>
#include <algorithm>
#include <cmath>
using namespace std;

#define MAXLEN 6  // tokens per input row: id + 4 attributes + label

// Multi-way tree representation: each node stores all of its children in a
// vector (simplest of the classic representations).  Horizontal traversal
// loops over an attribute's values; vertical traversal is the recursion.
vector<vector<string> > state;      // all rows, INCLUDING the header row at index 0
vector<string> item(MAXLEN);        // scratch buffer for one input row
vector<string> attribute_row;       // copy of the header row (attribute names)
string end_marker("end");           // input terminator (renamed: `end` collides with std::end)
string yes("yes");
string no("no");
string blank("");
map<string, vector<string> > map_attribute_values;  // attribute name -> its distinct values
int tree_size = 0;

struct Node {                       // decision-tree node
    string attribute;               // splitting attribute, or "yes"/"no" at a leaf
    string arrived_value;           // attribute value on the edge from the parent
    vector<Node *> childs;          // all children
    Node() {
        attribute = blank;
        arrived_value = blank;
    }
};
Node *root = NULL;

// Fill map_attribute_values: for every attribute column (columns 1..MAXLEN-2,
// skipping the id column and the label column) collect the distinct values
// that appear in the data rows.
void ComputeMapFrom2DVector() {
    unsigned int i, j, k;
    bool exited = false;
    vector<string> values;
    for (i = 1; i < MAXLEN - 1; i++) {          // walk column by column
        for (j = 1; j < state.size(); j++) {    // row 0 is the header
            for (k = 0; k < values.size(); k++) {
                if (!values[k].compare(state[j][i])) exited = true;
            }
            if (!exited) {
                values.push_back(state[j][i]);
            }
            exited = false;
        }
        map_attribute_values[state[0][i]] = values;
        values.erase(values.begin(), values.end());
    }
}

// Entropy of the examples in remain_state restricted to rows where
// `attribute` equals `value`.  When ifparent is true the restriction is
// skipped and the entropy of the whole remaining set is returned.
// Uses the two-class formula -p*log2(p) - n*log2(n) via the change-of-base
// identity log2(x) = ln(x)/ln(2).
double ComputeEntropy(const vector<vector<string> > &remain_state,
                      const string &attribute, const string &value, bool ifparent) {
    vector<int> count(2, 0);                    // count[0] = positives, count[1] = negatives
    unsigned int i, j;
    bool done_flag = false;                     // stop once the attribute column is found
    for (j = 1; j < MAXLEN; j++) {
        if (done_flag) break;
        if (!attribute_row[j].compare(attribute)) {
            for (i = 1; i < remain_state.size(); i++) {
                // ifparent: count every row; otherwise only rows matching `value`
                if ((!ifparent && !remain_state[i][j].compare(value)) || ifparent) {
                    if (!remain_state[i][MAXLEN - 1].compare(yes)) {
                        count[0]++;
                    } else {
                        count[1]++;
                    }
                }
            }
            done_flag = true;
        }
    }
    // All positive or all negative: entropy is zero (also avoids log(0)).
    if (count[0] == 0 || count[1] == 0) return 0;
    double sum = count[0] + count[1];
    double entropy = -count[0] / sum * log(count[0] / sum) / log(2.0)
                     - count[1] / sum * log(count[1] / sum) / log(2.0);
    return entropy;
}

// Information gain of splitting the remaining examples on `attribute`:
// parent entropy minus the ratio-weighted entropy of each value subset.
double ComputeGain(const vector<vector<string> > &remain_state, const string &attribute) {
    unsigned int j, k, m;
    // Entropy before the split.
    double parent_entropy = ComputeEntropy(remain_state, attribute, blank, true);
    double children_entropy = 0;
    vector<string> values = map_attribute_values[attribute];
    vector<double> ratio;
    vector<int> count_values;
    int tempint;
    // Count how many remaining examples carry each value of the attribute.
    for (m = 0; m < values.size(); m++) {
        tempint = 0;
        for (k = 1; k < MAXLEN - 1; k++) {
            if (!attribute_row[k].compare(attribute)) {
                for (j = 1; j < remain_state.size(); j++) {
                    if (!remain_state[j][k].compare(values[m])) {
                        tempint++;
                    }
                }
            }
        }
        count_values.push_back(tempint);
    }
    // Subset ratios |S_v| / |S| (size()-1 excludes the header row).
    for (j = 0; j < values.size(); j++) {
        ratio.push_back((double)count_values[j] / (double)(remain_state.size() - 1));
    }
    // Weighted entropy after the split.
    double temp_entropy;
    for (j = 0; j < values.size(); j++) {
        temp_entropy = ComputeEntropy(remain_state, attribute, values[j], false);
        children_entropy += ratio[j] * temp_entropy;
    }
    return (parent_entropy - children_entropy);
}

// Column index of an attribute name in the header row; 0 with a message on stderr
// if it is not found (should not happen with well-formed input).
int FindAttriNumByName(const string &attri) {
    for (int i = 0; i < MAXLEN; i++) {
        if (!state[0][i].compare(attri)) return i;
    }
    cerr << "can't find the numth of attribute" << endl;
    return 0;
}

// Majority label (yes/no) among the remaining examples; ties go to yes.
string MostCommonLabel(const vector<vector<string> > &remain_state) {
    int p = 0, n = 0;
    for (unsigned i = 0; i < remain_state.size(); i++) {
        if (!remain_state[i][MAXLEN - 1].compare(yes)) p++;
        else n++;
    }
    if (p >= n) return yes;
    else return no;
}

// True when every remaining example (excluding the header row) carries `label`.
bool AllTheSameLabel(const vector<vector<string> > &remain_state, const string &label) {
    int count = 0;
    for (unsigned int i = 0; i < remain_state.size(); i++) {
        if (!remain_state[i][MAXLEN - 1].compare(label)) count++;
    }
    // size()-1 excludes the header row (the header never matches the label,
    // so count reflects data rows only).
    if (count == (int)remain_state.size() - 1) return true;
    else return false;
}

// DFS construction of the decision tree.
//   p                - node to fill (allocated here when NULL)
//   remain_state     - remaining examples to classify (row 0 = header)
//   remain_attribute - attributes not yet used on this path
// Returns the (possibly newly allocated) node.
Node *BulidDecisionTreeDFS(Node *p, const vector<vector<string> > &remain_state,
                           const vector<string> &remain_attribute) {
    if (p == NULL) p = new Node();
    // Leaf cases: all remaining examples share one label.
    if (AllTheSameLabel(remain_state, yes)) {
        p->attribute = yes;
        return p;
    }
    if (AllTheSameLabel(remain_state, no)) {
        p->attribute = no;
        return p;
    }
    // All attributes used but examples still mixed: majority vote.
    if (remain_attribute.size() == 0) {
        p->attribute = MostCommonLabel(remain_state);
        return p;
    }
    // Pick the attribute with maximal information gain.
    double max_gain = 0, temp_gain;
    vector<string>::const_iterator max_it = remain_attribute.begin();
    vector<string>::const_iterator it1;
    for (it1 = remain_attribute.begin(); it1 < remain_attribute.end(); it1++) {
        temp_gain = ComputeGain(remain_state, (*it1));
        if (temp_gain > max_gain) {
            max_gain = temp_gain;
            max_it = it1;
        }
    }
    // Remaining attributes for the children = all but the chosen one.
    vector<string> new_attribute;
    vector<vector<string> > new_state;
    for (vector<string>::const_iterator it2 = remain_attribute.begin();
         it2 < remain_attribute.end(); it2++) {
        if ((*it2).compare(*max_it)) new_attribute.push_back(*it2);
    }
    p->attribute = *max_it;
    vector<string> values = map_attribute_values[*max_it];
    int attribue_num = FindAttriNumByName(*max_it);
    new_state.push_back(attribute_row);  // children keep the header row at index 0
    for (vector<string>::iterator it3 = values.begin(); it3 < values.end(); it3++) {
        // Gather the examples whose chosen attribute equals *it3.
        for (unsigned int i = 1; i < remain_state.size(); i++) {
            if (!remain_state[i][attribue_num].compare(*it3)) {
                new_state.push_back(remain_state[i]);
            }
        }
        Node *new_node = new Node();
        new_node->arrived_value = *it3;
        // BUG FIX: new_state always holds the header row, so "no examples on
        // this branch" is size() == 1 (the original tested == 0, which never
        // fired and let an empty branch be mislabeled "yes").
        if (new_state.size() == 1) {
            new_node->attribute = MostCommonLabel(remain_state);
        } else {
            BulidDecisionTreeDFS(new_node, new_state, new_attribute);
        }
        // On backtrack: attach the child, then drop this value's examples
        // (keep the header) before the next value is processed.
        p->childs.push_back(new_node);
        new_state.erase(new_state.begin() + 1, new_state.end());
    }
    return p;
}

// Read rows from stdin until the sentinel "end"; the first row (attribute
// names) is stored both in `state` and in `attribute_row`.
void Input() {
    string s;
    while (cin >> s, s.compare(end_marker) != 0) {
        item[0] = s;
        for (int i = 1; i < MAXLEN; i++) {
            cin >> item[i];
        }
        state.push_back(item);  // the header row goes in too
    }
    for (int j = 0; j < MAXLEN; j++) {
        attribute_row.push_back(state[0][j]);
    }
}

// Pre-order print: each node indented by its depth; an edge value (if any)
// is printed one level above its node's attribute.
void PrintTree(Node *p, int depth) {
    for (int i = 0; i < depth; i++) cout << '\t';
    if (!p->arrived_value.empty()) {
        cout << p->arrived_value << endl;
        for (int i = 0; i < depth + 1; i++) cout << '\t';
    }
    cout << p->attribute << endl;
    for (vector<Node *>::iterator it = p->childs.begin(); it != p->childs.end(); it++) {
        PrintTree(*it, depth + 1);
    }
}

// Post-order delete; counts freed nodes in the global tree_size.
void FreeTree(Node *p) {
    if (p == NULL) return;
    for (vector<Node *>::iterator it = p->childs.begin(); it != p->childs.end(); it++) {
        FreeTree(*it);
    }
    delete p;
    tree_size++;
}

int main() {
    Input();
    vector<string> remain_attribute;
    string outlook("Outlook");
    string Temperature("Temperature");
    string Humidity("Humidity");
    string Wind("Wind");
    remain_attribute.push_back(outlook);
    remain_attribute.push_back(Temperature);
    remain_attribute.push_back(Humidity);
    remain_attribute.push_back(Wind);
    vector<vector<string> > remain_state;
    for (unsigned int i = 0; i < state.size(); i++) {
        remain_state.push_back(state[i]);
    }
    ComputeMapFrom2DVector();
    root = BulidDecisionTreeDFS(root, remain_state, remain_attribute);
    cout << "the decision tree is :" << endl;
    PrintTree(root, 0);
    FreeTree(root);
    cout << endl;
    cout << "tree_size:" << tree_size << endl;
    return 0;
}

輸入的訓練資料如下

Day Outlook Temperature Humidity Wind PlayTennis1 Sunny Hot High Weak no2 Sunny Hot High Strong no3 Overcast Hot High Weak yes4 Rainy Mild High Weak yes5 Rainy Cool Normal Weak yes6 Rainy Cool Normal Strong no7 Overcast Cool Normal Strong yes8 Sunny Mild High Weak no9 Sunny Cool Normal Weak yes10 Rainy Mild Normal Weak yes11 Sunny Mild Normal Strong yes12 Overcast Mild High Strong yes13 Overcast Hot Normal Weak yes14 Rainy Mild High Strong noend

程式輸出決策樹如下

可以用圖形表示為

有了決策樹後,就可以根據氣候條件做預測了

例如如果氣候資料是{Sunny,Cool,Normal,Strong} ,根據決策樹走到左側的yes葉節點,可以判定會去打球。

另外在編寫這個程式時在資料結構的設計上面走了彎路,多叉樹的實現有很多方法,本演算法採用每個結點的所有孩子用vector儲存比較合適,同時注意維護剩餘範例和剩餘屬性資訊,建樹時橫向遍曆靠迴圈屬性的值,縱向遍曆靠遞迴調用 ,總體是DFS,樹和圖的遍曆在編程時經常遇到,得熟練掌握。程式有些地方的效率還得最佳化,有不足的地方還望大家拍磚。

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里云無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.