决策树进行多属性分类 C++实现

xiaoxiao2021-02-28  123

#include"stdafx.h" #include "stdlib.h" #include<iostream> #include<string> #include<vector> #include<map> #include<algorithm> #include<cmath> #include<fstream> using namespace std; #define MAXLEN 6//定义的属性个数 ifstream fin("data.txt"); ofstream fout("result.txt"); //自定义的决策树节点 struct Node { string attribute;//属性值 string arrived_value;//到达的属性值 vector<Node *> childs;//所有的分差节点,vector装入 Node() { // 初始化的时候均设为空值 attribute = ""; arrived_value = ""; } }*root; vector <vector <string> > state;//保存整个实例集 vector <string> item(MAXLEN);//对应实例集中的某一行 vector <string> attribute_row;//保存首行即属性行数据 string attrName[MAXLEN] = { "RID", "model", "perfume", "price", "income", "buyaction" }; map<string, vector <string> > map_attribute_values;//存储属性对应的所有的值 int tree_size = 0; // 将数据以文件流的形式输入 void DataInit() { string s; int i, j; while (fin >> s, s.compare("end") != 0) { //当输入结束时s.compare("end")的值为-1 item[0] = s; //第一个值保存的是RID for (int i = 1; i < MAXLEN; i++) { fin >> item[i]; } state.push_back(item); } for (j = 0; j < MAXLEN; j++) { attribute_row.push_back(attrName[j]); // 初始化属性栏值 } fin.close(); } // log2为底通过换底公式换成自然数底数 double lg2(double n) { return log(n) / log(2); } //根据数据实例存储属性对应的所有的值 void MapAttributeValue() { unsigned int i, j, k; bool exited = false; vector<string> values; //按照列遍历(每个属性对应一列) for (i = 1; i < MAXLEN - 1; i++) { for (j = 0; j < state.size(); j++) { for (k = 0; k < values.size(); k++) { //如果该属性值已经存在则退出循环 if (!values[k].compare(state[j][i])) { exited = true; break; } } //如果循环后的exited值仍为false说明该属性值并未出现过 //将其加入到暂时存放的向量中 if (!exited) { values.push_back(state[j][i]); } exited = false; //注意此处重置值 } map_attribute_values[attrName[i]] = values; //相对应的放入 values.clear(); //为了存放下一次的属性值清空 } } //根据具体属性和值来计算熵 double Entropy(vector <vector<string> > remain_state, string attribute, string value, bool ifparent) { vector<int> count(2, 0); int i, j; bool flag = false; for (j = 1; j < MAXLEN; j++) { if (flag) break; if (!attribute_row[j].compare(attribute)) { for (i = 1; i < remain_state.size(); i++) { if ((!ifparent&&!remain_state[i][j].compare(value)) || ifparent) {//ifparent记录是否算父节点 if (!remain_state[i][MAXLEN - 1].compare("yes")) { count[0]++; //count[0]记录实例中yes的值 } else count[1]++;//count[1]记录实例中no的值 } } flag = true; } } //全部是正实例或者负实例 if (count[0] == 0 || count[1] == 0) return 0; //具体计算熵 double sum = count[0] + count[1]; double entropy = -count[0] / sum * lg2((double)count[0] / sum) - count[1] / sum * lg2((double)count[1] / sum); return entropy; } //计算按照属性attribute划分当前剩余实例的信息增益 double Gain(vector <vector <string> > remain_state, string attribute) { unsigned int j, k, m; //首先求不做划分时的熵,即根的熵值 double parent_entropy = Entropy(remain_state, attribute, "", true); double children_entropy = 0; //然后求做划分后各个值的熵 vector<string> values = map_attribute_values[attribute]; vector<double> ratio;//存放每一个属性值在实例中的期望 vector<int> count_values; int tempint;//存放出现的次数 for (m = 0; m < values.size(); m++) { tempint = 0; for (k = 1; k < MAXLEN - 1; k++) { if (!attribute_row[k].compare(attribute)) { for (j = 1; j < remain_state.size(); j++) { if (!remain_state[j][k].compare(values[m])) { tempint++; } } } } count_values.push_back(tempint); } for (j = 0; j < values.size(); j++) { ratio.push_back((double)count_values[j] / (double)(remain_state.size() - 1)); } double temp_entropy; for (j = 0; j < values.size(); j++) { temp_entropy = Entropy(remain_state, attribute, values[j], false); children_entropy += ratio[j] * temp_entropy; //权值的累加 } return (parent_entropy - children_entropy); } //根据属性名查找到对应的AttriNum int FindAttriNumByName(string attri) { for (int i = 0; i < MAXLEN; i++) { if (!attrName[i].compare(attri)) return i; } cerr << "can't find the numth of attribute" << endl; return 0; } //找出样例中占多数的正/负性 string MostCommonLabel(vector <vector <string> > remain_state) { int y = 0, n = 0; int i; for (i = 0; i < remain_state.size(); i++) { if (!remain_state[i][MAXLEN - 1].compare("yes")) y++; else n++; } if (y >= n) return "yes"; else return "no"; } //若子集只含有单个属性,则分支为叶子节点 //判断样例是否正负性都一致,即是否到达叶子节点 bool IsReachLeaveNode(vector <vector <string> > remain_state, string label) { int count = 0, i; for (i = 0; i < remain_state.size(); i++) { if (!remain_state[i][MAXLEN - 1].compare(label)) count++; } if (count == remain_state.size() - 1) return true; else return false; } //生成决策树的方法 Node * BulidDecisionTree(Node * p, vector <vector <string> > remain_state, vector <string> remain_attribute) { int i; if (p == NULL) p = new Node(); //先看搜索到树叶的情况 if (IsReachLeaveNode(remain_state, "yes")) { p->attribute = "yes"; return p; } if (IsReachLeaveNode(remain_state, "no")) { p->attribute = "no"; return p; } //所有的属性均已经考虑完了,还没有分尽 if (remain_attribute.size() == 0) { string label = MostCommonLabel(remain_state); p->attribute = label; return p; } double max_gain = 0; //记录最大的熵值 double temp_gain; //信息增益算出来不一定大于0 //max_it应该初始化为remain_attribute.begin() vector <string>::iterator max_it = remain_attribute.begin(); vector <string>::iterator ittmp = remain_attribute.begin(); for (; ittmp < remain_attribute.end(); ittmp++) { temp_gain = Gain(remain_state, (*ittmp)); //求出指针对应属性的熵值 if (temp_gain > max_gain) {//若找到更佳的属性则转化 max_gain = temp_gain; max_it = ittmp; } } //下面根据max_it指向的属性来划分当前样例,更新样例集和属性集 vector <string> new_attribute;//保存去除最佳属性后的剩余属性 vector <vector <string> > new_state;//保存剩余的实例 vector <string>::iterator it2 = remain_attribute.begin(); for (; it2 < remain_attribute.end(); it2++) { if ((*it2).compare(*max_it)) new_attribute.push_back(*it2); } //确定了最佳划分属性,注意保存 p->attribute = *max_it; vector <string> values; //存放最佳属性的属性值 values = map_attribute_values[*max_it]; int attribue_num = FindAttriNumByName(*max_it); new_state.push_back(attribute_row);//将第一行的属性先包括 vector <string>::iterator it3 = values.begin(); for (; it3 < values.end(); it3++) { for (i = 1; i < remain_state.size(); i++) { if (!remain_state[i][attribue_num].compare(*it3)) { new_state.push_back(remain_state[i]); } } Node * new_node = new Node(); new_node->arrived_value = *it3; if (new_state.size() == 0) {//表示当前没有这个分支的样例,当前的new_node为叶子节点 new_node->attribute = MostCommonLabel(remain_state); } else BulidDecisionTree(new_node, new_state, new_attribute); p->childs.push_back(new_node);//将新结点加入父节点孩子容器 //注意先清空new_state中的前一个取值的样例,准备遍历下一个取值样例 new_state.erase(new_state.begin() + 1, new_state.end()); } return p; } //打印树的函数 void PrintTree(Node *p, int depth) { for (int i = 0; i < depth; i++) { fout << '\t';//按照树的深度先输出tab cout << '\t';//按照树的深度先输出tab } if (!p->arrived_value.empty()) { fout << p->arrived_value << endl; cout << p->arrived_value << endl; for (int i = 0; i < depth + 1; i++) { fout << '\t';//按照树的深度先输出tab cout << '\t';//按照树的深度先输出tab } } fout << p->attribute << endl; cout << p->attribute << endl; for (vector<Node*>::iterator it = p->childs.begin(); it != p->childs.end(); it++) { PrintTree(*it, depth + 1); //递归 } } //对单条数据,输出最后的结果 string TestClassifiter(string test[]) { string ans = ""; Node* arrived_attibute = root; //初始化为root bool isFind; while (true) { isFind = false; if (arrived_attibute->childs.size() == 1) break; //此时说明已经达到叶子节点 vector<Node*>::iterator it = arrived_attibute->childs.begin(); //循环扫描分叉中的值 for (; it != arrived_attibute->childs.end(); it++) { for (int k = 0; k < 4; k++) { if ((*it)->arrived_value == test[k]) { isFind = true; break; } } if (isFind) break; } if (!isFind) break; arrived_attibute = *it; } //叶子节点输出 return arrived_attibute->attribute; } int main() { int i; DataInit(); //数据初始化 //remain_attribute存放需要划分的属性(一开始是所有属性) vector <string> remain_attribute; for (i = 1; i < MAXLEN - 1; i++) remain_attribute.push_back(attrName[i]); //remain_state存放实例集(一开始是整个实例集) vector <vector <string> > remain_state; for (unsigned int i = 0; i < state.size(); i++) { remain_state.push_back(state[i]); } MapAttributeValue();//将属性值映射到对应的属性 root = BulidDecisionTree(root, remain_state, remain_attribute); fout << "the decision tree is :" << endl; PrintTree(root, 0); //以下是数据测试 string test[] = { "small", "strong", "expensive", "low" }; cout << "the test result is: " << TestClassifiter(test) << endl; system("pause"); return 0; }

data数据如下:

1 small strong expensive high no 2 small strong expensive a_bit_high no 3 small strong expensive common yes 4 small strong expensive low no 5 small strong a_litle_expensive high yes 6 small strong a_litle_expensive a_bit_high no 7 small strong a_litle_expensive common yes 8 small strong a_litle_expensive low no 9 small strong affordable high yes 10 small strong affordable a_bit_high yes 11 small strong affordable common yes 12 small strong affordable low no 13 small strong substantial high yes 14 small strong substantial a_bit_high no 15 small strong substantial common no 16 small strong substantial low no 17 small strong cheap high no 18 small strong cheap a_bit_high no 19 small strong cheap common no 20 small strong cheap low no 21 small elegant expensive high no 22 small elegant expensive a_bit_high no 23 small elegant expensive common yes 24 small elegant expensive low no 25 small elegant a_litle_expensive high no 26 small elegant a_litle_expensive a_bit_high yes 27 small elegant a_litle_expensive common no 28 small elegant a_litle_expensive low no 29 small elegant affordable high yes 30 small elegant affordable a_bit_high yes 31 small elegant affordable common no 32 small elegant affordable low no 33 small elegant substantial high yes 34 small elegant substantial a_bit_high yes 35 small elegant substantial common no 36 small elegant substantial low no 37 small elegant cheap high yes 38 small elegant cheap a_bit_high yes 39 small elegant cheap common no 40 small elegant cheap low no 41 small slight expensive high no 42 small slight expensive a_bit_high yes 43 small slight expensive common no 44 small slight expensive low no 45 small slight a_litle_expensive high yes 46 small slight a_litle_expensive a_bit_high no 47 small slight a_litle_expensive common yes 48 small slight a_litle_expensive low no 49 small slight affordable high no 50 small slight affordable a_bit_high no 51 small slight affordable common no 52 small slight affordable low no 53 small slight substantial high yes 54 small slight substantial a_bit_high no 55 small slight substantial common no 56 small slight substantial low no 57 small slight cheap high yes 58 small slight cheap a_bit_high no 59 small slight cheap common no 60 small slight cheap low no 61 smart strong expensive high yes 62 smart strong expensive a_bit_high no 63 smart strong expensive common no 64 smart strong expensive low no 65 smart strong a_litle_expensive high no 66 smart strong a_litle_expensive a_bit_high yes 67 smart strong a_litle_expensive common no 68 smart strong a_litle_expensive low no 69 smart strong affordable high yes 70 smart strong affordable a_bit_high yes 71 smart strong affordable common no 72 smart strong affordable low no 73 smart strong substantial high no 74 smart strong substantial a_bit_high no 75 smart strong substantial common no 76 smart strong substantial low no 77 smart strong cheap high no 78 smart strong cheap a_bit_high no 79 smart strong cheap common yes 80 smart strong cheap low no 81 smart elegant expensive high yes 82 smart elegant expensive a_bit_high no 83 smart elegant expensive common yes 84 smart elegant expensive low no 85 smart elegant a_litle_expensive high yes 86 smart elegant a_litle_expensive a_bit_high yes 87 smart elegant a_litle_expensive common yes 88 smart elegant a_litle_expensive low no 89 smart elegant affordable high yes 90 smart elegant affordable a_bit_high yes 91 smart elegant affordable common yes 92 smart elegant affordable low no 93 smart elegant substantial high yes 94 smart elegant substantial a_bit_high yes 95 smart elegant substantial common no 96 smart elegant substantial low no 97 smart elegant cheap high yes 98 smart elegant cheap a_bit_high no 99 smart elegant cheap common yes 100 smart elegant cheap low no 101 smart slight expensive high yes 102 smart slight expensive a_bit_high yes 103 smart slight expensive common no 104 smart slight expensive low no 105 smart slight a_litle_expensive high no 106 smart slight a_litle_expensive a_bit_high yes 107 smart slight a_litle_expensive common no 108 smart slight a_litle_expensive low no 109 smart slight affordable high no 110 smart slight affordable a_bit_high yes 111 smart slight affordable common yes 112 smart slight affordable low no 113 smart slight substantial high yes 114 smart slight substantial a_bit_high yes 115 smart slight substantial common yes 116 smart slight substantial low no 117 smart slight cheap high yes 118 smart slight cheap a_bit_high no 119 smart slight cheap common yes 120 smart slight cheap low no 121 middle strong expensive high yes 122 middle strong expensive a_bit_high no 123 middle strong expensive common yes 124 middle strong expensive low no 125 middle strong a_litle_expensive high no 126 middle strong a_litle_expensive a_bit_high no 127 middle strong a_litle_expensive common yes 128 middle strong a_litle_expensive low no 129 middle strong affordable high yes 130 middle strong affordable a_bit_high yes 131 middle strong affordable common no 132 middle strong affordable low no 133 middle strong substantial high no 134 middle strong substantial a_bit_high no 135 middle strong substantial common no 136 middle strong substantial low no 137 middle strong cheap high no 138 middle strong cheap a_bit_high yes 139 middle strong cheap common yes 140 middle strong cheap low no 141 middle elegant expensive high yes 142 middle elegant expensive a_bit_high no 143 middle elegant expensive common yes 144 middle elegant expensive low no 145 middle elegant a_litle_expensive high no 146 middle elegant a_litle_expensive a_bit_high yes 147 middle elegant a_litle_expensive common no 148 middle elegant a_litle_expensive low no 149 middle elegant affordable high no 150 middle elegant affordable a_bit_high yes 151 middle elegant affordable common yes 152 middle elegant affordable low no 153 middle elegant substantial high yes 154 middle elegant substantial a_bit_high no 155 middle elegant substantial common no 156 middle elegant substantial low no 157 middle elegant cheap high no 158 middle elegant cheap a_bit_high no 159 middle elegant cheap common yes 160 middle elegant cheap low no 161 middle slight expensive high yes 162 middle slight expensive a_bit_high yes 163 middle slight expensive common no 164 middle slight expensive low no 165 middle slight a_litle_expensive high yes 166 middle slight a_litle_expensive a_bit_high no 167 middle slight a_litle_expensive common no 168 middle slight a_litle_expensive low no 169 middle slight affordable high no 170 middle slight affordable a_bit_high yes 171 middle slight affordable common no 172 middle slight affordable low no 173 middle slight substantial high yes 174 middle slight substantial a_bit_high yes 175 middle slight substantial common yes 176 middle slight substantial low no 177 middle slight cheap high no 178 middle slight cheap a_bit_high yes 179 middle slight cheap common yes 180 middle slight cheap low no 181 large strong expensive high yes 182 large strong expensive a_bit_high no 183 large strong expensive common no 184 large strong expensive low no 185 large strong a_litle_expensive high no 186 large strong a_litle_expensive a_bit_high no 187 large strong a_litle_expensive common no 188 large strong a_litle_expensive low no 189 large strong affordable high no 190 large strong affordable a_bit_high no 191 large strong affordable common no 192 large strong affordable low no 193 large strong substantial high no 194 large strong substantial a_bit_high no 195 large strong substantial common no 196 large strong substantial low no 197 large strong cheap high yes 198 large strong cheap a_bit_high no 199 large strong cheap common yes 200 large strong cheap low no 201 large elegant expensive high yes 202 large elegant expensive a_bit_high yes 203 large elegant expensive common yes 204 large elegant expensive low no 205 large elegant a_litle_expensive high yes 206 large elegant a_litle_expensive a_bit_high no 207 large elegant a_litle_expensive common yes 208 large elegant a_litle_expensive low no 209 large elegant affordable high yes 210 large elegant affordable a_bit_high no 211 large elegant affordable common yes 212 large elegant affordable low no 213 large elegant substantial high no 214 large elegant substantial a_bit_high yes 215 large elegant substantial common no 216 large elegant substantial low no 217 large elegant cheap high yes 218 large elegant cheap a_bit_high no 219 large elegant cheap common yes 220 large elegant cheap low no 221 large slight expensive high no 222 large slight expensive a_bit_high no 223 large slight expensive common no 224 large slight expensive low no 225 large slight a_litle_expensive high yes 226 large slight a_litle_expensive a_bit_high yes 227 large slight a_litle_expensive common no 228 large slight a_litle_expensive low no 229 large slight affordable high yes 230 large slight affordable a_bit_high yes 231 large slight affordable common yes 232 large slight affordable low no 233 large slight substantial high yes 234 large slight substantial a_bit_high no 235 large slight substantial common yes 236 large slight substantial low no 237 large slight cheap high no 238 large slight cheap a_bit_high yes 239 large slight cheap common yes 240 large slight cheap low no end

转载请注明原文地址: https://www.6miu.com/read-23617.html

最新回复(0)