决策树 | WORIA

特征选择

信息增益

1.1 熵

随机变量 X ，其分布概率为 $P(X=x_i)=p_i , i=1,2,\cdots,n$

熵定义为 :

$$H(X)=-\sum_{i=1}^np_i\log {p_i}$$

若 $p_i=0$ ，则定义 $0\log 0=0$

1.2 条件熵

随机变量 Y 在 X 条件下的熵。

定义为：X 给定的条件下，Y 的条件概率分布的熵对 X 的数学期望。

$H(Y|X)=\sum_{i=1}^n p_iH(Y|X=x_i)$

1.3 信息增益

已知特征 X 而使 Y 的不确定性减少的程度。

$g(D,A)=H(D)-H(D|A)$

$D$ 表示输入空间， $A$ 表示特征。

ID3算法

2.1 算法

(1) 若 D 中所有实例属于同一类 $C_k$ ，则 T 为单节点树，将 $C_k$ 作为该结点的类标记，返回 T ；

(2) 若 $A =\varnothing$ . 则 T 为单节点树，将 D 中包含实例数最大的类 $C_k$ 作为该结点的类标记，返回 T ；

(3) 否则，计算信息增益，选择信息增益最大的特征 $A_g$ ；

(4) 如果 $A_g$ 的信息增益小于设定的阈值 $\varepsilon$ ，则置 T 为单节点树，将 D 中包含实例数最大的类 $C_k$ 作为该结点的类标记，返回 T ；

(5) 否则，对$ A_g$ 每个可能的选择将 D 分为若干个子集，以其包含实例数最大的 $C_k$ 作为该结点的类标记；构建子节点。

(6) 对第 i 个子节点，以 $D_i$ 为数据集，以 $A-{A_g}$ 为特征集，递归调用 (1)~(5)，得到子树 $T_i$ ，返回 $T_i$ 。

2.2 代码

#include<iostream>
#include<fstream>
#include<vector>
#include<cstring>
#include<cmath>
#include<algorithm>
using namespace std;
const int maxn = 100;	//最大数据数
const int maxk = 100;	//特征最大种类数
int K;
vector<vector<int> >dat;	//初始输入空间
vector<int>character;		//初始特征空间

double log_2(double a) {	//以2为底log
	if (a == 0)return 0.0;
	return log(a) / log(2);
}
struct Node {
	vector<Node*>child;
	int num_char = 0;
	vector<vector<int> >val;
	vector<int>cha;
	int label = 0;
	Node() {};
	Node(vector<vector<int> > a,vector<int> b):val(a),cha(b) {};
};

double comput_pd(vector<vector<int> >arr) {			//计算P(D)
	int N_now = 0;
	int tot = 0;
	for (auto c:arr) {
		N_now++;
		tot += c[K];
	}
	double res = -(1.0*tot/N_now)*log_2(1.0*tot / N_now)-(1.0*(N_now-tot)/N_now)*log_2(1.0*(N_now - tot) / N_now);
	return res;
}
double compute_pda(int chara,vector<vector<int> >arr) {	//计算P(D|Ai),第chara个特征
	double res=0;
	vector<vector<int> >p[maxk];
	vector<int>cnt;
	int N_now=0;
	for (int i = 0; i < character[chara]; i++)
		cnt.push_back(0);
	for (auto c : arr) {
		N_now++;
		p[c[chara]].push_back(c);
		cnt[c[chara]]++;
	}
	for (int i = 0; i < character[chara]; i++) {
		res += (cnt[i] * 1.0 / N_now)*comput_pd(p[i]);
	}
	return res;
}
double compute(int chara,vector<vector<int> >arr) {
	return comput_pd(arr) - compute_pda(chara,arr);
}
int find_most(const vector<vector<int> >& arr) {	//找到包含实例最多的类
	int a = 0;
	int b = 0;
	for (auto c : arr) {
		if (c[K] == 1)
			a++;
		else b++;
	}
	return a > b ? 1 : 0;
}

Node* build(const vector<vector<int> >& arr,const vector<int>& car) {//建树
	Node* t = new Node;
	t->cha = car;
	t->val = arr;
	int a = 0;
	int b = 0;
	for (auto c : t->val) {
		if (c[K] == 1)a++;
		else b++;
	}
	if (a == 0 ) {				//数据类别单一
		t->label = 0;
		return t;
	}
	if (b == 0) {
		t->label = 1;
		return t;
	}
	if (t->cha.empty()) {			//特征空间为空
		t->label = a > b ? 1 : 0;
		return t;
	}
	t->label = find_most(t->val);
	double mak = 0;
	int tmp;
	for (auto c:t->cha) {			//找到信息增益最大的特征
		double a = compute(c, t->val);
		if (a > mak) {
			mak = a;
			tmp = c;
		}
	}
	t->num_char = tmp;
	vector<int>ca_new;
	for (auto c : t->cha) {				//新的特征空间
		if (c != tmp)ca_new.push_back(c);
	}
	for (int i = 0; i < character[tmp]; i++) {	//新的输入空间
		vector<vector<int> >at;
		for (auto c : t->val) {
			if (c[tmp]==i)
				at.push_back(c);
		}
		t->child.push_back(build(at,ca_new));
	}
	return t;
}
//搜索决策树
int search(vector<int>arr, Node* root) {
	if (root->child.empty()) {
		return root->label;
	}
	int next_ch = arr[root->num_char];
	return search(arr, root->child[next_ch]);
}

int main() {
	ifstream fin;
	fin.open("train.txt");
	K = 4;						//特征数
	while (!fin.eof()) {
		vector<int>tmp;
		for (int i = 0; i <= K; i++) {
			int _a;
			fin >> _a;
			tmp.push_back(_a);	//初始输入空间
		}
		dat.push_back(tmp);
	}
	fin.close();
	vector<int>cara;
	character.push_back(3); character.push_back(2); character.push_back(2); character.push_back(3);
	for (int i = 0; i < K; i++)
		cara.push_back(i);		//初始特征空间
	Node* root=build(dat,cara);

	//test
	vector<int>test;
	test.push_back(2); test.push_back(1); test.push_back(0); test.push_back(1);
	cout<<search(test, root);
	return 0;
}