Commit 7e3c9b2

committed
ID3, one of three decision-tree algorithms based on information theory
Theory reference: http://blog.csdn.net/gamer_gyt/article/details/51242815
1 parent 15837e3 commit 7e3c9b2

File tree

1 file changed

+96
-0
lines changed


Decision-Tree/DecisionTree-ID3.py

Lines changed: 96 additions & 0 deletions
@@ -0,0 +1,96 @@
#coding=utf-8
'''
ID3 decision tree based on information theory
'''
from math import log
import operator

def createDataSet():
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']  # the attribute (feature) names
    return dataSet, labels

# compute the Shannon entropy of a given data set
def calcShannonEnt(dataSet):
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]  # the class label is the last column
        # build a dict that counts each label
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1  # increment the count for this label
    # compute the Shannon entropy
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt
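# Worked check on the sample data: the labels are 2 x 'yes' and 3 x 'no', so
# H = -(2/5)*log2(2/5) - (3/5)*log2(3/5) ≈ 0.971, and
# calcShannonEnt(createDataSet()[0]) returns ~0.9709505944546686.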

# split the data set: the three parameters are the data set to be split,
# the index (axis) of the splitting feature, and the feature value to keep
def splitDataSet(dataSet, axis, value):
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            # extract the matching rows, dropping the splitting feature itself
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis+1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet  # returns a list of the reduced rows
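# Worked check on the sample data: splitDataSet(dataSet, 0, 1) keeps the three
# rows whose first feature equals 1 and drops that column, giving
# [[1, 'yes'], [1, 'yes'], [0, 'no']].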

# choose the best way to split the data set (largest information gain)
def chooseBestFeatureToSplit(dataSet):
    numFeature = len(dataSet[0]) - 1
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeature):
        featureList = [example[i] for example in dataSet]  # every value the i-th feature takes
        uniqueVals = set(featureList)  # build a set from the list to get the distinct values
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)  # split on feature i with this value
            prob = len(subDataSet) / float(len(dataSet))  # fraction of rows falling in this branch
            newEntropy += prob * calcShannonEnt(subDataSet)  # weighted entropy of the branch
        infoGain = baseEntropy - newEntropy
        # track the best information gain; the larger the gain, the more decisive the feature
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
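# Worked check on the sample data: splitting on feature 0 gives branches
# {1: [yes, yes, no], 0: [no, no]} with weighted entropy
# 3/5 * 0.918 + 2/5 * 0 ≈ 0.551, so the gain is 0.971 - 0.551 ≈ 0.420;
# feature 1 gives 4/5 * 1.0 + 1/5 * 0 = 0.800 and gain ≈ 0.171,
# so chooseBestFeatureToSplit(dataSet) returns 0.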

# -- recursively building the decision tree --
# majority vote: used when all features are consumed but the labels still disagree
def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)  # sort by count, descending
    return sortedClassCount[0][0]  # return the most frequent label
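# Worked check: majorityCnt(['yes', 'no', 'no']) returns 'no'.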

# function that creates the tree
def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):  # stop splitting when all labels are identical
        return classList[0]
    if len(dataSet[0]) == 1:  # all features consumed: return the most frequent label
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)  # choose the best way to split
    bestFeatLabel = labels[bestFeat]  # the corresponding label name
    myTree = {bestFeatLabel: {}}
    del(labels[bestFeat])  # remove the used label so deeper calls do not see it again
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]
        # recursive call to build the subtree for this branch
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree
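# For the sample data this builds
# {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}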

if __name__ == "__main__":
    dataSet, labels = createDataSet()
    print(createTree(dataSet, labels))
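
A minimal sketch of how the resulting tree could be used to classify a new
sample. The classify helper below is an assumption for illustration, not part
of this commit:

# hypothetical helper (not in this commit): walk the nested dict from createTree
def classify(inputTree, featLabels, testVec):
    firstStr = list(inputTree.keys())[0]    # feature name stored at this node
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)  # map the name back to a column index
    valueOfFeat = secondDict[testVec[featIndex]]
    if isinstance(valueOfFeat, dict):       # internal node: keep descending
        return classify(valueOfFeat, featLabels, testVec)
    return valueOfFeat                      # leaf: the predicted label

# e.g. classify(myTree, ['no surfacing', 'flippers'], [1, 0]) -> 'no'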
