1+ #-*-coding:utf-8-*-
2+ '''
3+ Created on 2016年5月9日
4+
5+ @author: Gamer Think
6+ '''
7+
# Tree node class: each instance stores one node of the FP-tree.
class treeNode:
    """A single node of an FP-tree.

    Attributes:
        name: the item value stored in this node.
        count: occurrence count stored in this node.
        parent: the parent node (None for the root).
        children: dict holding this node's child nodes.
        nodeLink: link to another node holding a similar item, or None.
    """

    def __init__(self, nameValue, numOccur, parentNode):
        self.name = nameValue
        self.count = numOccur
        self.parent = parentNode
        self.children = {}    # holds the child nodes of this node
        self.nodeLink = None  # links nodes that hold similar items

    def inc(self, numOccur):
        """Increase this node's count by numOccur."""
        self.count += numOccur

    def disp(self, ind=1):
        """Print the subtree rooted at this node, one node per line.

        Each level is indented by one more space than its parent. This is
        only a debugging aid; building the tree does not depend on it.
        """
        # A single pre-formatted argument produces the same text under the
        # Python 2 print statement and the Python 3 print function.
        print("%s %s %s %s" % (" " * ind, self.name, " ", self.count))
        for child in self.children.values():
            child.disp(ind + 1)
26+
# Builds the FP-tree.
def createTree(dataSet, minSup=1):
    """Build an FP-tree from a weighted collection of transactions.

    Args:
        dataSet: dict mapping each transaction (an iterable of hashable
            items, e.g. a frozenset) to its occurrence count.
        minSup: minimum total count an item needs in order to be kept.

    Returns:
        A pair (root, headerTable). root is the 'Null Set' root node and
        headerTable maps every kept item to a two-element list
        [total count, first node holding that item]. Returns (None, None)
        when no item reaches minSup.
    """
    # First pass over the data: total count of every item.
    headerTable = {}
    for trans in dataSet:
        for item in trans:
            headerTable[item] = headerTable.get(item, 0) + dataSet[trans]

    # Drop items below the minimum support. Iterate over a snapshot of the
    # keys: deleting while iterating the dict itself raises RuntimeError on
    # Python 3.
    for k in list(headerTable):
        if headerTable[k] < minSup:
            del headerTable[k]

    # Nothing frequent: there is no tree to build.
    if not headerTable:
        return None, None

    # Fix ONE global item order (descending count, ties broken by repr) and
    # reuse it for every transaction. Sorting each transaction by count alone
    # lets equal-count items come out in different orders, which splits
    # identical prefixes across branches and under-counts supports later.
    ordering = sorted(headerTable,
                      key=lambda item: (-headerTable[item], repr(item)))
    rank = dict((item, pos) for pos, item in enumerate(ordering))

    # Extend each entry with a slot for the first node holding that item.
    for k in headerTable:
        headerTable[k] = [headerTable[k], None]

    retTree = treeNode('Null Set', 1, None)  # root node

    # Second pass over the data: insert every filtered, ordered transaction.
    for tranSet, count in dataSet.items():
        frequent = [item for item in tranSet if item in rank]
        if frequent:
            frequent.sort(key=rank.__getitem__)
            updateTree(frequent, retTree, headerTable, count)
    return retTree, headerTable
57+
def updateTree(items, inTree, headerTable, count):
    """Insert one ordered transaction into the tree below inTree.

    Args:
        items: list of items, already in insertion order. An empty list is
            accepted and leaves the tree untouched.
        inTree: node under which the path is inserted.
        headerTable: dict mapping item -> [count, first node for the item];
            slot 1 is updated when a new node is created.
        count: amount added to every node along the path.

    Returns:
        None. inTree and headerTable are modified in place.
    """
    # Walk down iteratively instead of recursing on items[1:], which copied
    # the remaining list at every level and failed on an empty list.
    node = inTree
    for item in items:
        if item in node.children:
            # The path already exists up to here: just raise its count.
            child = node.children[item]
            child.inc(count)
        else:
            # Create a new node and add it as a child.
            child = treeNode(item, count, node)
            node.children[item] = child
            # Point the header table (or the last similar node) at it.
            if headerTable[item][1] is None:
                headerTable[item][1] = child
            else:
                updateHeader(headerTable[item][1], child)
        node = child
73+
# Follows the node links starting at nodeToTest to the last node of that list, then links that node to the new node targetNode.
def updateHeader(nodeToTest, targetNode):
    """Append targetNode to the end of the node-link list.

    Starting at nodeToTest, follows nodeLink until a node without a
    successor is found and sets that node's nodeLink to targetNode.
    """
    tail = nodeToTest
    while True:
        successor = tail.nodeLink
        if successor is None:
            break
        tail = successor
    tail.nodeLink = targetNode
79+
# Builds the sample data set.
def loadSimpDat():
    """Return the built-in sample data: six transactions of one-letter items.

    A new list of lists is created on every call.
    """
    # One string per transaction; each character is one item.
    rows = (
        'rzhjp',
        'zyxwvuts',
        'z',
        'rxnos',
        'yrxzqtp',
        'yzxeqstm',
    )
    return [list(row) for row in rows]
89+
def createInitSet(dataSet):
    """Convert a list of transactions into the dict form createTree expects.

    Args:
        dataSet: iterable of transactions, each an iterable of hashable items.

    Returns:
        dict mapping frozenset(transaction) to the number of times that
        item set occurs in dataSet.
    """
    retDict = {}
    for trans in dataSet:
        key = frozenset(trans)
        # Accumulate rather than assign 1: identical transactions share one
        # key, and overwriting would silently drop their extra occurrences.
        retDict[key] = retDict.get(key, 0) + 1
    return retDict
95+ #=========================================================
96+
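# Builds the conditional pattern base (the prefix paths) for a given item.
# basePat is the frequent item; treeNode is the first node for that item in the current FP-tree (available outside the function as headerTable[basePat][1]).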
def findPrefixPath(basePat, treeNode):
    """Collect the conditional pattern base reachable from a node-link list.

    Args:
        basePat: the item being mined. Not used by the computation; kept
            for interface compatibility.
        treeNode: first node of the list to follow via nodeLink, or None.

    Returns:
        dict mapping frozenset(ancestor names, excluding the parentless
        root) to the summed count of the nodes that have exactly that set
        of ancestors. Nodes hanging directly below the root contribute
        nothing.
    """
    condPats = {}
    node = treeNode  # the parameter name shadows the class; use a local alias
    while node is not None:
        # Climb from the node's parent up to, but not including, the node
        # that has no parent (the root).
        prefix = []
        ancestor = node.parent
        while ancestor is not None and ancestor.parent is not None:
            prefix.append(ancestor.name)
            ancestor = ancestor.parent
        if prefix:
            key = frozenset(prefix)
            # Sum instead of overwrite: two nodes can produce the same
            # ancestor set, and assigning would lose the earlier count.
            condPats[key] = condPats.get(key, 0) + node.count
        node = node.nodeLink
    return condPats
109+
# Helper that modifies prefixPath in place: appends the name of leafNode and then of each of its ancestors; a node without a parent is not appended.
def ascendTree(leafNode, prefixPath):
    """Append the names on the path from leafNode upward to prefixPath.

    Names are appended starting with leafNode itself and moving towards the
    top of the tree. A node whose parent is None is not appended.
    prefixPath is modified in place; nothing is returned.
    """
    current = leafNode
    while current.parent is not None:
        prefixPath.append(current.name)
        current = current.parent
115+
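# Recursively finds the frequent item sets.
# Parameters: inTree and headerTable are the FP-tree and header table produced by createTree();
#   minSup is the minimum support;
#   preFix: pass an empty set (set([])); it holds the current prefix inside the function;
#   freqItemList: pass an empty list ([]); the frequent item sets found are stored in it.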
def mineTree(inTree, headerTable, minSup, preFix, freqItemList):
    """Recursively collect frequent item sets into freqItemList.

    Args:
        inTree: the FP-tree belonging to headerTable. Not read by this
            function; kept for interface compatibility.
        headerTable: dict mapping item -> [count, first node for the item],
            as returned by createTree. None or empty means nothing to mine.
        minSup: minimum support passed on to createTree.
        preFix: set of items forming the current prefix; it is copied, not
            modified.
        freqItemList: list that receives every item set found (as sets).

    Returns:
        None. Results are appended to freqItemList. The conditional trees
        are also printed as a debugging aid.
    """
    if not headerTable:
        return
    # Ascending by count only. Sorting on the whole [count, node] entry makes
    # equal counts fall back to comparing node objects, which is a TypeError
    # on Python 3.
    bigL = sorted(headerTable, key=lambda item: headerTable[item][0])
    for basePat in bigL:
        newFreqSet = preFix.copy()
        newFreqSet.add(basePat)
        freqItemList.append(newFreqSet)
        condPattBases = findPrefixPath(basePat, headerTable[basePat][1])
        myCondTree, myHead = createTree(condPattBases, minSup)

        if myHead is not None:
            # Debug output (same text under Python 2 and Python 3).
            print('conditional tree for : %s' % (newFreqSet,))
            myCondTree.disp()

            mineTree(myCondTree, myHead, minSup, newFreqSet, freqItemList)
136+
# Convenience wrapper around the whole algorithm.
def fpGrowth(dataSet, minSup=3):
    """Run the whole pipeline on a list of transactions.

    Args:
        dataSet: iterable of transactions, each an iterable of hashable items.
        minSup: minimum support passed to createTree and mineTree.

    Returns:
        list of sets filled in by mineTree; an empty list when createTree
        finds no item that reaches minSup.
    """
    initSet = createInitSet(dataSet)
    myFPtree, myHeaderTab = createTree(initSet, minSup)
    freqItems = []
    # createTree returns (None, None) when nothing is frequent; mining that
    # would fail on the missing header table, so return the empty result.
    if myHeaderTab is None:
        return freqItems
    mineTree(myFPtree, myHeaderTab, minSup, set(), freqItems)
    return freqItems
144+
if __name__ == "__main__":
    # Manual checks for the individual steps, kept for reference.
    #
    # Load the data set and build the tree:
    #   simpDat = loadSimpDat()
    #   initSet = createInitSet(simpDat)
    #   myFPtree, myHeaderTab = createTree(initSet, 3)
    #   myFPtree.disp()
    #
    # findPrefixPath:
    #   print("x", findPrefixPath('x', myHeaderTab['x'][1]))
    #   print("z", findPrefixPath('z', myHeaderTab['z'][1]))
    #   print("r", findPrefixPath('r', myHeaderTab['r'][1]))
    #
    # mineTree:
    #   freqItems = []
    #   mineTree(myFPtree, myHeaderTab, 3, set([]), freqItems)
    #   print(freqItems)

    # Run the wrapped algorithm on the sample data.
    dataSet = loadSimpDat()
    freqItems = fpGrowth(dataSet)
    # Parenthesised single argument: identical output on Python 2 and 3.
    print(freqItems)
170+
0 commit comments