Skip to content

Commit 7c5f55a

Browse files
committed
FP-Tree算法的Python代码实现以及新闻点击流分析实例
FP-Tree算法的Python代码实现以及新闻点击流分析实例。包含两个py文件:一个实现FP-Tree算法,另一个使用该算法进行新闻点击流分析。
1 parent 1702bb8 commit 7c5f55a

File tree

2 files changed

+193
-0
lines changed

2 files changed

+193
-0
lines changed

FP-growth/FP_Tree.py

Lines changed: 170 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,170 @@
1+
#-*-coding:utf-8-*-
2+
'''
3+
Created on 2016年5月9日
4+
5+
@author: Gamer Think
6+
'''
7+
8+
# A single node of the FP-tree.
class treeNode:
    """One node of an FP-tree.

    Attributes:
        name: the item stored in this node.
        count: accumulated count for this node.
        parent: the parent node, or None for the root.
        children: dict mapping an item to the child node holding it.
        nodeLink: next node in the tree holding the same item, or None.
    """

    def __init__(self, nameValue, numOccur, parentNode):
        self.name = nameValue
        self.count = numOccur
        self.parent = parentNode
        self.children = {}      # child nodes, keyed by item
        self.nodeLink = None    # link to the next node with the same item

    def inc(self, numOccur):
        """Add ``numOccur`` to this node's count."""
        self.count += numOccur

    def disp(self, ind=1):
        """Print the subtree rooted here as indented text (debug helper).

        Not needed for building or mining the tree.  ``ind`` is the number
        of leading spaces; each level below is indented one more.
        """
        # A single-argument print call emits the same text on Python 2 and
        # Python 3 as the former ``print a, b, c, d`` statement did.
        print("%s %s   %s" % (" " * ind, self.name, self.count))
        for child in self.children.values():
            child.disp(ind + 1)
26+
27+
# Build an FP-tree from a weighted set of transactions.
def createTree(dataSet, minSup=1):
    """Build an FP-tree and its header table.

    Args:
        dataSet: dict mapping each transaction (an iterable of hashable
            items, e.g. a frozenset) to the number of times it occurs.
        minSup: minimum total count an item needs to be kept.

    Returns:
        ``(retTree, headerTable)`` where ``retTree`` is the root node and
        ``headerTable`` maps each kept item to ``[total count, first node
        holding that item]``.  Returns ``(None, None)`` when no item
        reaches ``minSup``.
    """
    # First pass: total count of every item over all transactions.
    support = {}
    for trans, count in dataSet.items():
        for item in trans:
            support[item] = support.get(item, 0) + count

    # Keep only the frequent items.  A new dict is built instead of deleting
    # entries while iterating, which raises RuntimeError on Python 3.
    # The second slot of each entry will point at the first node of the item.
    headerTable = {}
    for item, total in support.items():
        if total >= minSup:
            headerTable[item] = [total, None]
    if not headerTable:
        return None, None

    # One global ordering (descending count) shared by every transaction.
    # Ordering per transaction would let items with equal counts appear in
    # different orders, spreading the same item set over several paths.
    ranked = sorted(headerTable, key=lambda item: headerTable[item][0],
                    reverse=True)
    rank = {}
    for position, item in enumerate(ranked):
        rank[item] = position

    retTree = treeNode('Null Set', 1, None)  # root node

    # Second pass: insert each transaction's frequent items in rank order.
    for tranSet, count in dataSet.items():
        orderedItems = sorted((item for item in tranSet if item in rank),
                              key=rank.__getitem__)
        if orderedItems:
            updateTree(orderedItems, retTree, headerTable, count)
    return retTree, headerTable
57+
58+
def updateTree(items, inTree, headerTable, count):
    """Insert an ordered list of items as a path below ``inTree``.

    Args:
        items: items of one transaction, already in insertion order.
            An empty list is a no-op.
        inTree: the node under which the path starts.
        headerTable: dict mapping item -> ``[count, first node]``; its node
            links are extended whenever a new node is created.
        count: amount added to every node along the path.
    """
    # Iterative descent: no per-level list copies and no recursion-depth
    # limit for long transactions.
    node = inTree
    for item in items:
        child = node.children.get(item)
        if child is not None:
            # The item already exists at this level: just bump its count.
            child.inc(count)
        else:
            # Otherwise create the node and hook it into the header chain.
            child = treeNode(item, count, node)
            node.children[item] = child
            if headerTable[item][1] is None:
                headerTable[item][1] = child
            else:
                updateHeader(headerTable[item][1], child)
        node = child
73+
74+
# Append targetNode to the end of the node-link chain starting at nodeToTest.
def updateHeader(nodeToTest, targetNode):
    """Link ``targetNode`` after the last node of a node-link chain.

    Args:
        nodeToTest: any node of the chain; the chain is followed through
            ``nodeLink`` until a node whose ``nodeLink`` is None is found.
        targetNode: the node to attach at the end.
    """
    tail = nodeToTest
    while tail.nodeLink is not None:
        tail = tail.nodeLink
    tail.nodeLink = targetNode
79+
80+
# Provide the small sample transaction database.
def loadSimpDat():
    """Return the six sample transactions, each a list of one-letter items."""
    rows = (
        'r z h j p',
        'z y x w v u t s',
        'z',
        'r x n o s',
        'y r x z q t p',
        'y z x e q s t m',
    )
    return [row.split() for row in rows]
89+
90+
def createInitSet(dataSet):
    """Convert a list of transactions into a ``{frozenset: count}`` dict.

    Args:
        dataSet: iterable of transactions, each an iterable of hashable
            items.

    Returns:
        dict mapping ``frozenset(transaction)`` to the number of
        transactions in ``dataSet`` that contain exactly that item set.
    """
    retDict = {}
    for trans in dataSet:
        key = frozenset(trans)
        # Accumulate instead of assigning 1, so that repeated transactions
        # keep their full weight.
        retDict[key] = retDict.get(key, 0) + 1
    return retDict
95+
#=========================================================
96+
97+
# Collect the conditional pattern base (prefix paths) of one item.
def findPrefixPath(basePat, treeNode):
    """Return the conditional pattern base reachable from ``treeNode``.

    Args:
        basePat: the item being examined (not used by the computation).
        treeNode: first node of the item's node-link chain, e.g.
            ``headerTable[basePat][1]``; may be None.

    Returns:
        dict mapping ``frozenset(items on the path above a node)`` to the
        summed count of the chain nodes that have that prefix.  Nodes
        directly below the root contribute nothing.
    """
    condPats = {}
    node = treeNode
    while node is not None:
        # Walk upwards from the node's parent, stopping before the root
        # (the only node whose parent is None).
        prefix = []
        ancestor = node.parent
        while ancestor is not None and ancestor.parent is not None:
            prefix.append(ancestor.name)
            ancestor = ancestor.parent
        if prefix:
            key = frozenset(prefix)
            # Sum rather than overwrite: two chain nodes can have prefix
            # paths made of the same items.
            condPats[key] = condPats.get(key, 0) + node.count
        node = node.nodeLink
    return condPats
109+
110+
# Helper: record the names on the path from a node up to (excluding) the root.
def ascendTree(leafNode, prefixPath):
    """Append the names from ``leafNode`` up to the root onto ``prefixPath``.

    ``prefixPath`` is modified in place: ``leafNode``'s own name comes
    first, then each ancestor's.  The root (the node whose ``parent`` is
    None) is not included.  Nothing is returned.
    """
    # A loop instead of recursion, so deep paths cannot hit the interpreter's
    # recursion limit.
    node = leafNode
    while node.parent is not None:
        prefixPath.append(node.name)
        node = node.parent
115+
116+
# Recursively mine frequent item sets from an FP-tree.
def mineTree(inTree, headerTable, minSup, preFix, freqItemList):
    """Collect every frequent item set reachable from ``headerTable``.

    Args:
        inTree: root of the FP-tree from ``createTree`` (not read here;
            the header table gives access to the nodes).
        headerTable: header table from ``createTree``; None or empty means
            there is nothing to mine.
        minSup: minimum support used when building conditional trees.
        preFix: set of items already fixed for this branch; pass an empty
            set on the first call.
        freqItemList: list that receives each frequent item set (as a
            set); pass an empty list on the first call.
    """
    if not headerTable:
        return

    # Ascending support.  Sort on the count alone: comparing the whole
    # ``[count, node]`` entry would compare node objects on ties.
    bigL = sorted(headerTable, key=lambda item: headerTable[item][0])
    for basePat in bigL:
        newFreqSet = preFix.copy()
        newFreqSet.add(basePat)
        freqItemList.append(newFreqSet)

        condPattBases = findPrefixPath(basePat, headerTable[basePat][1])
        myCondTree, myHead = createTree(condPattBases, minSup)
        if myHead is not None:
            # Debug output; same text as the former print statement.
            print('conditional tree for : %s' % (newFreqSet,))
            myCondTree.disp()
            mineTree(myCondTree, myHead, minSup, newFreqSet, freqItemList)
136+
137+
# Convenience wrapper around the whole FP-growth pipeline.
def fpGrowth(dataSet, minSup=3):
    """Return the frequent item sets of ``dataSet``.

    Args:
        dataSet: iterable of transactions (each an iterable of items).
        minSup: minimum support passed to ``createTree`` and ``mineTree``.

    Returns:
        list of sets, one per frequent item set; empty when no item
        reaches ``minSup``.
    """
    initSet = createInitSet(dataSet)
    myFPtree, myHeaderTab = createTree(initSet, minSup)
    freqItems = []
    # createTree returns (None, None) when nothing is frequent; there is
    # then no tree to mine.
    if myHeaderTab is not None:
        mineTree(myFPtree, myHeaderTab, minSup, set([]), freqItems)
    return freqItems
144+
145+
if __name__ == "__main__":
    # Earlier manual checks, kept for reference (disabled).
    #
    # Load the sample data and build the tree:
    #     simpDat = loadSimpDat()
    #     initSet = createInitSet(simpDat)
    #     myFPtree, myHeaderTab = createTree(initSet, 3)
    #     print(myFPtree.disp())
    #
    # Check findPrefixPath:
    #     print("x %s" % findPrefixPath('x', myHeaderTab['x'][1]))
    #     print("z %s" % findPrefixPath('z', myHeaderTab['z'][1]))
    #     print("r %s" % findPrefixPath('r', myHeaderTab['r'][1]))
    #
    # Check mineTree:
    #     freqItems = []
    #     mineTree(myFPtree, myHeaderTab, 3, set([]), freqItems)
    #     print(freqItems)

    # Run the packaged algorithm on the sample data.
    dataSet = loadSimpDat()
    freqItems = fpGrowth(dataSet)
    # Single-argument print call: valid on both Python 2 and Python 3.
    print(freqItems)
170+

FP-growth/newsClickStream.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
#-*-coding:utf-8-*-
2+
'''
3+
Created on 2016年5月9日
4+
5+
@author: Gamer Think
6+
'''
7+
import FP_Tree
8+
9+
# Minimum support used both to build and to mine the tree.
MIN_SUPPORT = 100000

# Load the data set: one transaction per line, items separated by whitespace.
# The ``with`` block closes the file once it has been read.
with open('kosarak.dat') as dataFile:
    parsedDat = [line.split() for line in dataFile]
# NOTE(review): this prints every parsed transaction; kept to preserve the
# script's output, consider dropping it for large inputs.
print(parsedDat)

# Convert the transactions into the {frozenset: count} form.
initSet = FP_Tree.createInitSet(parsedDat)

# Build the FP-tree.
myFPtree, myHeaderTab = FP_Tree.createTree(initSet, MIN_SUPPORT)

# Collect the frequent item sets.
myFreqList = []
# createTree returns (None, None) when no item reaches the threshold.
if myHeaderTab is not None:
    FP_Tree.mineTree(myFPtree, myHeaderTab, MIN_SUPPORT, set([]), myFreqList)
print(len(myFreqList))
print(myFreqList)

0 commit comments

Comments
 (0)