Skip to content

Commit d5bc005

Browse files
committed
Apriori算法的Python代码实现
Apriori算法的Python代码实现
1 parent 8cb7d48 commit d5bc005

File tree

1 file changed

+190
-0
lines changed

1 file changed

+190
-0
lines changed

Apriori/Apriori.py

Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
#-*-coding:utf-8-*-
2+
'''
3+
Created on 2016年5月8日
4+
5+
@author: Gamer Think
6+
'''
7+
from pydoc import apropos
8+
9+
#========================= 准备函数 (下) ==========================================
10+
#加载数据集
11+
def loadDataSet():
    """Return the built-in sample dataset.

    The dataset is a list of four transactions, each transaction being a
    list of integer item ids. A new list is built on every call.
    """
    transactions = [
        [1, 3, 4],
        [2, 3, 5],
        [1, 2, 3, 5],
        [2, 5],
    ]
    return transactions
13+
14+
def createC1(dataSet):
    """Build C1, the list of candidate itemsets of size one.

    Args:
        dataSet: iterable of transactions; each transaction is an iterable
            of hashable, mutually comparable items.

    Returns:
        A list of single-item frozensets, one per distinct item, ordered by
        ascending item value. frozenset is used because the itemsets are
        later used as dictionary keys.
    """
    # A set de-duplicates in O(1) per item instead of scanning a list.
    uniqueItems = set()
    for transaction in dataSet:
        uniqueItems.update(transaction)
    # Return a real list, not a lazy map object: callers iterate the
    # candidates once per transaction, which a one-shot iterator cannot do.
    return [frozenset([item]) for item in sorted(uniqueItems)]
23+
24+
#D is the dataset (a list of transaction sets), Ck is the list of candidate itemsets, minSupport is the minimum support threshold
25+
#该函数用于从C1生成L1,L1表示满足最低支持度的元素集合
26+
def scanD(D,Ck,minSupport):
    """Count candidate itemsets over the dataset and keep the frequent ones.

    Args:
        D: iterable of transactions, each usable as the argument of
            ``frozenset.issubset`` (e.g. a set of items).
        Ck: iterable of candidate itemsets (frozensets).
        minSupport: minimum fraction of transactions that must contain an
            itemset for it to be kept.

    Returns:
        A tuple ``(retList, supportData)``. ``retList`` lists the candidates
        whose support is >= minSupport. ``supportData`` maps every candidate
        that occurs in at least one transaction to its support, including
        those below the threshold.
    """
    # Materialise both inputs: D is measured with len() and Ck is walked once
    # per transaction, neither of which works on a one-shot iterator.
    transactions = list(D)
    candidates = list(Ck)

    ssCnt = {}
    for tid in transactions:
        for can in candidates:
            # issubset is true when every item of the candidate is in tid
            if can.issubset(tid):
                ssCnt[can] = ssCnt.get(can, 0) + 1

    numItems = float(len(transactions))
    retList = []
    supportData = {}
    for key in ssCnt:
        support = ssCnt[key] / numItems
        if support >= minSupport:
            retList.append(key)
        # Recorded for every counted candidate, frequent or not.
        supportData[key] = support
    # Reverse once to get the same order as repeated insert(0, key).
    retList.reverse()
    return retList,supportData
48+
#==================== 准备函数(上) =============================
49+
50+
#====================== Apriori算法(下) =================================
51+
#Create Ck: aprioriGen() takes the list of frequent itemsets Lk and the target itemset size k, and returns the candidate list Ck
52+
def aprioriGen(Lk,k):
    """Generate candidate itemsets of size k from itemsets of size k-1.

    Two itemsets are joined (set union) when their first k-2 items, taken
    in sorted order, are identical.

    Args:
        Lk: list of frozensets, each of size k-1, with mutually comparable
            items.
        k: size of the itemsets to generate.

    Returns:
        A list of frozensets, one per joinable pair, in pair order (i < j).
    """
    # Sort BEFORE slicing: a set has no defined iteration order, so slicing
    # first would pick an arbitrary "prefix" and could miss or duplicate
    # candidates. Computed once per itemset rather than once per pair.
    prefixes = [sorted(itemset)[:k - 2] for itemset in Lk]

    retList = []
    lenLk = len(Lk)
    for i in range(lenLk):
        for j in range(i + 1, lenLk):
            if prefixes[i] == prefixes[j]:
                retList.append(Lk[i] | Lk[j])
    return retList
66+
67+
def apriori(dataSet, minSupport=0.5):
    """Find all frequent itemsets of a dataset with the Apriori algorithm.

    Args:
        dataSet: sequence of transactions (each an iterable of items). It is
            traversed twice, so it must be re-iterable.
        minSupport: minimum support an itemset needs to count as frequent.

    Returns:
        A tuple ``(L, supportData)``. ``L[n]`` is the list of frequent
        itemsets of size n+1; the last element of ``L`` is always an empty
        list, because a level is appended before the loop checks whether it
        is empty. ``supportData`` is the merged support dictionary returned
        by the scanD calls.
    """
    # list(...) keeps the candidates re-iterable even if createC1 hands back
    # a lazy iterator.
    C1 = list(createC1(dataSet))
    # D: [set([1, 3, 4]), set([2, 3, 5]), set([1, 2, 3, 5]), set([2, 5])]
    # A real list is required: D is scanned once per level and measured
    # with len(), so a one-shot map object would not work.
    D = [set(transaction) for transaction in dataSet]
    L1,supportData = scanD(D, C1, minSupport)
    L = [L1]
    # L[k-2] holds the frequent itemsets of size k-1. aprioriGen joins two
    # of them only when their first k-2 items agree, producing the size-k
    # candidates.
    k = 2
    while len(L[k-2]) > 0:
        Ck = aprioriGen(L[k-2], k)
        Lk,supK = scanD(D, Ck, minSupport)
        supportData.update(supK)
        L.append(Lk)
        k += 1
    return L,supportData
82+
#====================== Apriori算法(上) =================================
83+
84+
85+
#======================== 关联规则生成函数 ========================
86+
#调用下边两个函数
87+
#L:表示频繁项集列表,supportData:包含那些频繁项集支持数据的字典,minConf:表示最小可信度阀值
88+
def generateRules(L, supportData,minConf = 0.7):
    """Generate association rules from the frequent itemsets (original version).

    Args:
        L: list of frequent-itemset lists, where L[i] holds the itemsets of
            size i+1.
        supportData: dict mapping itemsets to their support.
        minConf: minimum confidence threshold.

    Returns:
        The list of (antecedent, consequent, confidence) tuples appended by
        calcConf.

    Note:
        For itemsets of three or more items the single-item consequents are
        passed straight to rulesFromConseq, which merges them into larger
        consequents before calling calcConf. Rules with a single-item
        consequent are therefore never evaluated for those itemsets.
    """
    bigRuleList = []
    # L[0] holds 1-item sets, which cannot form a rule, so start at L[1].
    for level, itemsets in enumerate(L[1:], 1):
        # Both helpers share one call signature; pick the helper per level.
        # level 1 means 2-item sets: confidence is computed directly.
        process = rulesFromConseq if level > 1 else calcConf
        for freqSet in itemsets:
            singletons = [frozenset([item]) for item in freqSet]
            process(freqSet, singletons, supportData, bigRuleList, minConf)
    return bigRuleList
101+
102+
#第一次修改,出现丢失的那几个关联规则
103+
def generateRules2(L, supportData, minConf=0.7):
    """Generate association rules, first revision.

    Unlike generateRules, the single-item consequents of every itemset are
    first evaluated by calcConf. For itemsets of three or more items, the
    consequents that passed are then handed to rulesFromConseq.

    Args:
        L: list of frequent-itemset lists, where L[i] holds the itemsets of
            size i+1.
        supportData: dict mapping itemsets to their support.
        minConf: minimum confidence threshold.

    Returns:
        The list of (antecedent, consequent, confidence) tuples collected.
    """
    bigRuleList = []
    for level, itemsets in enumerate(L[1:], 1):
        for freqSet in itemsets:
            singletons = [frozenset([item]) for item in freqSet]
            # Evaluate the one-item consequents for every itemset size.
            kept = calcConf(freqSet, singletons, supportData, bigRuleList, minConf)
            if level > 1:
                # Three or more items: try to grow the surviving consequents.
                rulesFromConseq(freqSet, kept, supportData, bigRuleList, minConf)
    return bigRuleList
116+
117+
#第二次修改,简化函数,和第一步修改结果相同
118+
def generateRules3(L, supportData, minConf=0.7):
    """Generate association rules, second revision.

    Every frequent itemset of two or more items is handled the same way:
    its single-item consequents are passed to rulesFromConseq2.

    Args:
        L: list of frequent-itemset lists, where L[i] holds the itemsets of
            size i+1.
        supportData: dict mapping itemsets to their support.
        minConf: minimum confidence threshold.

    Returns:
        The list of (antecedent, consequent, confidence) tuples collected.
    """
    bigRuleList = []
    # Skip L[0]: a one-item set cannot be split into a rule.
    for itemsets in L[1:]:
        for freqSet in itemsets:
            singletons = [frozenset([item]) for item in freqSet]
            rulesFromConseq2(freqSet, singletons, supportData, bigRuleList, minConf)
    return bigRuleList
125+
126+
def rulesFromConseq2(freqSet, H, supportData, brl, minConf=0.7):
    """Recursively evaluate consequents of growing size for one itemset.

    Args:
        freqSet: the frequent itemset the rules are derived from.
        H: list of candidate consequents (frozensets of equal size).
        supportData: dict mapping itemsets to their support.
        brl: list that calcConf appends accepted rules to.
        minConf: minimum confidence threshold.

    Returns:
        None. Results are accumulated in ``brl``.
    """
    # aprioriGen may produce no merged candidates; without this guard the
    # recursive call would raise IndexError on H[0].
    if not H:
        return
    m = len(H[0])
    # A consequent must be strictly smaller than the itemset so that the
    # antecedent is non-empty.
    if len(freqSet) > m:
        Hmpl = calcConf(freqSet, H, supportData, brl, minConf)
        # At least two surviving consequents are needed to merge into the
        # next size.
        if len(Hmpl) > 1:
            Hmpl = aprioriGen(Hmpl, m + 1)
            rulesFromConseq2(freqSet, Hmpl, supportData, brl, minConf)
133+
134+
#第三次修改 消除rulesFromConseq2()函数中的递归项,去掉了多余的Hmpl变量,运行结果和上面相同
135+
def rulesFromConseq3(freqSet, H, supportData, brl, minConf=0.7):
    """Iterative counterpart of rulesFromConseq2.

    Args:
        freqSet: the frequent itemset the rules are derived from.
        H: list of candidate consequents (frozensets of equal size).
        supportData: dict mapping itemsets to their support.
        brl: list that calcConf appends accepted rules to.
        minConf: minimum confidence threshold.

    Returns:
        None. Results are accumulated in ``brl``.
    """
    # Nothing to evaluate; also avoids IndexError on H[0].
    if not H:
        return
    m = len(H[0])
    # Consequents must stay strictly smaller than the itemset.
    while len(freqSet) > m:
        H = calcConf(freqSet, H, supportData, brl, minConf)
        if len(H) > 1:
            # Enough survivors: merge them into consequents of size m + 1.
            H = aprioriGen(H, m + 1)
            m += 1
        else:
            # No next level of candidate consequents can be built.
            break
144+
145+
#计算规则的可信度,并找到满足最小可信度的规则存放在prunedH中,作为返回值返回
146+
def calcConf(freqSet,H,supportData, br1, minConf=0.7):
    """Compute rule confidence and keep the consequents that meet minConf.

    For each consequent ``conseq`` in H the rule is
    ``(freqSet - conseq) --> conseq``, with confidence
    ``support(freqSet) / support(freqSet - conseq)``.

    Args:
        freqSet: the frequent itemset the rules are derived from.
        H: iterable of candidate consequents (frozensets).
        supportData: dict mapping itemsets to their support.
        br1: list; each accepted rule is appended to it as an
            ``(antecedent, consequent, confidence)`` tuple.
        minConf: minimum confidence threshold.

    Returns:
        The list of consequents whose rule reached minConf.

    Raises:
        KeyError: if freqSet or an antecedent is missing from supportData.
    """
    prunedH = []
    for conseq in H:
        antecedent = freqSet - conseq
        conf = supportData[freqSet] / supportData[antecedent]
        if conf >= minConf:
            # One pre-formatted string gives the same output whether print
            # is a statement (Python 2) or a function (Python 3).
            print("%s --> %s conf: %s" % (antecedent, conseq, conf))
            br1.append((antecedent, conseq, conf))
            prunedH.append(conseq)
    return prunedH
156+
157+
#从最初的项集中产生更多的关联规则,H为当前的候选规则集,产生下一层的候选规则集
158+
#freqSet:频繁项集 H:可以出现在规则右部的元素列表 supportData:保存项集的支持度,brl保存生成的关联规则,minConf:最小可信度阀值
159+
def rulesFromConseq(freqSet, H, supportData, br1, minConf=0.7):
    """Grow consequents by one item and evaluate the resulting rules.

    Args:
        freqSet: the frequent itemset the rules are derived from.
        H: list of consequents (frozensets of equal size) that may appear on
            the right-hand side of a rule.
        supportData: dict mapping itemsets to their support.
        br1: list that calcConf appends accepted rules to.
        minConf: minimum confidence threshold.

    Returns:
        None. Results are accumulated in ``br1``.
    """
    # H can be empty when the caller passes a pruned calcConf result;
    # without this guard H[0] would raise IndexError.
    if not H:
        return
    m = len(H[0])
    # Only continue if a consequent of size m + 1 still leaves a non-empty
    # antecedent.
    if len(freqSet) > (m + 1):
        Hmp1 = aprioriGen(H, m + 1)
        Hmp1 = calcConf(freqSet, Hmp1, supportData, br1, minConf)
        # At least two survivors are needed to merge again.
        if len(Hmp1) > 1:
            rulesFromConseq(freqSet, Hmp1, supportData, br1, minConf)
166+
167+
168+
if __name__=="__main__":
    # Demo: mine the sample dataset, then print the rules produced by the
    # three generateRules variants at two confidence thresholds.
    dataSet = loadDataSet()
    L,suppData = apriori(dataSet)
    for size, itemsets in enumerate(L, 1):
        # Single formatted strings keep the output identical whether print
        # is a statement (Python 2) or a function (Python 3).
        print("项数为 %s 的频繁项集: %s \n" % (size, itemsets))

    # This section calls generateRules, so the banner names that function
    # (it previously said "generateRules3").
    print("generateRules:\nminConf=0.7时:")
    rules = generateRules(L,suppData, minConf=0.7)
    print("\nminConf=0.5时:")
    rules = generateRules(L,suppData, minConf=0.5)

    print("generateRules2:\nminConf=0.7时:")
    rules = generateRules2(L,suppData, minConf=0.7)
    print("minConf=0.5时:")
    rules = generateRules2(L,suppData, minConf=0.5)

    print("generateRules3:\nminConf=0.7时:")
    rules = generateRules3(L,suppData, minConf=0.7)
    print("minConf=0.5时:")
    rules = generateRules3(L,suppData, minConf=0.5)

0 commit comments

Comments
 (0)