1+ #-*-coding:utf-8-*-
2+ '''
3+ Created on 2016年5月8日
4+
5+ @author: Gamer Think
6+ '''
7+ from pydoc import apropos
8+
9+ #========================= 准备函数 (下) ==========================================
10+ #加载数据集
def loadDataSet():
    """Return the built-in sample data set.

    Returns:
        A new list of four transactions, each a list of integer item ids.
    """
    transactions = [
        [1, 3, 4],
        [2, 3, 5],
        [1, 2, 3, 5],
        [2, 5],
    ]
    return transactions
13+
def createC1(dataSet):
    """Build C1, the list of candidate itemsets of size one.

    Args:
        dataSet: Iterable of transactions; each transaction is an iterable
            of hashable, mutually orderable items.

    Returns:
        A list with one single-item ``frozenset`` per distinct item, ordered
        by item value. ``frozenset`` is immutable and hashable, so the
        itemsets can be used as dictionary keys.
    """
    items = set()
    for transaction in dataSet:
        items.update(transaction)
    # Return a real list rather than a lazy ``map`` object: the candidates
    # are scanned once per transaction, which would exhaust an iterator
    # after the first pass on Python 3.
    return [frozenset([item]) for item in sorted(items)]
23+
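# D is the data set (a list of transactions as sets), Ck is the list of
# candidate itemsets, and minSupport is the minimum support threshold.
# scanD filters Ck down to the itemsets that meet the minimum support
# (e.g. it produces L1 from C1).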
def scanD(D, Ck, minSupport):
    """Count the candidates in the data set and keep the frequent ones.

    Args:
        D: Iterable of transactions (each a set or other iterable of items).
        Ck: Iterable of candidate itemsets (``frozenset`` objects).
        minSupport: Minimum support, as a fraction of the transactions.

    Returns:
        A ``(retList, supportData)`` tuple. ``retList`` lists the candidates
        whose support is at least ``minSupport``. ``supportData`` maps every
        candidate that occurs in at least one transaction to its support.
    """
    # Materialize both inputs: D is measured with len() and Ck is re-scanned
    # for every transaction, neither of which works with one-shot iterators.
    D = list(D)
    Ck = list(Ck)

    ssCnt = {}
    for tid in D:
        for can in Ck:
            # The candidate counts when every one of its items is in tid.
            if can.issubset(tid):
                ssCnt[can] = ssCnt.get(can, 0) + 1

    numItems = float(len(D))
    retList = []
    supportData = {}
    for key, count in ssCnt.items():
        support = count / numItems
        supportData[key] = support
        if support >= minSupport:
            retList.append(key)
    # The original inserted each hit at the front of the list; reversing
    # once yields the same order without the repeated O(n) inserts.
    retList.reverse()
    return retList, supportData
48+ #==================== 准备函数(上) =============================
49+
50+ #====================== Apriori算法(下) =================================
# Create Ck: aprioriGen() takes the list of frequent itemsets Lk and the itemset size k, and returns the candidate list Ck.
def aprioriGen(Lk, k):
    """Join itemsets of size k-1 into candidate itemsets of size k.

    Two itemsets are merged when their first k-2 items, taken in sorted
    order, are equal.

    Args:
        Lk: Sequence of ``frozenset`` itemsets, each expected to hold k-1
            mutually orderable items.
        k: Size of the itemsets to generate.

    Returns:
        A list of ``frozenset`` candidates, in pair order (i, j) with i < j.
    """
    # Sort BEFORE slicing. Slicing the raw set iteration order first (as the
    # original did) makes the prefix depend on hash order, so valid joins
    # can be missed and the same candidate can be produced more than once.
    # Computing each prefix once also hoists the work out of the inner loop.
    prefixes = [sorted(itemset)[:k - 2] for itemset in Lk]

    retList = []
    count = len(Lk)
    for i in range(count):
        for j in range(i + 1, count):
            if prefixes[i] == prefixes[j]:
                retList.append(Lk[i] | Lk[j])
    return retList
66+
def apriori(dataSet, minSupport=0.5):
    """Run the Apriori algorithm and return all frequent itemsets.

    Args:
        dataSet: Iterable of transactions (each an iterable of items).
        minSupport: Minimum support, as a fraction of the transactions.

    Returns:
        A ``(L, supportData)`` tuple. ``L[i]`` is the list of frequent
        itemsets of size i+1; the last entry of ``L`` is always an empty
        list, which is what ends the search. ``supportData`` maps each
        counted itemset to its support.
    """
    # Materialize everything that is scanned more than once. On Python 3,
    # ``map`` returns a one-shot iterator, which has no len() and would be
    # empty on the second pass.
    dataSet = list(dataSet)
    C1 = list(createC1(dataSet))
    # D: [set([1, 3, 4]), set([2, 3, 5]), set([1, 2, 3, 5]), set([2, 5])]
    D = list(map(set, dataSet))

    L1, supportData = scanD(D, C1, minSupport)
    L = [L1]
    # Two itemsets of size k-1 are joined only when their first k-2 items
    # match, which is why aprioriGen slices with [:k - 2].
    k = 2
    while L[-1]:  # L[-1] is L[k - 2], the most recently found level
        Ck = aprioriGen(L[-1], k)
        Lk, supK = scanD(D, Ck, minSupport)
        supportData.update(supK)
        L.append(Lk)
        k += 1
    return L, supportData
82+ #====================== Apriori算法(上) =================================
83+
84+
85+ #======================== 关联规则生成函数 ========================
86+ #调用下边两个函数
87+ #L:表示频繁项集列表,supportData:包含那些频繁项集支持数据的字典,minConf:表示最小可信度阀值
def generateRules(L, supportData, minConf=0.7):
    """Generate association rules from the frequent itemsets (first version).

    Args:
        L: List of frequent-itemset lists; ``L[i]`` holds itemsets of size
            i+1. ``L[0]`` is skipped because a rule needs two items.
        supportData: Dict mapping itemsets to their support.
        minConf: Minimum confidence threshold.

    Returns:
        A list of ``(antecedent, consequent, confidence)`` tuples.

    Note:
        Itemsets from ``L[2]`` onward go straight to ``rulesFromConseq``,
        which merges the consequents to size two before evaluating them, so
        rules with a single-item consequent are not produced for those
        itemsets. ``generateRules2`` and ``generateRules3`` cover them.
    """
    rules = []
    for level, itemsets in enumerate(L[1:], 1):
        for freqSet in itemsets:
            singletons = [frozenset([item]) for item in freqSet]
            if level == 1:
                # Two-item sets: evaluate the single-item consequents.
                calcConf(freqSet, singletons, supportData, rules, minConf)
            else:
                # Larger sets: let rulesFromConseq build bigger consequents.
                rulesFromConseq(freqSet, singletons, supportData, rules, minConf)
    return rules
101+
102+ #第一次修改,出现丢失的那几个关联规则
def generateRules2(L, supportData, minConf=0.7):
    """Generate association rules, including single-item consequents.

    Unlike ``generateRules``, itemsets with three or more items are first
    checked against their single-item consequents, and only the consequents
    that pass are expanded further.

    Args:
        L: List of frequent-itemset lists; ``L[i]`` holds itemsets of size
            i+1. ``L[0]`` is skipped because a rule needs two items.
        supportData: Dict mapping itemsets to their support.
        minConf: Minimum confidence threshold.

    Returns:
        A list of ``(antecedent, consequent, confidence)`` tuples.
    """
    bigRuleList = []
    for i in range(1, len(L)):
        for freqSet in L[i]:
            H1 = [frozenset([item]) for item in freqSet]
            if i > 1:
                # Itemsets with three or more items.
                H1 = calcConf(freqSet, H1, supportData, bigRuleList, minConf)
                # Every consequent may have been pruned; rulesFromConseq
                # reads H[0], so it must not be handed an empty list.
                if H1:
                    rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
            else:
                # Itemsets with exactly two items.
                calcConf(freqSet, H1, supportData, bigRuleList, minConf)
    return bigRuleList
116+
117+ #第二次修改,简化函数,和第一步修改结果相同
def generateRules3(L, supportData, minConf=0.7):
    """Generate association rules by delegating every itemset to rulesFromConseq2.

    Args:
        L: List of frequent-itemset lists; ``L[0]`` (single items) is skipped.
        supportData: Dict mapping itemsets to their support.
        minConf: Minimum confidence threshold.

    Returns:
        A list of ``(antecedent, consequent, confidence)`` tuples.
    """
    rules = []
    for itemsets in L[1:]:
        for freqSet in itemsets:
            # Start from the single-item consequents of this itemset.
            singletons = [frozenset([item]) for item in freqSet]
            rulesFromConseq2(freqSet, singletons, supportData, rules, minConf)
    return rules
125+
def rulesFromConseq2(freqSet, H, supportData, brl, minConf=0.7):
    """Recursively evaluate consequents of growing size for one itemset.

    Args:
        freqSet: The frequent itemset the rules are derived from.
        H: List of candidate consequents (``frozenset``), all the same size.
        supportData: Dict mapping itemsets to their support.
        brl: List that receives the accepted rules (mutated in place).
        minConf: Minimum confidence threshold.

    Returns:
        None. Accepted rules are appended to ``brl``.
    """
    # aprioriGen can return no candidates (no two consequents share a
    # prefix), so the recursive call may get an empty list; H[0] would then
    # raise IndexError.
    if not H:
        return
    m = len(H[0])
    if len(freqSet) > m:  # the antecedent must keep at least one item
        Hmpl = calcConf(freqSet, H, supportData, brl, minConf)
        if len(Hmpl) > 1:  # need at least two survivors to merge
            Hmpl = aprioriGen(Hmpl, m + 1)
            rulesFromConseq2(freqSet, Hmpl, supportData, brl, minConf)
133+
134+ #第三次修改 消除rulesFromConseq2()函数中的递归项,去掉了多余的Hmpl变量,运行结果和上面相同
def rulesFromConseq3(freqSet, H, supportData, brl, minConf=0.7):
    """Iterative form of rulesFromConseq2: evaluate consequents of growing size.

    Args:
        freqSet: The frequent itemset the rules are derived from.
        H: List of candidate consequents (``frozenset``), all the same size.
        supportData: Dict mapping itemsets to their support.
        brl: List that receives the accepted rules (mutated in place).
        minConf: Minimum confidence threshold.

    Returns:
        None. Accepted rules are appended to ``brl``.
    """
    # An empty consequent list has nothing to evaluate, and H[0] would
    # raise IndexError.
    if not H:
        return
    m = len(H[0])
    while len(freqSet) > m:  # the antecedent must keep at least one item
        H = calcConf(freqSet, H, supportData, brl, minConf)
        if len(H) <= 1:
            # Fewer than two survivors cannot be merged into a next level.
            break
        H = aprioriGen(H, m + 1)
        m += 1
144+
145+ #计算规则的可信度,并找到满足最小可信度的规则存放在prunedH中,作为返回值返回
def calcConf(freqSet, H, supportData, br1, minConf=0.7):
    """Evaluate the confidence of the rules ``freqSet - conseq --> conseq``.

    Each rule that meets ``minConf`` is printed and appended to ``br1``.

    Args:
        freqSet: The frequent itemset the rules are derived from.
        H: Iterable of candidate consequents (``frozenset`` objects).
        supportData: Dict mapping itemsets to their support; it must hold
            ``freqSet`` and every ``freqSet - conseq``.
        br1: List that receives ``(antecedent, consequent, confidence)``
            tuples (mutated in place).
        minConf: Minimum confidence threshold.

    Returns:
        The list of consequents whose rule met ``minConf``.

    Raises:
        KeyError: If a required itemset is missing from ``supportData``.
    """
    prunedH = []
    for conseq in H:
        antecedent = freqSet - conseq
        conf = supportData[freqSet] / supportData[antecedent]
        if conf >= minConf:
            # One pre-formatted string prints the same text under both the
            # Python 2 print statement and the Python 3 print function.
            print("%s --> %s conf: %s" % (antecedent, conseq, conf))
            br1.append((antecedent, conseq, conf))
            prunedH.append(conseq)
    return prunedH
156+
157+ #从最初的项集中产生更多的关联规则,H为当前的候选规则集,产生下一层的候选规则集
158+ #freqSet:频繁项集 H:可以出现在规则右部的元素列表 supportData:保存项集的支持度,brl保存生成的关联规则,minConf:最小可信度阀值
def rulesFromConseq(freqSet, H, supportData, br1, minConf=0.7):
    """Grow the consequents by one item and evaluate the resulting rules.

    The consequents in ``H`` are merged into candidates one item larger,
    those are scored with ``calcConf``, and the survivors are expanded
    recursively. The consequents in ``H`` themselves are not scored here.

    Args:
        freqSet: The frequent itemset the rules are derived from.
        H: List of consequents (``frozenset``), all the same size.
        supportData: Dict mapping itemsets to their support.
        br1: List that receives the accepted rules (mutated in place).
        minConf: Minimum confidence threshold.

    Returns:
        None. Accepted rules are appended to ``br1``.
    """
    # Callers may pass a list that pruning has emptied; H[0] would then
    # raise IndexError.
    if not H:
        return
    m = len(H[0])
    # The merged consequents have m + 1 items and the antecedent must keep
    # at least one item.
    if len(freqSet) > (m + 1):
        Hmp1 = aprioriGen(H, m + 1)
        Hmp1 = calcConf(freqSet, Hmp1, supportData, br1, minConf)
        if len(Hmp1) > 1:  # need at least two survivors to merge again
            rulesFromConseq(freqSet, Hmp1, supportData, br1, minConf)
166+
167+
if __name__ == "__main__":
    # Demo: mine the sample data, then print the rules found by each of the
    # three rule generators at two confidence thresholds. The rules
    # themselves are printed by calcConf as they are accepted.
    dataSet = loadDataSet()
    L, suppData = apriori(dataSet)
    for size, itemsets in enumerate(L, 1):
        # Single pre-formatted strings print identically on Python 2 and 3.
        print("项数为 %s 的频繁项集: %s \n " % (size, itemsets))

    # This banner used to say "generateRules3" although the two calls below
    # are to generateRules.
    print("generateRules:\n minConf=0.7时:")
    rules = generateRules(L, suppData, minConf=0.7)
    print("\n minConf=0.5时:")
    rules = generateRules(L, suppData, minConf=0.5)

    print("generateRules2:\n minConf=0.7时:")
    rules = generateRules2(L, suppData, minConf=0.7)
    print("minConf=0.5时:")
    rules = generateRules2(L, suppData, minConf=0.5)

    print("generateRules3:\n minConf=0.7时:")
    rules = generateRules3(L, suppData, minConf=0.7)
    print("minConf=0.5时:")
    rules = generateRules3(L, suppData, minConf=0.5)
# 0 commit comments  (residue from the web page this file was copied from; not part of the program)