1+ #-*-coding:utf-8-*- 
2+ ''' 
3+ Created on 2016年5月8日 
4+ 
5+ @author: Gamer Think 
6+ ''' 
7+ from  pydoc  import  apropos 
8+ 
9+ #=========================     准备函数 (下)      ========================================== 
10+ #加载数据集 
def loadDataSet():
    """Return the small hard-coded transaction database used by the demo.

    Each inner list is one transaction and each integer is an item id.
    A fresh list is built on every call.
    """
    transactions = [
        [1, 3, 4],
        [2, 3, 5],
        [1, 2, 3, 5],
        [2, 5],
    ]
    return transactions
13+ 
def createC1(dataSet):
    """Build C1, the candidate itemsets of size one.

    Args:
        dataSet: iterable of transactions, each an iterable of hashable,
            mutually comparable items.

    Returns:
        A list holding one single-element ``frozenset`` per distinct item,
        ordered by ascending item value.
    """
    distinct_items = set()
    for transaction in dataSet:
        distinct_items.update(transaction)
    # Return a real list rather than ``map(...)``: on Python 3 ``map`` is a
    # one-shot iterator, while scanD walks the candidates once per
    # transaction.  frozenset makes each itemset usable as a dict key.
    return [frozenset([item]) for item in sorted(distinct_items)]
23+ 
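# D is the dataset (a list of transactions, each a set), Ck is the list of candidate itemsets, minSupport is the minimum support.
# This function filters the candidates by support, e.g. it produces L1 (the itemsets that meet the minimum support) from C1.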
def scanD(D, Ck, minSupport):
    """Count candidate itemsets over the dataset and keep the frequent ones.

    Args:
        D: iterable of transactions, each a set of items.
        Ck: iterable of candidate itemsets (frozensets).
        minSupport: minimum fraction of transactions an itemset must occur in.

    Returns:
        A tuple ``(retList, supportData)``.  ``retList`` lists the candidates
        whose support is at least ``minSupport``.  ``supportData`` maps every
        candidate that occurs in at least one transaction to its support,
        whether or not it met the threshold.
    """
    # Materialise both inputs: D needs a length and Ck is walked once per
    # transaction, neither of which works with a one-shot iterator.
    transactions = list(D)
    candidates = list(Ck)

    ssCnt = {}
    for tid in transactions:
        for can in candidates:
            # A candidate is counted when all of its items are in the transaction.
            if can.issubset(tid):
                ssCnt[can] = ssCnt.get(can, 0) + 1

    # float() keeps the division below a true division on Python 2 as well.
    numItems = float(len(transactions))
    retList = []
    supportData = {}
    for key, count in ssCnt.items():
        support = count / numItems
        if support >= minSupport:
            retList.append(key)
        supportData[key] = support
    # Same ordering as prepending each hit, without the O(n) cost per insert.
    retList.reverse()
    return retList, supportData
48+ #====================                准备函数(上)              ============================= 
49+ 
50+ #======================          Apriori算法(下)               ================================= 
# Create Ck: aprioriGen() takes the list of frequent itemsets Lk and the itemset size k, and returns the candidate list Ck
def aprioriGen(Lk, k):
    """Generate candidate itemsets of size ``k`` from frequent (k-1)-itemsets.

    Two itemsets are joined (set union) when their first ``k - 2`` items, taken
    in sorted order, are identical.

    Args:
        Lk: sequence of frozensets, all of size ``k - 1``.
        k: size of the itemsets to generate.

    Returns:
        A list of frozensets, one per joined pair.
    """
    Lk = list(Lk)
    # Sort BEFORE slicing: a set has no defined iteration order, so slicing
    # the raw set could compare arbitrary items and miss valid joins.
    # The prefixes are computed once per itemset instead of once per pair.
    prefixes = [sorted(itemset)[:k - 2] for itemset in Lk]

    retList = []
    lenLk = len(Lk)
    for i in range(lenLk):
        for j in range(i + 1, lenLk):
            if prefixes[i] == prefixes[j]:
                retList.append(Lk[i] | Lk[j])
    return retList
66+ 
def apriori(dataSet, minSupport=0.5):
    """Run the Apriori algorithm and return all frequent itemsets.

    Args:
        dataSet: iterable of transactions, each an iterable of items.
        minSupport: minimum support an itemset needs to count as frequent.

    Returns:
        A tuple ``(L, supportData)``.  ``L[i]`` is the list of frequent
        itemsets of size ``i + 1``; the last element of ``L`` is always an
        empty list, because the loop stops once a level comes back empty.
        ``supportData`` maps every counted itemset to its support.
    """
    # Everything that is scanned more than once is held as a real list; on
    # Python 3 a ``map`` object would be exhausted after the first scan and
    # has no length.
    dataSet = list(dataSet)
    C1 = list(createC1(dataSet))
    D = [set(transaction) for transaction in dataSet]

    L1, supportData = scanD(D, C1, minSupport)
    L = [L1]
    k = 2
    # L[-1] holds the frequent (k-1)-itemsets.  aprioriGen joins two of them
    # only when their first k-2 items agree, giving the size-k candidates.
    while len(L[-1]) > 0:
        Ck = aprioriGen(L[-1], k)
        Lk, supK = scanD(D, Ck, minSupport)
        supportData.update(supK)
        L.append(Lk)
        k += 1
    return L, supportData
82+ #======================          Apriori算法(上)               ================================= 
83+ 
84+ 
85+ #========================            关联规则生成函数                     ======================== 
86+ #调用下边两个函数 
87+ #L:表示频繁项集列表,supportData:包含那些频繁项集支持数据的字典,minConf:表示最小可信度阀值 
def generateRules(L, supportData, minConf=0.7):
    """Derive association rules from the frequent itemsets (baseline version).

    Args:
        L: list of frequent-itemset lists; ``L[i]`` holds itemsets of size i+1.
        supportData: dict mapping itemsets to their support.
        minConf: minimum confidence a rule must reach.

    Returns:
        The list that calcConf fills with
        ``(antecedent, consequent, confidence)`` tuples.

    Known limitation: for itemsets with three or more items the single-item
    consequents are handed straight to rulesFromConseq, which only scores
    consequents one item larger than those it receives.  Rules with a
    single-item consequent are therefore never evaluated for those itemsets.
    """
    bigRuleList = []
    for level, frequent_sets in enumerate(L):
        # L[0] holds single items, which cannot be split into a rule.
        if level == 0:
            continue
        for freqSet in frequent_sets:
            singletons = [frozenset([item]) for item in freqSet]
            if level == 1:
                # Two-item sets: score each single-item consequent directly.
                calcConf(freqSet, singletons, supportData, bigRuleList, minConf)
            else:
                # Larger sets: let the helper merge consequents and score them.
                rulesFromConseq(freqSet, singletons, supportData, bigRuleList, minConf)
    return bigRuleList
101+ 
102+ #第一次修改,出现丢失的那几个关联规则 
def generateRules2(L, supportData, minConf=0.7):
    """Derive association rules, including single-item consequents.

    Unlike generateRules, the single-item consequents of every frequent
    itemset are scored with calcConf first; for itemsets of three or more
    items the surviving consequents are then passed to rulesFromConseq to
    build larger ones.

    Args:
        L: list of frequent-itemset lists; ``L[i]`` holds itemsets of size i+1.
        supportData: dict mapping itemsets to their support.
        minConf: minimum confidence a rule must reach.

    Returns:
        The list that calcConf fills with
        ``(antecedent, consequent, confidence)`` tuples.
    """
    bigRuleList = []
    for i in range(1, len(L)):
        for freqSet in L[i]:
            H1 = [frozenset([item]) for item in freqSet]
            H1 = calcConf(freqSet, H1, supportData, bigRuleList, minConf)
            # Merging needs at least two surviving consequents.  The guard
            # also stops an empty H1 reaching rulesFromConseq, which reads
            # H[0] and would raise IndexError.
            if i > 1 and len(H1) > 1:
                rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
    return bigRuleList
116+ 
117+ #第二次修改,简化函数,和第一步修改结果相同 
def generateRules3(L, supportData, minConf=0.7):
    """Derive association rules by delegating every itemset to rulesFromConseq2.

    Args:
        L: list of frequent-itemset lists; ``L[i]`` holds itemsets of size i+1.
        supportData: dict mapping itemsets to their support.
        minConf: minimum confidence a rule must reach.

    Returns:
        The list of rules filled in by rulesFromConseq2.
    """
    bigRuleList = []
    for level, frequent_sets in enumerate(L):
        # L[0] holds single items, which cannot be split into a rule.
        if level < 1:
            continue
        for freqSet in frequent_sets:
            singletons = [frozenset([item]) for item in freqSet]
            rulesFromConseq2(freqSet, singletons, supportData, bigRuleList, minConf)
    return bigRuleList
125+  
def rulesFromConseq2(freqSet, H, supportData, brl, minConf=0.7):
    """Recursively score consequents in H and grow the ones that survive.

    Args:
        freqSet: the frequent itemset the rules are derived from.
        H: list of candidate consequents (frozensets of equal size).
        supportData: dict mapping itemsets to their support.
        brl: list that calcConf appends the accepted rules to.
        minConf: minimum confidence a rule must reach.
    """
    # aprioriGen can return no candidates for the next level; without this
    # guard the recursive call would raise IndexError on H[0].
    if not H:
        return
    m = len(H[0])
    # A consequent must leave at least one item for the antecedent.
    if len(freqSet) > m:
        Hmpl = calcConf(freqSet, H, supportData, brl, minConf)
        # At least two survivors are needed to merge into larger consequents.
        if len(Hmpl) > 1:
            Hmpl = aprioriGen(Hmpl, m + 1)
            rulesFromConseq2(freqSet, Hmpl, supportData, brl, minConf)
133+ 
134+ #第三次修改       消除rulesFromConseq2()函数中的递归项,去掉了多余的Hmpl变量,运行结果和上面相同 
def rulesFromConseq3(freqSet, H, supportData, brl, minConf=0.7):
    """Iterative form of rulesFromConseq2: score consequents level by level.

    Args:
        freqSet: the frequent itemset the rules are derived from.
        H: list of candidate consequents (frozensets of equal size).
        supportData: dict mapping itemsets to their support.
        brl: list that calcConf appends the accepted rules to.
        minConf: minimum confidence a rule must reach.
    """
    # Nothing to score; also avoids IndexError on H[0] below.
    if not H:
        return
    m = len(H[0])
    # A consequent must leave at least one item for the antecedent.
    while len(freqSet) > m:
        H = calcConf(freqSet, H, supportData, brl, minConf)
        if len(H) <= 1:
            # Too few survivors to build the next level of consequents.
            break
        H = aprioriGen(H, m + 1)
        m += 1
144+ 
145+ #计算规则的可信度,并找到满足最小可信度的规则存放在prunedH中,作为返回值返回 
def calcConf(freqSet, H, supportData, br1, minConf=0.7):
    """Score the rules ``(freqSet - conseq) --> conseq`` for each conseq in H.

    Confidence is ``support(freqSet) / support(freqSet - conseq)``.  Each rule
    that reaches ``minConf`` is printed and appended to ``br1``.

    Args:
        freqSet: the frequent itemset the rules are derived from.
        H: iterable of candidate consequents (frozensets).
        supportData: dict mapping itemsets to their support.
        br1: list that receives ``(antecedent, consequent, confidence)`` tuples.
        minConf: minimum confidence a rule must reach.

    Returns:
        The list of consequents whose rule reached ``minConf``.

    Raises:
        KeyError: if ``freqSet`` or an antecedent is missing from ``supportData``.
    """
    prunedH = []
    rule_support = supportData[freqSet]
    for conseq in H:
        antecedent = freqSet - conseq
        conf = rule_support / supportData[antecedent]
        if conf >= minConf:
            # One pre-formatted string prints the same text on Python 2 and 3.
            print("%s --> %s conf: %s" % (antecedent, conseq, conf))
            br1.append((antecedent, conseq, conf))
            prunedH.append(conseq)
    return prunedH
156+ 
157+ #从最初的项集中产生更多的关联规则,H为当前的候选规则集,产生下一层的候选规则集 
158+ #freqSet:频繁项集 H:可以出现在规则右部的元素列表  supportData:保存项集的支持度,brl保存生成的关联规则,minConf:最小可信度阀值 
def rulesFromConseq(freqSet, H, supportData, br1, minConf=0.7):
    """Merge the consequents in H into larger ones and score those.

    H itself is not scored here; only the merged consequents, one item larger
    than those in H, are passed to calcConf.

    Args:
        freqSet: the frequent itemset the rules are derived from.
        H: list of consequents (frozensets of equal size) to merge.
        supportData: dict mapping itemsets to their support.
        br1: list that calcConf appends the accepted rules to.
        minConf: minimum confidence a rule must reach.
    """
    # Nothing to merge; also avoids IndexError on H[0] below.
    if not H:
        return
    m = len(H[0])
    # The merged consequent has m + 1 items and must leave at least one
    # item for the antecedent.
    if len(freqSet) > m + 1:
        Hmp1 = aprioriGen(H, m + 1)
        Hmp1 = calcConf(freqSet, Hmp1, supportData, br1, minConf)
        # At least two survivors are needed to merge again.
        if len(Hmp1) > 1:
            rulesFromConseq(freqSet, Hmp1, supportData, br1, minConf)
166+         
167+         
# Demo: mine the sample data, then print the rules found by each of the three
# rule generators at two confidence thresholds.  calcConf prints every
# accepted rule itself, so the returned lists are not printed again here.
if __name__ == "__main__":
    dataSet = loadDataSet()
    L, suppData = apriori(dataSet)
    for size, itemsets in enumerate(L, 1):
        # One pre-formatted string prints the same text on Python 2 and 3.
        print("项数为 %s 的频繁项集: %s \n " % (size, itemsets))

    # This section runs generateRules; the label used to say "generateRules3".
    print("generateRules:\n minConf=0.7时:")
    rules = generateRules(L, suppData, minConf=0.7)
    print("\n minConf=0.5时:")
    rules = generateRules(L, suppData, minConf=0.5)

    print("generateRules2:\n minConf=0.7时:")
    rules = generateRules2(L, suppData, minConf=0.7)
    print("minConf=0.5时:")
    rules = generateRules2(L, suppData, minConf=0.5)

    print("generateRules3:\n minConf=0.7时:")
    rules = generateRules3(L, suppData, minConf=0.7)
    print("minConf=0.5时:")
    rules = generateRules3(L, suppData, minConf=0.5)
0 commit comments