1+ #encoding:utf-8 
2+ ''' 
3+ Created on 2016年5月12日 
4+ 
5+ @author: Gamer Think 
6+ ''' 
7+ 
8+ from  numpy  import  * 
9+ 
# Functions for converting word lists into vocabulary vectors
def loadDataSet():
    """Return the built-in toy corpus used to exercise the classifier.

    Returns:
        A pair ``(documents, labels)``: ``documents`` is a list of six
        tokenised posts (each a list of word strings) and ``labels`` is the
        parallel list of class labels, where 1 marks an abusive post and 0 a
        normal one.
    """
    # Each entry pairs a class label (1 = abusive, 0 = normal) with its tokens.
    labelledPosts = [
        (0, ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please']),
        (1, ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid']),
        (0, ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him']),
        (1, ['stop', 'posting', 'stupid', 'worthless', 'garbage']),
        (0, ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him']),
        (1, ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']),
    ]
    documents = [tokens for _, tokens in labelledPosts]
    labels = [label for label, _ in labelledPosts]
    return documents, labels
20+ 
def createVocabList(dataSet):
    """Collect every distinct word that occurs anywhere in ``dataSet``.

    Args:
        dataSet: An iterable of documents, each an iterable of hashable words.

    Returns:
        A list holding each distinct word exactly once. The order comes from
        set iteration and is therefore unspecified.
    """
    uniqueWords = set()
    for tokens in dataSet:
        # Fold this document's words into the running union.
        uniqueWords.update(tokens)
    return list(uniqueWords)
26+ 
def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a presence/absence (set-of-words) vector.

    Args:
        vocabList: List of vocabulary words; position i of the result
            corresponds to ``vocabList[i]``.
        inputSet: Iterable of words making up the document.

    Returns:
        A list of ints, the same length as ``vocabList``, with 1 at the
        position of every vocabulary word present in ``inputSet`` and 0
        elsewhere. If a word occurs more than once in ``vocabList``, only its
        first position is set (matching ``list.index``).

    Side effects:
        Prints a notice for every input word that is not in the vocabulary.
    """
    # Build the word -> first-index table once so each lookup is O(1) instead
    # of scanning the vocabulary list twice per word.
    firstIndex = {}
    for position, vocabWord in enumerate(vocabList):
        firstIndex.setdefault(vocabWord, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = firstIndex.get(word)
        if position is None:
            # Parenthesised single-argument form prints the same text on
            # Python 2 and Python 3.
            print("the word:%s is not in my Vocabulary" % word)
        else:
            returnVec[position] = 1
    return returnVec
35+ 
36+ 
def bagOfWords2VecMN(vocabList, inputSet):
    """Encode a document as a word-count (bag-of-words) vector.

    Args:
        vocabList: List of vocabulary words; position i of the result
            corresponds to ``vocabList[i]``.
        inputSet: Iterable of words making up the document.

    Returns:
        A list of ints, the same length as ``vocabList``, where each entry is
        the number of times that vocabulary word occurs in ``inputSet``.
        Words not in the vocabulary are silently ignored. If a word occurs
        more than once in ``vocabList``, counts go to its first position
        (matching ``list.index``).
    """
    # Build the word -> first-index table once so each lookup is O(1) instead
    # of scanning the vocabulary list twice per word.
    firstIndex = {}
    for position, vocabWord in enumerate(vocabList):
        firstIndex.setdefault(vocabWord, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = firstIndex.get(word)
        if position is not None:
            returnVec[position] += 1
    return returnVec
43+ 
44+ 
# Naive Bayes classifier training function
def trainNB0(trainMatrix, trainCategory):
    """Estimate naive Bayes parameters from a document/word matrix.

    ``sum``, ``ones`` and ``log`` are the names brought in by the file's
    ``from numpy import *``.

    Args:
        trainMatrix: Indexable collection of documents, one word vector per
            document; all vectors are taken to have the length of the first.
        trainCategory: Per-document labels. A label equal to 1 puts the
            document in class 1; any other value puts it in class 0.

    Returns:
        A tuple ``(p0Vect, p1Vect, pAbusive)``: the log of the smoothed
        per-word frequency for class 0 and for class 1, and the fraction
        ``sum(trainCategory) / number of documents``.
    """
    docCount = len(trainMatrix)
    vocabSize = len(trainMatrix[0])  # width taken from the first document
    pAbusive = sum(trainCategory) / float(docCount)

    # Smoothing: word counts start at 1 and totals at 2.0 so that no word ends
    # up with probability zero. Index 0 holds class 0, index 1 holds class 1.
    wordCounts = [ones(vocabSize), ones(vocabSize)]
    totalCounts = [2.0, 2.0]

    for row in range(docCount):
        bucket = 1 if trainCategory[row] == 1 else 0
        wordCounts[bucket] += trainMatrix[row]
        totalCounts[bucket] += sum(trainMatrix[row])

    # Logs are returned so a classifier can add scores instead of multiplying
    # many small probabilities.
    p1Vect = log(wordCounts[1] / totalCounts[1])
    p0Vect = log(wordCounts[0] / totalCounts[0])
    return p0Vect, p1Vect, pAbusive
66+ 
# Naive Bayes classification function
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Choose the more likely of two classes for a word vector.

    ``sum`` and ``log`` are the names brought in by the file's
    ``from numpy import *``.

    Args:
        vec2Classify: Word vector of the document to classify.
        p0Vec: Per-word log-probabilities for class 0.
        p1Vec: Per-word log-probabilities for class 1.
        pClass1: Prior probability of class 1.

    Returns:
        1 if the class-1 log score is strictly greater than the class-0 log
        score, otherwise 0 (ties go to class 0).
    """
    # Element-wise product picks out the log-probabilities of the words that
    # are present; adding the log prior gives each class's score.
    scoreClass1 = sum(vec2Classify * p1Vec) + log(pClass1)
    scoreClass0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    return 1 if scoreClass1 > scoreClass0 else 0
75+ 
def testingNB():
    """Train on the built-in toy corpus and print two sample predictions.

    Loads the corpus with ``loadDataSet``, builds the vocabulary, encodes each
    post as a set-of-words vector, trains with ``trainNB0``, then classifies
    two hard-coded test documents and prints each one with its predicted
    class. Returns nothing.
    """
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)

    # One set-of-words vector per training post.
    trainMat = [setOfWords2Vec(myVocabList, postinDoc) for postinDoc in listOPosts]
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))

    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
        predicted = classifyNB(thisDoc, p0V, p1V, pAb)
        # A single pre-formatted string prints the same text on Python 2 and
        # Python 3 as the original `print a, 'classified as:', b` statement.
        print("%s classified as: %s" % (testEntry, predicted))
89+     
# Run the demo only when this file is executed as a script, not when imported.
if  __name__ == "__main__" :
    testingNB ()
0 commit comments