1+ #encoding:utf-8
2+ '''
3+ Created on 2016年5月12日
4+
5+ @author: Gamer Think
6+ '''
7+
8+ from numpy import *
9+
# Helper functions for converting word lists into vectors
def loadDataSet():
    """Return the built-in toy corpus together with its class labels.

    Returns:
        A ``(postingList, classVec)`` pair. ``postingList`` is a list of six
        tokenized posts (each one a list of words) and ``classVec`` is the
        parallel list of labels, where 1 marks an abusive post and 0 a
        normal one.
    """
    # Each entry pairs the post (words separated by single spaces) with its label.
    labeledPosts = [
        ('my dog has flea problems help please', 0),
        ('maybe not take him to dog park stupid', 1),
        ('my dalmation is so cute I love him', 0),
        ('stop posting stupid worthless garbage', 1),
        ('mr licks ate my steak how to stop him', 0),
        ('quit buying worthless dog food stupid', 1),
    ]
    postingList = [text.split() for text, _ in labeledPosts]
    classVec = [label for _, label in labeledPosts]
    return postingList, classVec
20+
def createVocabList(dataSet):
    """Build the vocabulary of distinct words found in a collection of documents.

    Words are returned in order of first appearance. Converting a set straight
    to a list would make the order (and therefore every vector index derived
    from it) depend on set iteration order, which is not reproducible across
    interpreter runs when string hashing is randomized.

    Args:
        dataSet: Iterable of documents, each an iterable of hashable words.

    Returns:
        A list containing every distinct word exactly once.
    """
    seen = set()  # words already placed in the vocabulary
    vocabList = []
    for document in dataSet:
        for word in document:
            if word not in seen:
                seen.add(word)
                vocabList.append(word)
    return vocabList
26+
def setOfWords2Vec(vocabList, inputSet):
    """Convert a document into a set-of-words (presence/absence) vector.

    Args:
        vocabList: List of vocabulary words; position i of the result
            corresponds to ``vocabList[i]``. Words must be hashable.
        inputSet: Iterable of words making up the document.

    Returns:
        A list of ``len(vocabList)`` ints, 1 where the vocabulary word occurs
        in ``inputSet`` and 0 elsewhere. A message is printed for every input
        word that is not in the vocabulary.
    """
    # Index the vocabulary once so each word costs one dict lookup instead of
    # two linear scans (membership test + .index). setdefault keeps the first
    # position of a repeated word, matching list.index().
    wordIndex = {}
    for position, vocabWord in enumerate(vocabList):
        wordIndex.setdefault(vocabWord, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = wordIndex.get(word)
        if position is None:
            # Single parenthesized argument: same output on Python 2 and 3.
            print("the word:%s is not in my Vocabulary" % word)
        else:
            returnVec[position] = 1
    return returnVec
35+
36+
def bagOfWords2VecMN(vocabList, inputSet):
    """Convert a document into a bag-of-words (occurrence count) vector.

    Args:
        vocabList: List of vocabulary words; position i of the result
            corresponds to ``vocabList[i]``. Words must be hashable.
        inputSet: Iterable of words making up the document.

    Returns:
        A list of ``len(vocabList)`` ints holding how many times each
        vocabulary word occurs in ``inputSet``. Words that are not in the
        vocabulary are silently ignored.
    """
    # Index the vocabulary once so each word costs one dict lookup instead of
    # two linear scans (membership test + .index). setdefault keeps the first
    # position of a repeated word, matching list.index().
    wordIndex = {}
    for position, vocabWord in enumerate(vocabList):
        wordIndex.setdefault(vocabWord, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = wordIndex.get(word)
        if position is not None:
            returnVec[position] += 1
    return returnVec
43+
44+
# Naive Bayes classifier training
def trainNB0(trainMatrix, trainCategory):
    """Estimate per-class word log-probabilities and the class-1 prior.

    Args:
        trainMatrix: Document matrix, one word vector per training document;
            all rows are expected to have the length of the first one.
        trainCategory: Class label of each document; a label equal to 1
            counts as class 1, anything else as class 0.

    Returns:
        A ``(p0Vect, p1Vect, pAbusive)`` tuple: the element-wise log of each
        word's smoothed relative frequency in class 0 and in class 1, and the
        fraction ``sum(trainCategory) / len(trainMatrix)``.
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)

    # Accumulators indexed by class (0 or 1). Counts start at 1 and totals at
    # 2.0 rather than 0 so that no word ends up with a zero ratio, which
    # would turn into log(0) below.
    wordCounts = [ones(numWords), ones(numWords)]
    wordTotals = [2.0, 2.0]

    for i, docVec in enumerate(trainMatrix):
        cls = 1 if trainCategory[i] == 1 else 0
        wordCounts[cls] += docVec
        wordTotals[cls] += sum(docVec)

    # Results are kept in log space; classifyNB adds them up.
    p1Vect = log(wordCounts[1] / wordTotals[1])
    p0Vect = log(wordCounts[0] / wordTotals[0])
    return p0Vect, p1Vect, pAbusive
66+
# Naive Bayes classification function
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Pick the class with the larger log-score for a word vector.

    Args:
        vec2Classify: Word vector of the document to classify.
        p0Vec: Per-word log-probabilities for class 0.
        p1Vec: Per-word log-probabilities for class 1.
        pClass1: Prior probability of class 1; class 0 uses ``1.0 - pClass1``.

    Returns:
        1 when the class-1 score is strictly greater than the class-0 score,
        otherwise 0 (ties go to class 0).
    """
    # Element-wise product keeps only the log-probabilities of present words
    # (weighted by their counts); adding the log prior completes the score.
    logLikelihood1 = sum(vec2Classify * p1Vec)
    score1 = logLikelihood1 + log(pClass1)
    logLikelihood0 = sum(vec2Classify * p0Vec)
    score0 = logLikelihood0 + log(1.0 - pClass1)
    return 1 if score1 > score0 else 0
75+
def testingNB():
    """Train on the built-in toy corpus and print the class of two sample posts."""
    # Build the training data: posts, labels, vocabulary, and one
    # set-of-words vector per post.
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = [setOfWords2Vec(myVocabList, postinDoc) for postinDoc in listOPosts]
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))

    # Classify each sample post and report the predicted label.
    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
        predicted = classifyNB(thisDoc, p0V, p1V, pAb)
        # Joined into one string so the line printed is the same on
        # Python 2 and Python 3.
        print(' '.join([str(testEntry), 'classified as:', str(predicted)]))
89+
# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    testingNB()
0 commit comments