Skip to content

Commit 413dbfc

Browse files
committed
基于朴素贝叶斯分类算法构建文本分类器的Python实现
1 parent 9aaa307 commit 413dbfc

File tree

1 file changed

+91
-0
lines changed

1 file changed

+91
-0
lines changed

Bayes/bayes.py

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
#encoding:utf-8
2+
'''
3+
Created on 2016年5月12日
4+
5+
@author: Gamer Think
6+
'''
7+
8+
from numpy import *
9+
10+
# Toy corpus used to exercise the word-vector conversion functions.
def loadDataSet():
    """Return a small tokenized corpus and its class labels.

    Returns:
        posting_list: list of token lists, one per document.
        class_vec: parallel labels; 1 = abusive, 0 = normal.
    """
    posting_list = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    class_vec = [0, 1, 0, 1, 0, 1]  # 1 = abusive, 0 = normal
    return posting_list, class_vec
20+
21+
def createVocabList(dataSet):
    """Build the deduplicated vocabulary covering every document in dataSet."""
    vocab = set()
    for document in dataSet:
        # Fold each document's tokens into the running vocabulary.
        vocab.update(document)
    return list(vocab)
26+
27+
def setOfWords2Vec(vocabList, inputSet):
    """Convert a document to a 0/1 set-of-words vector over vocabList.

    Args:
        vocabList: ordered vocabulary (list of words).
        inputSet: iterable of tokens in the document.

    Returns:
        list of ints, one per vocab word: 1 if the word appears in
        inputSet, else 0. Unknown words are reported and skipped.
    """
    returnVec = [0] * len(vocabList)
    # Build the word->position map once: O(1) lookups instead of
    # calling list.index (O(V)) for every token.
    position = {word: i for i, word in enumerate(vocabList)}
    for word in inputSet:
        if word in position:
            returnVec[position[word]] = 1
        else:
            # Python 3 print call (the original Python 2 print
            # statement is a syntax error under Python 3).
            print("the word:%s is not in my Vocabulary" % word)
    return returnVec
35+
36+
37+
def bagOfWords2VecMN(vocabList, inputSet):
    """Convert a document to a bag-of-words count vector over vocabList.

    Unlike the set-of-words model, repeated words contribute their full
    occurrence count rather than a 0/1 flag. Tokens absent from the
    vocabulary are silently ignored.
    """
    tokens = list(inputSet)
    return [tokens.count(word) for word in vocabList]
43+
44+
45+
# Naive Bayes trainer.
def trainNB0(trainMatrix, trainCategory):
    """Train a two-class naive Bayes model.

    Args:
        trainMatrix: document-term matrix, one word-count/flag row per document.
        trainCategory: per-document labels; 1 = abusive, 0 = normal.

    Returns:
        (p0Vect, p1Vect, pAbusive): per-word log-likelihood vectors for
        class 0 and class 1, and the prior probability of class 1.
    """
    numDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    # Prior: fraction of documents labelled abusive.
    pAbusive = sum(trainCategory) / float(numDocs)
    # Laplace smoothing: start counts at 1 and denominators at 2 so an
    # unseen word never forces a zero probability.
    counts = {0: ones(numWords), 1: ones(numWords)}
    totals = {0: 2.0, 1: 2.0}
    for row, label in zip(trainMatrix, trainCategory):
        counts[label] += row
        totals[label] += sum(row)
    # Work in log space to avoid floating-point underflow when many
    # per-word probabilities are multiplied at classification time.
    p0Vect = log(counts[0] / totals[0])
    p1Vect = log(counts[1] / totals[1])
    return p0Vect, p1Vect, pAbusive
66+
67+
# Naive Bayes classification function.
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Classify a word vector: 1 (abusive) if its log-posterior under
    class 1 exceeds the one under class 0, otherwise 0.

    The element-wise product with the word vector selects the
    log-likelihoods of exactly the words present; adding the log prior
    gives the (unnormalized) log-posterior for each class.
    """
    logp1 = sum(vec2Classify * p1Vec) + log(pClass1)
    logp0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    return 1 if logp1 > logp0 else 0
75+
76+
def testingNB():
    """Smoke test: train on the toy corpus and classify two sample posts.

    Prints each test document and its predicted class (1 = abusive,
    0 = normal). The original Python 2 print statements are syntax
    errors under Python 3; they are converted to print() calls here.
    """
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    # Convert every document into a set-of-words vector over the vocabulary.
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))
    # Classify two sample documents with the trained model.
    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
        print(testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb))
89+
90+
# Run the demo classifier when executed as a script.
if __name__ == "__main__":
    testingNB()

0 commit comments

Comments
 (0)