import os
import re

import matplotlib.pyplot as plot
import nltk
import numpy as np
import pandas as pd
import scipy.optimize as opt
from nltk.stem.porter import PorterStemmer
from scipy.io import loadmat
from sklearn import svm
10+
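'''
Email preprocessing steps:

Lower-casing: the whole email is converted to lower case.
Stripping HTML: HTML tags are removed.
Normalizing URLs: every URL is replaced with the token 'httpaddr'.
Normalizing Email Addresses: every address is replaced with the token 'emailaddr'.
Normalizing Numbers: every run of digits is replaced with the token 'number'.
Normalizing Dollars: every run of dollar signs is replaced with the token 'dollar'.
Word Stemming: each word is reduced to its stem with the Porter stemmer.
Removal of non-words: punctuation is removed and runs of whitespace are collapsed to a single space.
'''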
21+
def getDataSet(data_dir='/home/y_labor/ml/machine-learning-ex6/ex6'):
    """Load the vocabulary and the pre-extracted spam train/test matrices.

    Args:
        data_dir: Directory containing ``vocab.txt``, ``spamTrain.mat`` and
            ``spamTest.mat``. It defaults to the original author's Linux
            location so existing ``getDataSet()`` calls keep working; pass a
            different directory (for example on Windows) instead of editing
            this function.

    Returns:
        A tuple ``(voc_list, Xtest, ytest, X, y)``. ``voc_list`` is a dict
        mapping each vocabulary word to the number listed next to it in
        ``vocab.txt``; the other items are the ``'Xtest'``, ``'ytest'``,
        ``'X'`` and ``'y'`` arrays read from the two .mat files.
    """
    vocab = pd.read_csv(
        os.path.join(data_dir, 'vocab.txt'),
        sep='\t',
        header=None,
        names=['num', 'words'],
        # Vocabulary entries are plain words: keep strings such as "null",
        # "nan" or "na" as text instead of letting pandas turn them into NaN.
        keep_default_na=False,
    )
    voc_list = dict(zip(vocab['words'], vocab['num']))

    spamTest = loadmat(os.path.join(data_dir, 'spamTest.mat'))
    spamTrain = loadmat(os.path.join(data_dir, 'spamTrain.mat'))

    Xtest = spamTest['Xtest']
    ytest = spamTest['ytest']
    X = spamTrain['X']
    y = spamTrain['y']

    return voc_list, Xtest, ytest, X, y
44+
def getexample(data_dir='/home/y_labor/ml/machine-learning-ex6/ex6'):
    """Read the four sample emails shipped with the exercise.

    Args:
        data_dir: Directory containing ``emailSample1.txt``,
            ``emailSample2.txt``, ``spamSample1.txt`` and
            ``spamSample2.txt``. It defaults to the original author's Linux
            location so existing ``getexample()`` calls keep working.

    Returns:
        A list with the full text of the four files, in the order listed
        above.
    """
    sample_names = (
        'emailSample1.txt',
        'emailSample2.txt',
        'spamSample1.txt',
        'spamSample2.txt',
    )

    example = []
    for name in sample_names:
        with open(os.path.join(data_dir, name)) as f:
            example.append(f.read())

    return example
57+
def processEmail(email):
    """Normalize a raw email and return its list of stemmed tokens.

    The text is lower-cased, HTML tags are dropped, URLs, email addresses,
    digit runs and dollar signs are replaced with the placeholder tokens
    'httpaddr', 'emailaddr', 'number' and 'dollar', punctuation is turned
    into spaces, and the remaining words are tokenized with NLTK and stemmed
    with the Porter stemmer.

    Args:
        email: Raw email text.

    Returns:
        List of stemmed token strings.
    """
    email = email.lower()

    # Match one tag at a time ([^<>]+ cannot run past the closing '>'), and
    # replace it with a space so the words on either side stay separate.
    email = re.sub(r'<[^<>]+>', ' ', email)

    # A URL is anything starting with a scheme or "www." up to the next
    # whitespace; the whitespace itself is left in place so the placeholder
    # is not glued to the following word.
    email = re.sub(r'(?:https?://|www\.)\S+', 'httpaddr', email)

    # Any whitespace-delimited chunk containing '@' counts as an address.
    email = re.sub(r'\S+@\S+', 'emailaddr', email)

    email = re.sub(r'\d+', 'number', email)
    email = re.sub(r'[$]+', 'dollar', email)

    # '-' and '[' are escaped so they are literal members of the class
    # instead of forming a range / triggering a nested-set warning.
    email = re.sub(r'[@$/#.\-:&*+=\[\]?!(){},\'">_<;%]+', ' ', email)
    email = re.sub(r'\s+', ' ', email)

    porter = PorterStemmer()
    return [porter.stem(token) for token in nltk.word_tokenize(email)]
72+
def word_indices(email, voc_list):
    """Map the tokens of an email to their vocabulary numbers.

    Args:
        email: Iterable of token strings.
        voc_list: Mapping from vocabulary word to its number.

    Returns:
        List with ``voc_list[token]`` for every token found in ``voc_list``,
        in the order the tokens appear; unknown tokens are skipped and
        repeated tokens yield repeated entries.
    """
    return [voc_list[token] for token in email if token in voc_list]
80+
def emailFeatures(voc_list, indices):
    """Build the binary bag-of-words feature vector for one email.

    Args:
        voc_list: The vocabulary mapping; only its length is used here, to
            size the vector.
        indices: Vocabulary numbers of the words present in the email.
            NOTE(review): these are treated as 1-based (word number k sets
            position k - 1), which assumes the vocabulary file numbers its
            words from 1 -- confirm if a different vocabulary is used.

    Returns:
        1-D float array of length ``len(voc_list)`` holding 1 at the position
        of every listed word and 0 elsewhere.

    Raises:
        IndexError: If a number lies outside ``1..len(voc_list)``.
    """
    n_words = len(voc_list)
    feature = np.zeros(n_words)

    # Shift the 1-based vocabulary numbers to 0-based array positions.
    positions = np.asarray(indices, dtype=int).reshape(-1) - 1
    # Reject out-of-range numbers explicitly: position -1 would otherwise
    # wrap around silently and mark the last word.
    if positions.size and (positions.min() < 0 or positions.max() >= n_words):
        raise IndexError(
            'word indices must be between 1 and {}'.format(n_words))
    feature[positions] = 1

    print('feature vector had length {} and {} non-zero entries'.format(
        len(feature), int(feature.sum())))

    return feature
88+
def trainsvm(X, y, Xtest, ytest, c):
    """Fit a linear-kernel SVC and print its train and test accuracy.

    Args:
        X: Training feature matrix.
        y: Training labels; flattened before fitting.
        Xtest: Test feature matrix.
        ytest: Test labels.
        c: Value passed to ``svm.SVC`` as ``C``.

    Returns:
        The fitted ``svm.SVC`` classifier.
    """
    classifier = svm.SVC(kernel='linear', C=c, gamma='auto')
    classifier.fit(X, y.flatten())

    # score() returns the mean accuracy on the given data.
    train_accuracy = classifier.score(X, y)
    test_accuracy = classifier.score(Xtest, ytest)

    report = ('the classifier gets a training accuracy of about {:.2%} '
              'and a test accuracy of about {:.2%}')
    print(report.format(train_accuracy, test_accuracy))

    return classifier
99+
def predict(example, clf, voc_list=None):
    """Classify each raw email as spam or non-spam and print the verdict.

    Args:
        example: Iterable of raw email strings.
        clf: Fitted classifier exposing ``predict``.
        voc_list: Mapping from vocabulary word to its number. When omitted,
            the module-level ``voc_list`` is used, which is what this
            function always relied on before the parameter existed.

    Returns:
        List with the predicted label of every email, in input order
        (label 0 is reported as 'non-spam', anything else as 'is spam').

    Raises:
        NameError: If ``voc_list`` is not given and no module-level
            ``voc_list`` exists.
    """
    if voc_list is None:
        # Backward compatibility: fall back to the global bound by the
        # script entry point, but fail with a clear message if it is absent.
        try:
            voc_list = globals()['voc_list']
        except KeyError:
            raise NameError(
                "predict() needs a vocabulary: pass voc_list or define a "
                "module-level 'voc_list'") from None

    labels = []
    for email in example:
        tokens = processEmail(email)
        indices = word_indices(tokens, voc_list)
        # The classifier expects a 2-D array with one row per sample.
        feature = emailFeatures(voc_list, indices).reshape(1, -1)
        label = clf.predict(feature)[0]
        if label == 0:
            print('non-spam')
        else:
            print('is spam')
        labels.append(label)

    return labels
111+
if __name__ == '__main__':
    # Script entry point: load the data, train the classifier, then classify
    # the sample emails.
    # NOTE: the names bound here are module-level globals. predict() is
    # called without a vocabulary argument and reads the global `voc_list`,
    # so it must be assigned here before predict() runs.
    voc_list, Xtest, ytest, X, y = getDataSet()
    # Train with C = 0.5; trainsvm() also prints train/test accuracy.
    clf = trainsvm(X, y, Xtest, ytest, 0.5)

    # Print a spam / non-spam verdict for each sample email.
    example = getexample()
    predict(example, clf)
0 commit comments