Skip to content

Commit e7e02e0

Browse files
committed
SVM-spam
1 parent 1de461d commit e7e02e0

File tree

2 files changed

+119
-0
lines changed

2 files changed

+119
-0
lines changed

venv/src/SVMs/GaussianKernel2.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,8 @@ def svmPredict(X, y, Xval, yval):
4747
best_score = score
4848
c, sigma = C, sig
4949

50+
print(c, sigma)
51+
5052
return c, sigma
5153

5254
if __name__ == '__main__':
Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
import re
2+
import nltk
3+
import numpy as np
4+
import pandas as pd
5+
from sklearn import svm
6+
import scipy.optimize as opt
7+
from scipy.io import loadmat
8+
import matplotlib.pyplot as plot
9+
from nltk.stem.porter import PorterStemmer
10+
11+
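'''
Preprocessing steps applied to every e-mail (see processEmail):
Lower-casing: the whole message is converted to lower case.
Stripping HTML: HTML tags are removed from the text.
Normalizing URLs: every URL is replaced with the token "httpaddr".
Normalizing Email Addresses: every address is replaced with the token "emailaddr".
Normalizing Numbers: every run of digits is replaced with the token "number".
Normalizing Dollars: every run of "$" signs is replaced with the token "dollar".
Word Stemming: each word is reduced to its stem with the Porter stemmer.
Removal of non-words: punctuation is removed and runs of whitespace collapse to one space.
'''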
21+
22+
def getDataSet(data_dir='/home/y_labor/ml/machine-learning-ex6/ex6'):
    """Load the vocabulary and the spam train/test matrices.

    Args:
        data_dir: Directory holding ``vocab.txt``, ``spamTrain.mat`` and
            ``spamTest.mat``. Defaults to the original Linux location; pass a
            different directory (for example a Windows path) instead of
            editing the code.

    Returns:
        Tuple ``(voc_list, Xtest, ytest, X, y)``. ``voc_list`` is a dict
        mapping each vocabulary word to the number stored next to it in
        ``vocab.txt``; the other items are the ``Xtest``/``ytest`` and
        ``X``/``y`` entries read from the two ``.mat`` files.
    """
    import os  # local import: the module header does not import os

    vocab = pd.read_csv(
        os.path.join(data_dir, 'vocab.txt'),
        sep='\t',
        header=None,
        names=['num', 'words'],
        # Keep vocabulary words such as "null" or "nan" as plain strings;
        # the default NA handling would turn them into NaN dictionary keys.
        keep_default_na=False,
    )
    voc_list = dict(zip(vocab['words'], vocab['num']))

    spamTest = loadmat(os.path.join(data_dir, 'spamTest.mat'))
    spamTrain = loadmat(os.path.join(data_dir, 'spamTrain.mat'))

    Xtest = spamTest['Xtest']
    ytest = spamTest['ytest']
    X = spamTrain['X']
    y = spamTrain['y']

    return voc_list, Xtest, ytest, X, y
44+
45+
def getexample(data_dir='/home/y_labor/ml/machine-learning-ex6/ex6'):
    """Read the four sample messages and return their raw text.

    Args:
        data_dir: Directory containing ``emailSample1.txt``,
            ``emailSample2.txt``, ``spamSample1.txt`` and ``spamSample2.txt``.
            Defaults to the original hard-coded location.

    Returns:
        List of four strings, in the order listed above.
    """
    import os  # local import: the module header does not import os

    sample_names = (
        'emailSample1.txt',
        'emailSample2.txt',
        'spamSample1.txt',
        'spamSample2.txt',
    )

    example = []
    for name in sample_names:
        with open(os.path.join(data_dir, name)) as f:
            example.append(f.read())

    return example
57+
58+
def processEmail(email):
    """Normalize a raw e-mail and return its list of stemmed tokens.

    The text is lower-cased, HTML tags are removed, URLs, e-mail addresses,
    digit runs and dollar signs are replaced with the placeholder tokens
    ``httpaddr``, ``emailaddr``, ``number`` and ``dollar``, every remaining
    non-alphanumeric character becomes a separator, and each token is stemmed
    with the Porter stemmer.

    Args:
        email: Raw message text.

    Returns:
        List of stemmed word tokens.
    """
    email = email.lower()

    # Strip HTML one tag at a time. A greedy "<.*>" would also delete all the
    # text sitting between the first "<" and the last ">" on a line.
    email = re.sub(r'<[^<>]+>', ' ', email)

    # URLs: anything starting with http://, https:// or www. up to the next
    # whitespace. The whitespace itself is kept so the placeholder does not
    # get glued to the following word.
    email = re.sub(r'(?:https?://|www\.)\S+', 'httpaddr', email)

    # E-mail addresses: any whitespace-free run containing an "@".
    email = re.sub(r'\S+@\S+', 'emailaddr', email)

    email = re.sub(r'\d+', 'number', email)
    email = re.sub(r'\$+', 'dollar', email)

    # Removal of non-words: every run of characters that is not a letter or a
    # digit (underscore included) collapses to a single space. This avoids a
    # hand-written punctuation class with accidental ranges such as ".-:".
    email = re.sub(r'[\W_]+', ' ', email).strip()

    tokens = nltk.word_tokenize(email)
    porter = PorterStemmer()

    return [porter.stem(w) for w in tokens]
72+
73+
def word_indices(email, voc_list):
    """Look up each token of ``email`` in ``voc_list``.

    Tokens missing from the vocabulary are skipped; the values of the
    remaining ones are returned in their original order (duplicates kept).
    """
    return [voc_list[token] for token in email if token in voc_list]
80+
81+
def emailFeatures(voc_list, indices):
    """Build the binary bag-of-words feature vector for one e-mail.

    Args:
        voc_list: Vocabulary dict; only its length is used here.
        indices: Vocabulary numbers of the words found in the e-mail. They
            are treated as 1-based, matching the numbering read from
            ``vocab.txt``, so number ``k`` sets position ``k - 1``.

    Returns:
        1-D float array of length ``len(voc_list)`` holding 1 at the position
        of every listed word and 0 elsewhere.

    Raises:
        IndexError: If a number lies outside ``1..len(voc_list)``.
    """
    size = len(voc_list)
    feature = np.zeros(size)

    # Shift the 1-based vocabulary numbers to 0-based array positions. Using
    # them unshifted moves every feature one column to the right and overflows
    # on the last vocabulary word.
    positions = np.asarray(indices, dtype=int).reshape(-1) - 1
    if positions.size and (positions.min() < 0 or positions.max() >= size):
        # Reject explicitly: a negative position would silently wrap around.
        raise IndexError('vocabulary numbers must lie in 1..{}'.format(size))
    feature[positions] = 1

    print('feature vector had length {} and {} non-zero entries'.format(len(feature), sum(feature)))

    return feature
88+
89+
def trainsvm(X, y, Xtest, ytest, c):
    """Fit a linear-kernel SVC and report its accuracy.

    The classifier is created with ``C=c``, fitted on ``X`` and the flattened
    ``y``, then scored on both the training pair and ``(Xtest, ytest)``. The
    two accuracies are printed as percentages and the fitted classifier is
    returned.
    """
    model = svm.SVC(kernel='linear', C=c, gamma='auto')
    model.fit(X, y.flatten())

    train_accuracy = model.score(X, y)
    test_accuracy = model.score(Xtest, ytest)

    report = 'the classifier gets a training accuracy of about {:.2%} and a test accuracy of about {:.2%}'
    print(report.format(train_accuracy, test_accuracy))

    return model
99+
100+
def predict(example, clf, voc_list=None):
    """Classify raw e-mails and print ``non-spam`` or ``is spam`` for each.

    Args:
        example: Iterable of raw e-mail strings.
        clf: Fitted classifier exposing ``predict``.
        voc_list: Vocabulary dict mapping words to their numbers. When
            omitted, the module-level ``voc_list`` is used, so the existing
            call ``predict(example, clf)`` from the script entry point keeps
            working.

    Raises:
        NameError: If ``voc_list`` is omitted and the module defines none.
    """
    if voc_list is None:
        # Previously an implicit global; look it up explicitly so importing
        # this module and calling predict() fails with a clear message.
        try:
            voc_list = globals()['voc_list']
        except KeyError:
            raise NameError(
                'predict() needs a vocabulary: pass voc_list or define a '
                'module-level voc_list first'
            ) from None

    for email in example:
        tokens = processEmail(email)
        indices = word_indices(tokens, voc_list)
        # The classifier expects a 2-D array holding a single sample row.
        feature = emailFeatures(voc_list, indices).reshape(1, -1)
        result = clf.predict(feature)
        if result == 0:
            print('non-spam')
        else:
            print('is spam')
111+
112+
if __name__ == '__main__':
    # Script entry point: load the data, train with C = 0.5, then classify
    # the four sample messages.
    # voc_list has to stay a module-level name because predict() reads it
    # as a global.
    voc_list, Xtest, ytest, X, y = getDataSet()
    clf = trainsvm(X, y, Xtest, ytest, c=0.5)

    predict(getexample(), clf)

0 commit comments

Comments
 (0)