Skip to content

Commit b1cfed1

Browse files
committed
add
1 parent 524da40 commit b1cfed1

24 files changed

+468
-402
lines changed

AdaBoost/__init__.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
@Time : 2017/12/12 11:47
4+
@Author : Elvis
5+
"""
6+
"""
7+
__init__.py.py
8+
9+
"""

AdaBoost/adaboost.py

Lines changed: 78 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,12 @@
1212
import numpy as np
1313
import pandas as pd
1414

15-
from sklearn.cross_validation import train_test_split
15+
from sklearn.model_selection import train_test_split
1616
from sklearn.metrics import accuracy_score
1717

1818
sign_time_count = 0
1919

20+
2021
class Sign(object):
2122
'''
2223
阈值分类器
@@ -29,14 +30,14 @@ class Sign(object):
2930
因为是针对已经二值化后的MNIST数据集,所以v的取值只有3个 {0,1,2}
3031
'''
3132

def __init__(self, features, labels, w):
    """Store the training data for a single-feature threshold classifier.

    Args:
        features: Value of one feature for every training sample.
        labels: Label of every training sample, in the same order.
        w: Per-sample weights; stored by reference, not copied.

    The training code of this class calls ``len()`` on the labels and
    indexes both sequences. One-shot iterators such as the result of
    Python 3's ``map`` support neither, so they are materialised into
    lists here. Inputs that are already indexable are stored unchanged.
    """
    if not hasattr(features, '__getitem__'):
        features = list(features)
    if not hasattr(labels, '__getitem__'):
        labels = list(labels)

    self.X = features  # feature value of each training sample
    self.Y = labels  # label of each training sample
    self.N = len(labels)  # number of training samples

    self.w = w  # weight distribution over the training samples

    self.indexes = [0, 1, 2]  # candidate threshold values
4041

4142
def _train_less_than_(self):
4243
'''
@@ -48,21 +49,19 @@ def _train_less_than_(self):
4849

4950
for i in self.indexes:
5051
score = 0
51-
for j in xrange(self.N):
52+
for j in range(self.N):
5253
val = -1
53-
if self.X[j]<i:
54+
if self.X[j] < i:
5455
val = 1
5556

56-
if val*self.Y[j]<0:
57+
if val * self.Y[j] < 0:
5758
score += self.w[j]
5859

5960
if score < error_score:
6061
index = i
6162
error_score = score
6263

63-
return index,error_score
64-
65-
64+
return index, error_score
6665

6766
def _train_more_than_(self):
6867
'''
@@ -74,27 +73,27 @@ def _train_more_than_(self):
7473

7574
for i in self.indexes:
7675
score = 0
77-
for j in xrange(self.N):
76+
for j in range(self.N):
7877
val = 1
79-
if self.X[j]<i:
78+
if self.X[j] < i:
8079
val = -1
8180

82-
if val*self.Y[j]<0:
81+
if val * self.Y[j] < 0:
8382
score += self.w[j]
8483

8584
if score < error_score:
8685
index = i
8786
error_score = score
8887

89-
return index,error_score
88+
return index, error_score
9089

9190
def train(self):
9291
global sign_time_count
9392
time1 = time.time()
94-
less_index,less_score = self._train_less_than_()
95-
more_index,more_score = self._train_more_than_()
93+
less_index, less_score = self._train_less_than_()
94+
more_index, more_score = self._train_more_than_()
9695
time2 = time.time()
97-
sign_time_count += time2-time1
96+
sign_time_count += time2 - time1
9897

9998
if less_score < more_score:
10099
self.is_less = True
@@ -106,179 +105,179 @@ def train(self):
106105
self.index = more_index
107106
return more_score
108107

def predict(self, feature):
    """Classify one feature value with the trained threshold rule.

    Args:
        feature: Value of the feature this classifier was trained on.

    Returns:
        ``1.0`` or ``-1.0``. When ``self.is_less`` is true, values below
        ``self.index`` map to ``1.0`` and all others to ``-1.0``;
        otherwise the two outputs are swapped.
    """
    below_threshold = feature < self.index
    # ``is_less`` is a flag, so test its truthiness instead of ``> 0``.
    if self.is_less:
        return 1.0 if below_threshold else -1.0
    return -1.0 if below_threshold else 1.0
120119

121120

class AdaBoost(object):
    """AdaBoost ensemble built from single-feature ``Sign`` classifiers.

    Each boosting round trains one ``Sign`` per feature dimension, keeps the
    one with the lowest weighted error, and re-weights the training samples.
    NOTE(review): labels are presumably in {-1, +1}, since the final
    prediction is 1 or -1 -- confirm against callers.
    """

    def __init__(self, M=10):
        """Create an untrained ensemble.

        Args:
            M: Number of boosting rounds, i.e. weak classifiers to train.
                Defaults to 10, the value that was previously hard-coded.
        """
        self.M = M  # number of weak classifiers

    def _init_parameters_(self, features, labels):
        """Reset all training state for a new data set."""
        self.X = features  # training features, indexed as X[sample][dimension]
        # Labels are indexed repeatedly during training; a one-shot iterator
        # (e.g. the result of Python 3 ``map``) must be materialised first.
        self.Y = labels if hasattr(labels, '__getitem__') else list(labels)

        self.n = len(features[0])  # number of feature dimensions
        self.N = len(features)  # number of training samples

        self.w = [1.0 / self.N] * self.N  # sample weights, initially uniform
        self.alpha = []  # classifier coefficients (formula 8.2)
        self.classifier = []  # (dimension, classifier) for each round

    def _w_(self, index, classifier, i):
        """Return the un-normalised new weight of sample ``i``.

        This is formula 8.4 without the division by Z_m. It uses the most
        recently appended coefficient, ``self.alpha[-1]``.
        """
        prediction = classifier.predict(self.X[i][index])
        return self.w[i] * math.exp(-self.alpha[-1] * self.Y[i] * prediction)

    def _Z_(self, index, classifier):
        """Return the normalisation factor Z_m (formula 8.5)."""
        return sum(self._w_(index, classifier, i) for i in range(self.N))

    def train(self, features, labels):
        """Fit ``self.M`` weak classifiers on the given training data.

        Args:
            features: Indexable collection of samples; each sample is an
                indexable collection of feature values.
            labels: One label per sample, in the same order.
        """
        global sign_time_count

        self._init_parameters_(features, labels)

        for times in range(self.M):
            logging.debug('iterater %d' % times)

            time1 = time.time()
            map_time = 0

            # (weighted error, feature dimension, classifier)
            best_classifier = (100000, None, None)
            for i in range(self.n):
                map_time -= time.time()
                # Built as a list, not with ``map``: Sign indexes the column
                # and scans it several times, which a Python 3 ``map``
                # iterator does not support.
                column = [sample[i] for sample in self.X]
                map_time += time.time()
                classifier = Sign(column, self.Y, self.w)
                error_score = classifier.train()

                if error_score < best_classifier[0]:
                    best_classifier = (error_score, i, classifier)

            em = best_classifier[0]

            # Profiling output, to be removed later -- begin
            print('em is %s, index is %d' % (str(em), best_classifier[1]))
            time2 = time.time()
            print('总运行时间:%s, 那两段关键代码运行时间:%s, map的时间是:%s' % (str(time2 - time1), str(sign_time_count), str(map_time)))
            sign_time_count = 0
            # Profiling output, to be removed later -- end

            if em == 0:
                # A zero error would divide by zero below; use a large
                # fixed coefficient instead.
                self.alpha.append(100)
            else:
                self.alpha.append(0.5 * math.log((1 - em) / em))

            self.classifier.append(best_classifier[1:])

            Z = self._Z_(best_classifier[1], best_classifier[2])

            # Update the training-set weight distribution (formula 8.4).
            for i in range(self.N):
                self.w[i] = self._w_(best_classifier[1], best_classifier[2], i) / Z

    def _predict_(self, feature):
        """Return 1 or -1 for a single sample.

        The result is the sign of the coefficient-weighted sum of the stored
        classifiers' votes; a sum of exactly zero yields -1.
        """
        result = 0.0
        # Pair each coefficient with its classifier directly so the loop
        # follows what was actually trained.
        for alpha, (index, classifier) in zip(self.alpha, self.classifier):
            result += alpha * classifier.predict(feature[index])

        return 1 if result > 0 else -1

    def predict(self, features):
        """Return a list with the predicted label (1 or -1) of every sample."""
        return [self._predict_(feature) for feature in features]
224220

221+
# Binarisation
def binaryzation(img):
    """Return an inverted binary copy of ``img`` as a ``uint8`` array.

    The input is cast to ``uint8``; pixels greater than 50 become 0 and all
    other pixels become 1. This is the result of OpenCV's inverted binary
    threshold with threshold 50 and maximum value 1.

    The comparison is done with NumPy because the previous constant,
    ``cv2.cv.CV_THRESH_BINARY_INV``, lives in the legacy ``cv2.cv``
    namespace, which no longer exists in OpenCV 3 and later.

    Args:
        img: NumPy array of pixel intensities.

    Returns:
        A new ``uint8`` array of zeros and ones with the shape of ``img``.
    """
    cv_img = img.astype(np.uint8)
    return (cv_img <= 50).astype(np.uint8)
230227

228+
def binaryzation_features(trainset):
    """Binarise every image in ``trainset`` and return them as flat rows.

    Args:
        trainset: Iterable of images, each holding 784 pixel values.

    Returns:
        Array of shape ``(number of images, 784)`` with the output of
        ``binaryzation`` for each image.
    """
    # ``binaryzation`` casts to uint8 itself, so no separate cast is needed.
    features = [binaryzation(np.reshape(img, (28, 28))) for img in trainset]

    return np.reshape(np.array(features), (-1, 784))
246244

245+
if __name__ == '__main__':
    # Train and evaluate AdaBoost on the CSV data set, timing each stage.
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)

    print('Start read data')

    time_1 = time.time()

    raw_data = pd.read_csv('../data/train_binary.csv', header=0)
    data = raw_data.values

    imgs = data[:, 1:]  # every column after the first
    labels = data[:, 0]  # first column

    # Hold out half of the samples as the test set (test_size=0.5).
    features = binaryzation_features(imgs)
    train_features, test_features, train_labels, test_labels = train_test_split(
        features, labels, test_size=0.5, random_state=0)

    time_2 = time.time()
    print('read data cost ', time_2 - time_1, ' second', '\n')

    print('Start training')
    # Rescale labels with 2*x - 1 (presumably {0, 1} -> {-1, +1}). A list is
    # built because Python 3 ``map`` returns a one-shot iterator, which has
    # no len() and cannot be indexed by the training code.
    train_labels = [2 * label - 1 for label in train_labels]
    ada = AdaBoost()
    ada.train(train_features, train_labels)

    time_3 = time.time()
    print('training cost ', time_3 - time_2, ' second', '\n')

    print('Start predicting')
    test_predict = ada.predict(test_features)
    time_4 = time.time()
    print('predicting cost ', time_4 - time_3, ' second', '\n')

    # Same rescaling for the held-out labels; a concrete list is needed so
    # it can be compared element-wise with the predictions.
    test_labels = [2 * label - 1 for label in test_labels]
    score = accuracy_score(test_labels, test_predict)
    print("The accruacy socre is ", score)

0 commit comments

Comments
 (0)