1212import numpy as np
1313import pandas as pd
1414
15- from sklearn .cross_validation import train_test_split
15+ from sklearn .model_selection import train_test_split
1616from sklearn .metrics import accuracy_score
1717
1818sign_time_count = 0
1919
20+
2021class Sign (object ):
2122 '''
2223 阈值分类器
@@ -29,14 +30,14 @@ class Sign(object):
2930 因为是针对已经二值化后的MNIST数据集,所以v的取值只有3个 {0,1,2}
3031 '''
3132
32- def __init__ (self ,features ,labels ,w ):
33- self .X = features # 训练数据特征
34- self .Y = labels # 训练数据的标签
35- self .N = len (labels ) # 训练数据大小
33+ def __init__ (self , features , labels , w ):
34+ self .X = features # 训练数据特征
35+ self .Y = labels # 训练数据的标签
36+ self .N = len (labels ) # 训练数据大小
3637
37- self .w = w # 训练数据权值分布
38+ self .w = w # 训练数据权值分布
3839
39- self .indexes = [0 ,1 , 2 ] # 阈值轴可选范围
40+ self .indexes = [0 , 1 , 2 ] # 阈值轴可选范围
4041
4142 def _train_less_than_ (self ):
4243 '''
@@ -48,21 +49,19 @@ def _train_less_than_(self):
4849
4950 for i in self .indexes :
5051 score = 0
51- for j in xrange (self .N ):
52+ for j in range (self .N ):
5253 val = - 1
53- if self .X [j ]< i :
54+ if self .X [j ] < i :
5455 val = 1
5556
56- if val * self .Y [j ]< 0 :
57+ if val * self .Y [j ] < 0 :
5758 score += self .w [j ]
5859
5960 if score < error_score :
6061 index = i
6162 error_score = score
6263
63- return index ,error_score
64-
65-
64+ return index , error_score
6665
6766 def _train_more_than_ (self ):
6867 '''
@@ -74,27 +73,27 @@ def _train_more_than_(self):
7473
7574 for i in self .indexes :
7675 score = 0
77- for j in xrange (self .N ):
76+ for j in range (self .N ):
7877 val = 1
79- if self .X [j ]< i :
78+ if self .X [j ] < i :
8079 val = - 1
8180
82- if val * self .Y [j ]< 0 :
81+ if val * self .Y [j ] < 0 :
8382 score += self .w [j ]
8483
8584 if score < error_score :
8685 index = i
8786 error_score = score
8887
89- return index ,error_score
88+ return index , error_score
9089
9190 def train (self ):
9291 global sign_time_count
9392 time1 = time .time ()
94- less_index ,less_score = self ._train_less_than_ ()
95- more_index ,more_score = self ._train_more_than_ ()
93+ less_index , less_score = self ._train_less_than_ ()
94+ more_index , more_score = self ._train_more_than_ ()
9695 time2 = time .time ()
97- sign_time_count += time2 - time1
96+ sign_time_count += time2 - time1
9897
9998 if less_score < more_score :
10099 self .is_less = True
@@ -106,179 +105,179 @@ def train(self):
106105 self .index = more_index
107106 return more_score
108107
109- def predict (self ,feature ):
110- if self .is_less > 0 :
111- if feature < self .index :
108+ def predict (self , feature ):
109+ if self .is_less > 0 :
110+ if feature < self .index :
112111 return 1.0
113112 else :
114113 return - 1.0
115114 else :
116- if feature < self .index :
115+ if feature < self .index :
117116 return - 1.0
118117 else :
119118 return 1.0
120119
121120
class AdaBoost(object):
    """AdaBoost ensemble built from single-feature ``Sign`` threshold classifiers.

    Labels passed to :meth:`train` must be encoded as +1 / -1; predictions
    are returned in the same encoding.
    """

    def __init__(self, n_classifiers=10):
        """Create an untrained ensemble.

        Args:
            n_classifiers: number of boosting rounds (weak classifiers).
                Defaults to 10, the value that used to be hard-coded.
        """
        self.M = n_classifiers  # number of weak classifiers

    def _init_parameters_(self, features, labels):
        """Store the training set and reset weights and learned state."""
        self.X = features  # training features
        self.Y = labels  # training labels (+1 / -1)

        self.n = len(features[0])  # feature dimension
        self.N = len(features)  # training set size

        self.w = [1.0 / self.N] * self.N  # sample weight distribution (uniform start)
        self.alpha = []  # classifier coefficients (formula 8.2)
        self.classifier = []  # (feature index, classifier) pairs, one per round

    def _w_(self, index, classifier, i):
        """Return the un-normalised updated weight of sample ``i``.

        This is formula 8.4 without the division by Z_m; it uses the
        coefficient of the most recently added classifier.
        """
        margin = self.Y[i] * classifier.predict(self.X[i][index])
        return self.w[i] * math.exp(-self.alpha[-1] * margin)

    def _Z_(self, index, classifier):
        """Return the normalisation factor Z_m (formula 8.5)."""
        Z = 0
        for i in range(self.N):
            Z += self._w_(index, classifier, i)
        return Z

    def train(self, features, labels):
        """Fit ``self.M`` weak classifiers on ``features`` / ``labels``.

        Args:
            features: sequence of samples, each an indexable feature vector.
            labels: sequence of +1 / -1 labels, same length as ``features``.
        """
        global sign_time_count

        self._init_parameters_(features, labels)

        for times in range(self.M):
            logging.debug('iterater %d' % times)

            time1 = time.time()
            map_time = 0

            # (weighted error, feature index, classifier)
            best_classifier = (100000, None, None)
            for i in range(self.n):
                map_time -= time.time()
                # Materialise the column: Sign indexes it, and a lazy
                # Python 3 ``map`` object is not subscriptable.
                column = [x[i] for x in self.X]
                map_time += time.time()
                classifier = Sign(column, self.Y, self.w)
                error_score = classifier.train()

                if error_score < best_classifier[0]:
                    best_classifier = (error_score, i, classifier)

            em = best_classifier[0]

            # Profiling output, to be removed later -- begin
            print('em is %s, index is %d' % (str(em), best_classifier[1]))
            time2 = time.time()
            print('总运行时间:%s, 那两段关键代码运行时间:%s, map的时间是:%s' % (
                str(time2 - time1), str(sign_time_count), str(map_time)))
            sign_time_count = 0
            # Profiling output, to be removed later -- end

            if em == 0:
                # A perfect weak classifier would give log(inf); cap the coefficient.
                self.alpha.append(100)
            else:
                self.alpha.append(0.5 * math.log((1 - em) / em))

            self.classifier.append(best_classifier[1:])

            Z = self._Z_(best_classifier[1], best_classifier[2])

            # Update the sample weight distribution (formula 8.4).
            for i in range(self.N):
                self.w[i] = self._w_(best_classifier[1], best_classifier[2], i) / Z

    def _predict_(self, feature):
        """Return the ensemble vote (+1 or -1) for one feature vector."""
        result = 0.0
        for alpha, (index, classifier) in zip(self.alpha, self.classifier):
            result += alpha * classifier.predict(feature[index])

        if result > 0:
            return 1
        return -1

    def predict(self, features):
        """Return a list of +1 / -1 predictions, one per sample in ``features``."""
        return [self._predict_(feature) for feature in features]
221+
# Binarization
def binaryzation(img):
    """Return an inverse-binary copy of ``img`` as a ``uint8`` array.

    The input is first cast to ``uint8``; pixels greater than 50 become 0
    and all others become 1.  This reproduces
    ``cv2.threshold(src, 50, 1, THRESH_BINARY_INV)`` with plain NumPy, so it
    does not depend on the ``cv2.cv`` constants that no longer exist in
    OpenCV 3+.  The input array is not modified.
    """
    cv_img = img.astype(np.uint8)
    return np.where(cv_img > 50, 0, 1).astype(np.uint8)
230227
228+
def binaryzation_features(trainset):
    """Binarize every image in ``trainset`` and return them as flat rows.

    Each item of ``trainset`` is reshaped to 28x28, passed through
    ``binaryzation`` (which casts to ``uint8`` itself), and the results are
    stacked into an array of shape ``(-1, 784)``.
    """
    features = [binaryzation(np.reshape(img, (28, 28))) for img in trainset]
    return np.reshape(np.array(features), (-1, 784))
246244
245+
if __name__ == '__main__':
    # Script entry point: load the binary MNIST CSV, train AdaBoost on one
    # half of the data and report accuracy on the other half.
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)

    print('Start read data')

    time_1 = time.time()

    raw_data = pd.read_csv('../data/train_binary.csv', header=0)
    data = raw_data.values

    # Column 0 holds the label; the remaining columns are the pixels.
    imgs = data[:, 1:]
    labels = data[:, 0]

    # Use half of the data for training and half for testing (test_size=0.5).
    features = binaryzation_features(imgs)
    train_features, test_features, train_labels, test_labels = train_test_split(
        features, labels, test_size=0.5, random_state=0)

    time_2 = time.time()
    print('read data cost ', time_2 - time_1, ' second', '\n')

    print('Start training')
    # Map {0, 1} labels to {-1, +1}.  A list is required here: on Python 3
    # ``map`` is lazy and has no ``len``, which AdaBoost needs.
    train_labels = [2 * x - 1 for x in train_labels]
    ada = AdaBoost()
    ada.train(train_features, train_labels)

    time_3 = time.time()
    print('training cost ', time_3 - time_2, ' second', '\n')

    print('Start predicting')
    test_predict = ada.predict(test_features)
    time_4 = time.time()
    print('predicting cost ', time_4 - time_3, ' second', '\n')

    # Same {-1, +1} encoding for the ground truth, materialised for sklearn.
    test_labels = [2 * x - 1 for x in test_labels]
    score = accuracy_score(test_labels, test_predict)
    print("The accruacy socre is ", score)
0 commit comments