
import cv2
import time
-import math
+import logging
import numpy as np
import pandas as pd


from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score

+
total_class = 10

+def log(func):
+    def wrapper(*args,**kwargs):
+        start_time = time.time()
+        logging.debug('start %s()' % func.__name__)
+        ret = func(*args,**kwargs)
+
+        end_time = time.time()
+        logging.debug('end %s(), cost %s seconds' % (func.__name__,end_time - start_time))
+
+        return ret
+    return wrapper
+
+
# Binarization
def binaryzation(img):
    cv_img = img.astype(np.uint8)
    cv2.threshold(cv_img,50,1,cv2.cv.CV_THRESH_BINARY_INV,cv_img)
    return cv_img

+@log
def binaryzation_features(trainset):
    features = []

@@ -49,8 +64,6 @@ def predict(self,features):
        if self.node_type == 'leaf':
            return self.Class

-        print 'in'
-
        tree = self.dict[features[self.feature]]
        return tree.predict(features)

@@ -94,34 +107,22 @@ def calc_ent_grap(x,y):

    return ent_grap

-def train(train_set,train_label,features,epsilon):
+def recurse_train(train_set,train_label,features,epsilon):
    global total_class

    LEAF = 'leaf'
    INTERNAL = 'internal'

-
    # Step 1: if every instance in train_set belongs to the same class Ck
-    label_dict = [0 for i in xrange(total_class)]
-    for label in train_label:
-        label_dict[label] += 1
-
-    for label, label_count in enumerate(label_dict):
-        if label_count == len(train_label):
-            tree = Tree(LEAF,Class=label)
-            return tree
+    label_set = set(train_label)
+    if len(label_set) == 1:
+        return Tree(LEAF,Class=label_set.pop())

    # Step 2: if features is empty
-    max_len,max_class = 0,0
-    for i in xrange(total_class):
-        class_i = filter(lambda x:x == i,train_label)
-        if len(class_i) > max_len:
-            max_class = i
-            max_len = len(class_i)
+    (max_class,max_len) = max([(i,len(filter(lambda x:x == i,train_label))) for i in xrange(total_class)],key=lambda x:x[1])

    if len(features) == 0:
-        tree = Tree(LEAF,Class=max_class)
-        return tree
+        return Tree(LEAF,Class=max_class)

    # Step 3: compute the information gain of each remaining feature
    max_feature = 0
@@ -138,8 +139,7 @@ def train(train_set,train_label,features,epsilon):

    # Step 4: maximum information gain is below the threshold epsilon
    if max_gda < epsilon:
-        tree = Tree(LEAF,Class=max_class)
-        return tree
+        return Tree(LEAF,Class=max_class)

    # Step 5: build the non-empty subsets
    sub_features = filter(lambda x:x != max_feature,features)
@@ -157,11 +157,16 @@ def train(train_set,train_label,features,epsilon):
        sub_train_set = train_set[index]
        sub_train_label = train_label[index]

-        sub_tree = train(sub_train_set,sub_train_label,sub_features,epsilon)
+        sub_tree = recurse_train(sub_train_set,sub_train_label,sub_features,epsilon)
        tree.add_tree(feature_value,sub_tree)

    return tree

+@log
+def train(train_set,train_label,features,epsilon):
+    return recurse_train(train_set,train_label,features,epsilon)
+
+@log
def predict(test_set,tree):

    result = []
@@ -173,61 +178,25 @@ def predict(test_set,tree):


if __name__ == '__main__':
-    # classes = [0,0,1,1,0,0,0,1,1,1,1,1,1,1,0]
-    #
-    # age = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2]
-    # occupation = [0,0,1,1,0,0,0,1,0,0,0,0,1,1,0]
-    # house = [0,0,0,1,0,0,0,1,1,1,1,1,0,0,0]
-    # loan = [0,1,1,0,0,0,1,1,2,2,2,1,1,2,0]
-    #
-    # features = []
-    #
-    # for i in range(15):
-    #     feature = [age[i],occupation[i],house[i],loan[i]]
-    #     features.append(feature)
-    #
-    # trainset = np.array(features)
-    #
-    # tree = train(trainset,np.array(classes),[0,1,2,3],0.1)
-    #
-    # print type(tree)
-    # features = [0,0,0,1]
-    # print tree.predict(np.array(features))
-
-
-    print 'Start read data'
-
-    time_1 = time.time()
+    logger = logging.getLogger()
+    logger.setLevel(logging.DEBUG)

    raw_data = pd.read_csv('../data/train.csv',header=0)
    data = raw_data.values

    imgs = data[0::,1::]
    labels = data[::,0]

+    # Binarize the images
    features = binaryzation_features(imgs)

    # Use 2/3 of the data as the training set and 1/3 as the test set
    train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.33, random_state=23323)
-    # print train_features.shape
-    # print train_features.shape
-
-    time_2 = time.time()
-    print 'read data cost ',time_2 - time_1,' second','\n'

-    print 'Start training'
-    tree = train(train_features,train_labels,[i for i in range(784)],0.2)
-    print type(tree)
-    print 'knn do not need to train'
-    time_3 = time.time()
-    print 'training cost ',time_3 - time_2,' second','\n'
-
-    print 'Start predicting'
+    tree = train(train_features,train_labels,[i for i in range(784)],0.1)
    test_predict = predict(test_features,tree)
-    time_4 = time.time()
-    print 'predicting cost ',time_4 - time_3,' second','\n'
-
    score = accuracy_score(test_labels,test_predict)
+
    print "The accuracy score is ", score
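For context on the hunk that touches calc_ent_grap(x,y): the bodies of the entropy helpers fall outside the hunks shown above, so the sketch below is only an illustrative reimplementation of the standard ID3 quantities they are assumed to compute (empirical entropy and information gain). The name calc_ent is a hypothetical helper; only calc_ent_grap appears in the diff, and Step 3 would score one feature column at a time, e.g. calc_ent_grap(train_set[:,j], train_label).

import numpy as np

def calc_ent(y):
    # Empirical entropy H(Y) = -sum_k p_k * log2(p_k) over the label values in y.
    ent = 0.0
    for value in set(y):
        p = float(len(y[y == value])) / len(y)
        ent -= p * np.log2(p)
    return ent

def calc_ent_grap(x, y):
    # Information gain g(Y,X) = H(Y) - H(Y|X) for one feature column x.
    cond_ent = 0.0
    for value in set(x):
        sub_y = y[x == value]
        cond_ent += float(len(sub_y)) / len(y) * calc_ent(sub_y)
    return calc_ent(y) - cond_ent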