1+ #encoding=utf-8 
2+ 
3+ import  pandas  as  pd 
4+ import  numpy  as  np 
5+ import  cv2 
6+ import  random 
7+ import  time 
8+ 
9+ from  sklearn .cross_validation  import  train_test_split 
10+ from  sklearn .metrics  import  accuracy_score 
11+ 
12+ 
13+ # 利用opencv获取图像hog特征 
def get_hog_features(trainset):
    """Compute an OpenCV HOG descriptor for every image in *trainset*.

    Args:
        trainset: iterable of flattened images. Each item is reshaped to
            28x28 and cast to ``uint8`` before being passed to OpenCV.

    Returns:
        A NumPy array reshaped to ``(-1, 324)`` holding the descriptors in
        input order.  (324 is presumably the descriptor length produced by
        the configuration stored in ``../hog.xml`` -- confirm against that
        file before changing either.)
    """
    # The descriptor settings are loaded from a file given by relative path.
    descriptor = cv2.HOGDescriptor('../hog.xml')

    per_image = [
        descriptor.compute(np.reshape(sample, (28, 28)).astype(np.uint8))
        for sample in trainset
    ]

    # Flatten whatever shape OpenCV returned into one 324-wide row per image.
    return np.reshape(np.array(per_image), (-1, 324))
31+ 
def Predict(testset, trainset, train_labels, num_neighbors=None):
    """Classify each test vector by majority vote of its nearest neighbours.

    For every vector in *testset* the function scans the whole training set,
    keeps the ``num_neighbors`` training vectors with the smallest
    ``np.linalg.norm`` distance, and predicts the label that occurs most often
    among them.

    Args:
        testset: iterable of feature vectors to classify.
        trainset: iterable of training feature vectors.
        train_labels: labels paired positionally with *trainset*.
        num_neighbors: how many neighbours vote.  When omitted, the
            module-level ``k`` is used, which is what callers that pass only
            three arguments have always relied on.

    Returns:
        ``np.ndarray`` with one predicted label per test vector, in input
        order.  When several labels share the highest vote count the smallest
        label wins.  If no neighbour could be collected (empty training set
        or a non-positive neighbour count) the prediction is ``0``.
    """
    if num_neighbors is None:
        # Resolved at call time so that the setting may be defined anywhere
        # in the module, including below this function.
        num_neighbors = k

    predict = []
    for count, test_vec in enumerate(testset):
        # Progress indicator.  The parenthesised single-argument form prints
        # the same text under both Python 2 and Python 3.
        print(count)

        # Unordered (distance, label) pairs for the closest vectors seen so far.
        knn_list = []
        for train_vec, label in zip(trainset, train_labels):
            dist = np.linalg.norm(train_vec - test_vec)

            if len(knn_list) < num_neighbors:
                # Still filling up: every candidate is accepted.
                knn_list.append((dist, label))
                continue

            # Locate the farthest kept neighbour, but only if it is strictly
            # farther away than the candidate; ties keep the existing entry.
            max_index = -1
            max_dist = dist
            for j, (kept_dist, _) in enumerate(knn_list):
                if max_dist < kept_dist:
                    max_index = j
                    max_dist = kept_dist

            if max_index >= 0:
                knn_list[max_index] = (dist, label)

        # Tally votes per label instead of in a fixed ten-slot table, so the
        # label values are not restricted to 0..9.
        votes = {}
        for _, label in knn_list:
            votes[label] = votes.get(label, 0) + 1

        if votes:
            top = max(votes.values())
            winner = min(label for label, n in votes.items() if n == top)
        else:
            winner = 0
        predict.append(winner)

    return np.array(predict)
75+ 
76+ k  =  10 
77+ 
78+ if  __name__  ==  '__main__' :
79+ 
80+     print  'Start read data' 
81+ 
82+     time_1  =  time .time ()
83+ 
84+     raw_data  =  pd .read_csv ('../data/train.csv' ,header = 0 )
85+     data  =  raw_data .values 
86+ 
87+     imgs  =  data [0 ::,1 ::]
88+     labels  =  data [::,0 ]
89+ 
90+     features  =  get_hog_features (imgs )
91+ 
92+     # 选取 2/3 数据作为训练集, 1/3 数据作为测试集 
93+     train_features , test_features , train_labels , test_labels  =  train_test_split (features , labels , test_size = 0.33 , random_state = 23323 )
94+     # print train_features.shape 
95+     # print train_features.shape 
96+ 
97+     time_2  =  time .time ()
98+     print  'read data cost ' ,time_2  -  time_1 ,' second' ,'\n ' 
99+ 
100+     print  'Start training' 
101+     print  'knn do not need to train' 
102+     time_3  =  time .time ()
103+     print  'training cost ' ,time_3  -  time_2 ,' second' ,'\n ' 
104+ 
105+     print  'Start predicting' 
106+     test_predict  =  Predict (test_features ,train_features ,train_labels )
107+     time_4  =  time .time ()
108+     print  'predicting cost ' ,time_4  -  time_3 ,' second' ,'\n ' 
109+ 
110+     score  =  accuracy_score (test_labels ,test_predict )
111+     print  "The accruacy socre is " , score 
0 commit comments