1+ #encoding=utf-8
2+
3+ import pandas as pd
4+ import numpy as np
5+ import cv2
6+ import random
7+ import time
8+
9+ from sklearn .cross_validation import train_test_split
10+ from sklearn .metrics import accuracy_score
11+
12+
# Extract HOG features from the images with OpenCV.
def get_hog_features(trainset):
    """Compute one HOG descriptor per image and stack them into a 2-D array.

    Each item of ``trainset`` is reshaped to 28x28, cast to ``uint8`` and
    passed to a ``cv2.HOGDescriptor`` built from ``'../hog.xml'`` (the path
    is taken as written, so it depends on the working directory).

    Args:
        trainset: iterable of images, each reshapeable to (28, 28).

    Returns:
        numpy array reshaped to ``(-1, 324)``; this yields one row per image
        provided every descriptor holds 324 values (presumably what the
        settings in hog.xml produce -- confirm against that file).
    """
    descriptor = cv2.HOGDescriptor('../hog.xml')

    def describe(flat_pixels):
        # The descriptor is fed an 8-bit 28x28 image.
        image = np.reshape(flat_pixels, (28, 28)).astype(np.uint8)
        return descriptor.compute(image)

    stacked = np.array([describe(row) for row in trainset])
    return np.reshape(stacked, (-1, 324))
31+
def Predict(testset, trainset, train_labels, n_neighbors=None, class_total=10):
    """Classify each test vector by majority vote among its nearest training vectors.

    Distances are Euclidean (``np.linalg.norm`` of the difference). For every
    test vector the whole training set is scanned once while a list of the
    ``n_neighbors`` closest (distance, label) pairs seen so far is maintained.

    Args:
        testset: iterable of feature vectors to classify.
        trainset: indexable collection of training vectors, aligned with
            ``train_labels`` (``trainset[i]`` belongs to ``train_labels[i]``).
        train_labels: integer class labels usable as list indices, each in
            ``range(class_total)``.
        n_neighbors: number of neighbours that vote. ``None`` (the default)
            falls back to the module-level ``k`` so existing three-argument
            calls behave as before.
        class_total: number of distinct classes; sizes the vote table.

    Returns:
        numpy array with one predicted label per test vector. Ties in the
        vote go to the smallest label. If no neighbour was collected (for
        example an empty training set) every count is zero and label 0 is
        returned.
    """
    if n_neighbors is None:
        n_neighbors = k

    predict = []
    for count, test_vec in enumerate(testset):
        # Progress indicator. The single-argument parenthesised form prints
        # the same text under Python 2 and Python 3.
        print(count)

        knn_list = []
        for i, label in enumerate(train_labels):
            dist = np.linalg.norm(trainset[i] - test_vec)

            # Until enough neighbours are collected, every point is kept.
            if len(knn_list) < n_neighbors:
                knn_list.append((dist, label))
                continue

            # Find the farthest kept neighbour that is strictly farther than
            # the current point; the first such maximum wins on equal distances.
            max_index = -1
            max_dist = dist
            for j, (neighbor_dist, _) in enumerate(knn_list):
                if max_dist < neighbor_dist:
                    max_index = j
                    max_dist = neighbor_dist

            # max_index stays -1 when the current point is not closer than
            # any kept neighbour, in which case nothing is replaced.
            if max_index >= 0:
                knn_list[max_index] = (dist, label)

        # Tally the votes of the kept neighbours.
        class_count = [0] * class_total
        for _, label in knn_list:
            class_count[label] += 1

        # list.index returns the first maximum, i.e. the smallest winning label.
        predict.append(class_count.index(max(class_count)))

    return np.array(predict)
75+
# Number of nearest neighbours that vote in Predict, which reads this
# module-level name.
k = 10

if __name__ == '__main__':
    # Every print below passes one pre-formatted string to print(...), so the
    # output layout is the same under Python 2 and Python 3.

    print('Start read data')

    time_1 = time.time()

    raw_data = pd.read_csv('../data/train.csv', header=0)
    data = raw_data.values

    # Column 0 is used as the label; the remaining columns are the image
    # values that get_hog_features reshapes to 28x28.
    imgs = data[:, 1:]
    labels = data[:, 0]

    features = get_hog_features(imgs)

    # Hold out about one third of the samples (test_size=0.33) for testing;
    # the rest is the training set. The fixed random_state keeps the split
    # repeatable.
    train_features, test_features, train_labels, test_labels = train_test_split(
        features, labels, test_size=0.33, random_state=23323)

    time_2 = time.time()
    print('read data cost  %s  second \n ' % (time_2 - time_1))

    print('Start training')
    print('knn do not need to train')
    time_3 = time.time()
    print('training cost  %s  second \n ' % (time_3 - time_2))

    print('Start predicting')
    test_predict = Predict(test_features, train_features, train_labels)
    time_4 = time.time()
    print('predicting cost  %s  second \n ' % (time_4 - time_3))

    score = accuracy_score(test_labels, test_predict)
    print('The accuracy score is  %s' % score)
0 commit comments