1+ #encoding=utf-8 
2+ 
3+ import  pandas  as  pd 
4+ import  numpy  as  np 
5+ import  cv2 
6+ import  random 
7+ import  time 
8+ 
9+ from  sklearn .cross_validation  import  train_test_split 
10+ from  sklearn .metrics  import  accuracy_score 
11+ 
# Binarization
def binaryzation(img):
    """Return an inverse-binarized uint8 copy of *img*.

    The input is first cast to ``uint8``; every pixel whose cast value is
    greater than 50 becomes 0 and every other pixel becomes 1.  This is the
    same mapping as OpenCV's inverse binary threshold with ``thresh=50`` and
    ``maxval=1``, written with NumPy so it does not depend on the legacy
    ``cv2.cv`` module (removed in OpenCV 3).

    Args:
        img: array-like of pixel intensities, any shape.

    Returns:
        A new ``numpy.ndarray`` of dtype ``uint8`` with the same shape as
        *img*, containing only 0 and 1.  The input is not modified.
    """
    cv_img = np.asarray(img).astype(np.uint8)
    # Inverse threshold: dark pixels (<= 50) map to 1, bright pixels to 0.
    return (cv_img <= 50).astype(np.uint8)
17+ 
def Train(trainset, train_labels):
    """Estimate the naive Bayes tables from binarized training images.

    Relies on the module-level ``class_num`` and ``feature_len`` for the
    table sizes and on ``binaryzation`` to reduce each image to 0/1 pixels.

    Args:
        trainset: sequence of images, each with ``feature_len`` pixels.
        train_labels: sequence of integer class labels in
            ``[0, class_num)``, aligned with *trainset*.

    Returns:
        A ``(prior_probability, conditional_probability)`` tuple:

        * ``prior_probability`` has shape ``(class_num,)`` and holds the raw
          number of training samples per class (an unnormalized prior).
        * ``conditional_probability`` has shape
          ``(class_num, feature_len, 2)``; entry ``[c, j, v]`` is the
          fraction of class-``c`` samples whose pixel ``j`` equals ``v``,
          rescaled to the range ``[1, 1000001]`` so that no factor is ever
          zero.
    """
    prior_probability = np.zeros(class_num)                        # per-class sample counts
    conditional_probability = np.zeros((class_num, feature_len, 2))  # per-class pixel-value counts

    feature_index = np.arange(feature_len)

    # Accumulate the class counts and the (class, pixel, value) counts.
    for raw_img, label in zip(trainset, train_labels):
        pixels = binaryzation(raw_img)
        prior_probability[label] += 1
        # Each (pixel position, pixel value) pair is unique within one image,
        # so a fancy-indexed in-place add increments every cell exactly once.
        conditional_probability[label, feature_index, pixels[:feature_len]] += 1

    # After binarization a pixel is either 0 or 1, so the two counts of a
    # (class, pixel) cell sum to the number of samples of that class.
    totals = conditional_probability.sum(axis=2, keepdims=True)

    # A class without any training sample has a zero total; dividing by 1
    # instead leaves its cells at the floor value 1 rather than raising.
    safe_totals = np.where(totals > 0, totals, 1.0)

    # Map the relative frequencies into [1, 1000001].
    conditional_probability = (conditional_probability / safe_totals) * 1000000 + 1

    return prior_probability, conditional_probability
48+ 
# Compute the (unnormalized) class score of one image
def calculate_probability(img, label, prior=None, conditional=None):
    """Return the unnormalized naive Bayes score of *img* for class *label*.

    The score is the integer-truncated prior of the class multiplied by the
    integer-truncated table entry of every pixel of *img*.

    Args:
        img: binarized image, a sequence of 0/1 pixel values.
        label: integer class index.
        prior: optional per-class prior table, indexable by *label*.  When
            omitted, the module-level ``prior_probability`` is used.
        conditional: optional table indexable as
            ``conditional[label][pixel_index][pixel_value]``.  When omitted,
            the module-level ``conditional_probability`` is used.

    Returns:
        The score as a Python integer (arbitrary precision, so the product
        of many large factors does not overflow).
    """
    # Fall back to the tables published at module level by the script
    # entry point, which is how existing two-argument callers use this.
    if prior is None:
        prior = prior_probability
    if conditional is None:
        conditional = conditional_probability

    probability = int(prior[label])

    class_table = conditional[label]
    for position, pixel in enumerate(img):
        probability *= int(class_table[position][pixel])

    return probability
57+ 
def Predict(testset, prior_probability, conditional_probability):
    """Predict a class label for every image in *testset*.

    Each image is binarized with ``binaryzation`` and scored against every
    class using the tables passed in as arguments.  The class with the
    highest score wins; on a tie the lowest class index is kept.

    Scores are compared in the log domain: the logarithm of the
    integer-truncated prior plus the sum of the logarithms of the
    integer-truncated per-pixel table entries.  This ranks classes the same
    way as multiplying the truncated factors (up to floating-point rounding
    on near-ties) without building very large integer products.

    Args:
        testset: iterable of images, each a sequence of pixel intensities.
        prior_probability: per-class prior table; its length determines the
            number of classes considered.
        conditional_probability: table of shape
            ``(n_classes, n_features, 2)`` indexed as
            ``[label, pixel_index, pixel_value]``.

    Returns:
        ``numpy.ndarray`` with one predicted class index per input image.
    """
    prior_table = np.floor(np.asarray(prior_probability, dtype=float))
    cond_table = np.floor(np.asarray(conditional_probability, dtype=float))

    # A zero factor makes the whole product zero; in the log domain that is
    # -inf, which is the intended "never wins" score, so silence the warning.
    with np.errstate(divide='ignore'):
        log_prior = np.log(prior_table)
        log_cond = np.log(cond_table)

    predict = []

    for raw_img in testset:
        # Binarize the image so each pixel selects the 0 or 1 table entry.
        pixels = binaryzation(raw_img)
        positions = np.arange(len(pixels))

        # Row c of the selection holds the log factors of class c.
        scores = log_prior + log_cond[:, positions, pixels].sum(axis=1)

        # argmax returns the first maximum, i.e. the lowest index on ties.
        predict.append(int(np.argmax(scores)))

    return np.array(predict)
79+ 
80+ 
class_num = 10       # number of classes the tables are sized for
feature_len = 784    # number of pixels (features) per image

if __name__ == '__main__':
    # Script entry point: load the data, train, predict, report accuracy.
    # The print calls take a single pre-formatted string so they produce
    # the same output under Python 2 and Python 3.

    print('Start read data')

    time_1 = time.time()

    raw_data = pd.read_csv('../data/train.csv', header=0)
    data = raw_data.values

    # Column 0 holds the label; the remaining columns hold the pixels.
    imgs = data[:, 1:]
    labels = data[:, 0]

    # Hold out roughly one third of the rows for testing; the fixed seed
    # keeps the split reproducible.
    train_features, test_features, train_labels, test_labels = train_test_split(
        imgs, labels, test_size=0.33, random_state=23323)

    time_2 = time.time()
    print('read data cost  %s  second \n ' % (time_2 - time_1))

    print('Start training')
    # These two names must stay module-level: calculate_probability falls
    # back to them when it is called without explicit tables.
    prior_probability, conditional_probability = Train(train_features, train_labels)
    time_3 = time.time()
    print('training cost  %s  second \n ' % (time_3 - time_2))

    print('Start predicting')
    test_predict = Predict(test_features, prior_probability, conditional_probability)
    time_4 = time.time()
    print('predicting cost  %s  second \n ' % (time_4 - time_3))

    score = accuracy_score(test_labels, test_predict)
    print("The accruacy socre is  %s" % score)
0 commit comments