1+ #encoding=utf-8
2+
3+ import pandas as pd
4+ import numpy as np
5+ import cv2
6+ import random
7+ import time
8+
9+ from sklearn .cross_validation import train_test_split
10+ from sklearn .metrics import accuracy_score
11+
# Binarize the image
def binaryzation(img):
    """Convert a pixel array into an inverted 0/1 mask.

    The input is first cast to ``uint8`` (``astype`` returns a copy, so the
    caller's array is never modified). Every pixel greater than 50 then maps
    to 0 and every other pixel maps to 1, which is the rule OpenCV applies
    for ``THRESH_BINARY_INV`` with ``thresh=50`` and ``maxval=1``.

    Args:
        img: NumPy array of pixel intensities.

    Returns:
        A new ``uint8`` array with the same shape as ``img`` containing only
        0 and 1.
    """
    cv_img = img.astype(np.uint8)
    # Use a plain NumPy comparison rather than cv2.threshold: the legacy
    # ``cv2.cv`` constants namespace the old call relied on was removed in
    # OpenCV 3, which made this function raise AttributeError there.
    return (cv_img <= 50).astype(np.uint8)
17+
def Train(trainset, train_labels):
    """Build the naive Bayes tables from the training images.

    Relies on the module-level ``class_num`` and ``feature_len`` constants
    and on ``binaryzation`` to turn each image into 0/1 pixel values.

    Args:
        trainset: Indexable collection of images; ``trainset[i]`` is the
            pixel array belonging to ``train_labels[i]``.
        train_labels: Sequence of integer class labels used to index the
            tables.

    Returns:
        A ``(prior_probability, conditional_probability)`` tuple:

        * ``prior_probability`` has shape ``(class_num,)`` and holds the raw
          number of training samples seen per class (it is not normalised).
        * ``conditional_probability`` has shape
          ``(class_num, feature_len, 2)``; entry ``[c][j][v]`` is the
          fraction of class ``c`` samples whose pixel ``j`` equals ``v``,
          scaled to the range [1, 1000001].
    """
    prior_probability = np.zeros(class_num)  # per-class sample counts
    # Per-class, per-pixel counts of the two possible pixel values.
    conditional_probability = np.zeros((class_num, feature_len, 2))

    pixel_index = np.arange(feature_len)
    for i, label in enumerate(train_labels):
        img = binaryzation(trainset[i])

        prior_probability[label] += 1
        # One fancy-indexed increment replaces the per-pixel Python loop.
        # Every (pixel, value) pair addressed here is distinct, so no
        # increment is lost to duplicate indices.
        conditional_probability[label, pixel_index, img[:feature_len]] += 1

    # Turn counts into relative frequencies. A class that never appeared in
    # the training data has a zero total; its ratios are left at 0 instead
    # of dividing by zero (its prior count is 0 as well).
    totals = conditional_probability.sum(axis=2, keepdims=True)
    ratios = np.divide(
        conditional_probability,
        totals,
        out=np.zeros_like(conditional_probability),
        where=totals > 0,
    )

    # Scale the frequencies into [1, 1000001] so that no factor is zero.
    conditional_probability = ratios * 1000000 + 1

    return prior_probability, conditional_probability
48+
# Compute the probability
def calculate_probability(img, label, prior=None, conditional=None):
    """Return the unnormalised integer score of ``label`` for one image.

    The score is the class count multiplied by one scaled conditional value
    per pixel. Every factor is truncated to ``int`` and multiplied as a
    Python integer, which has arbitrary precision, so the long product is
    exact.

    Args:
        img: Sequence of binarized pixel values (0 or 1), used as indices
            into the last axis of the conditional table.
        label: Class index to score.
        prior: Optional per-class count table. When omitted, the
            module-level ``prior_probability`` is used.
        conditional: Optional table indexed as ``[label][pixel][value]``.
            When omitted, the module-level ``conditional_probability`` is
            used.

    Returns:
        The score as a Python ``int``.
    """
    # Fall back to the module-level tables so existing two-argument callers
    # keep working; passing the tables explicitly removes the dependency on
    # globals that only exist when the file runs as a script.
    if prior is None:
        prior = prior_probability
    if conditional is None:
        conditional = conditional_probability

    probability = int(prior[label])

    for position, pixel in enumerate(img):
        probability *= int(conditional[label][position][pixel])

    return probability
57+
def _class_score(img, label, prior_probability, conditional_probability):
    """Return the exact integer score of ``label`` for one binarized image."""
    score = int(prior_probability[label])
    for position, pixel in enumerate(img):
        score *= int(conditional_probability[label][position][pixel])
    return score


def Predict(testset, prior_probability, conditional_probability):
    """Predict a class label for every image in ``testset``.

    Each image is binarized with ``binaryzation`` and scored against every
    class using the tables passed in. The class with the highest score wins;
    on a tie the lowest class index is kept.

    Args:
        testset: Iterable of pixel arrays.
        prior_probability: Per-class count table, indexable by class; its
            length determines how many classes are scored.
        conditional_probability: Table indexed as ``[label][pixel][value]``
            holding the scaled conditional values.

    Returns:
        A NumPy array with one predicted class index per input image.
    """
    # Score with the tables the caller supplied (not module globals) and
    # derive the class count from them instead of hard-coding 10.
    class_count = len(prior_probability)
    predict = []

    for img in testset:
        img = binaryzation(img)

        max_label = 0
        max_probability = _class_score(
            img, 0, prior_probability, conditional_probability)

        for label in range(1, class_count):
            probability = _class_score(
                img, label, prior_probability, conditional_probability)

            # Strict comparison keeps the earliest class on equal scores.
            if max_probability < probability:
                max_label = label
                max_probability = probability

        predict.append(max_label)

    return np.array(predict)
79+
80+
# Number of distinct class labels; sizes the per-class tables built in Train.
class_num = 10
# Number of pixel features read from each image in Train
# (784 is presumably a flattened 28x28 image -- confirm against the dataset).
feature_len = 784
83+
if __name__ == '__main__':
    # NOTE: this script body deliberately stays at module scope.
    # calculate_probability reads prior_probability and
    # conditional_probability as module-level names, so they must be
    # assigned here rather than inside a function.
    #
    # Every print below is a single-argument call, which behaves the same
    # under Python 2 and Python 3; the emitted text (including spacing) is
    # unchanged from the original print statements.

    print('Start read data')

    time_1 = time.time()

    raw_data = pd.read_csv('../data/train.csv', header=0)
    data = raw_data.values

    # The first column holds the label; the remaining columns hold pixels.
    imgs = data[:, 1:]
    labels = data[:, 0]

    # Use roughly 2/3 of the data for training and 1/3 for testing.
    train_features, test_features, train_labels, test_labels = train_test_split(
        imgs, labels, test_size=0.33, random_state=23323)

    time_2 = time.time()
    print('read data cost  %s  second \n ' % (time_2 - time_1))

    print('Start training')
    prior_probability, conditional_probability = Train(train_features, train_labels)
    time_3 = time.time()
    print('training cost  %s  second \n ' % (time_3 - time_2))

    print('Start predicting')
    test_predict = Predict(test_features, prior_probability, conditional_probability)
    time_4 = time.time()
    print('predicting cost  %s  second \n ' % (time_4 - time_3))

    score = accuracy_score(test_labels, test_predict)
    print("The accruacy socre is  %s" % score)
0 commit comments