33# @Date: 05-11-16
4455# @Last modified by: WenDesi
6- # @Last modified time: 06 -11-16
6+ # @Last modified time: 09 -11-16
77
88
9+ import pandas as pd
10+ import numpy as np
11+
12+ import time
913import math
1014import random
1115
1216from collections import defaultdict
1317
18+ from sklearn .cross_validation import train_test_split
19+ from sklearn .metrics import accuracy_score
1420
1521
1622class MaxEnt (object ):
@@ -21,9 +27,10 @@ def init_params(self, X, Y):
2127
2228 self .cal_Pxy_Px (X , Y )
2329
24- self .N = len (X )
25- self .n = len (self .Pxy )
26- self .M = 2.0
30+ self .N = len (X ) # 训练集大小
31+ self .n = len (self .Pxy ) # 书中(x,y)对数
32+ self .M = 10000.0 # 书91页那个M,但实际操作中并没有用那个值
33+ # 可认为是学习速率
2734
2835 self .build_dict ()
2936 self .cal_EPxy ()
@@ -49,29 +56,37 @@ def cal_Pxy_Px(self, X, Y):
4956 self .Px [x ] += 1
5057
5158 def cal_EPxy (self ):
59+ '''
60+ 计算书中82页最下面那个期望
61+ '''
5262 self .EPxy = defaultdict (float )
5363 for id in xrange (self .n ):
5464 (x , y ) = self .id2xy [id ]
5565 self .EPxy [id ] = float (self .Pxy [(x , y )]) / float (self .N )
5666
57- def cal_pyx (self ,X , y ):
67+ def cal_pyx (self , X , y ):
5868 result = 0.0
5969 for x in X :
6070 if self .fxy (x , y ):
6171 id = self .xy2id [(x , y )]
6272 result += self .w [id ]
63- return (math .exp (result ),y )
73+ return (math .exp (result ), y )
6474
6575 def cal_probality (self , X ):
66- Pyxs = [(self .cal_pyx (X ,y )) for y in self .Y_ ]
76+ '''
77+ 计算书85页公式6.22
78+ '''
79+ Pyxs = [(self .cal_pyx (X , y )) for y in self .Y_ ]
6780 Z = sum ([prob for prob , y in Pyxs ])
68- return [(prob / Z ,y ) for prob ,y in Pyxs ]
69-
81+ return [(prob / Z , y ) for prob , y in Pyxs ]
7082
7183 def cal_EPx (self ):
84+ '''
85+ 计算书83页最上面那个期望
86+ '''
7287 self .EPx = [0.0 for i in xrange (self .n )]
7388
74- for i ,X in enumerate (self .X_ ):
89+ for i , X in enumerate (self .X_ ):
7590 Pyxs = self .cal_probality (X )
7691
7792 for x in X :
@@ -98,8 +113,8 @@ def train(self, X, Y):
98113 sigma = 1 / self .M * math .log (self .EPxy [i ] / self .EPx [i ])
99114 sigmas .append (sigma )
100115
101- if len (filter (lambda x : abs (x ) >= 0.01 , sigmas )) == 0 :
102- break
116+ # if len(filter(lambda x: abs(x) >= 0.01, sigmas)) == 0:
117+ # break
103118
104119 self .w = [self .w [i ] + sigmas [i ] for i in xrange (self .n )]
105120
@@ -111,67 +126,53 @@ def predict(self, testset):
111126 return results
112127
113128
def build_dataset(label, original_posins, radius, size):
    '''
    Build `size` random samples scattered around a centre point.

    Every sample is a list [label, p_0, p_1, ...] with one coordinate per
    entry of original_posins; p_j is original_posins[j] shifted by a random
    integer offset drawn from [-radius, radius] via random.randint.
    '''
    def jitter(center):
        # One randint call per coordinate, drawn in row-major order.
        return random.randint(0, 2 * radius) - radius + center

    return [[label] + [jitter(center) for center in original_posins]
            for _ in range(size)]
126-
127-
128-
def rebuild_features(features):
    '''
    Prefix every feature value with its position in the sample.

    A sample (a0, a1, a2, ...) becomes ['0_a0', '1_a1', '2_a2', ...], so
    equal values found in different positions turn into distinct features.
    Returns a new list of lists; the input is not modified.
    '''
    return [
        [str(position) + '_' + str(value)
         for position, value in enumerate(sample)]
        for sample in features
    ]
137141
138142
139-
140-
141-
142-
143143if __name__ == "__main__" :
144144
145- # 构建训练集
146- trainset1 = build_dataset (0 ,[0 ,0 ],10 ,100 )
147- trainset2 = build_dataset (1 ,[30 ,30 ],10 ,100 )
145+ print 'Start read data'
148146
149- trainset = trainset1
150- trainset .extend (trainset2 )
151- random .shuffle (trainset )
147+ time_1 = time .time ()
152148
153- trainset_features = rebuild_features ( map ( lambda x : x [ 1 :], trainset ) )
154- trainset_labels = map ( lambda x : x [ 0 ], trainset )
149+ raw_data = pd . read_csv ( '../data/train_binary.csv' , header = 0 )
150+ data = raw_data . values
155151
156- # 训练
157- met = MaxEnt ()
158- met .train (trainset_features ,trainset_labels )
152+ imgs = data [0 ::, 1 ::]
153+ labels = data [::, 0 ]
159154
160- # 构建测试集
161- testset1 = build_dataset ( 0 ,[ 0 , 0 ], 15 , 500 )
162- testset2 = build_dataset ( 1 ,[ 30 , 30 ], 15 , 500 )
155+ # 选取 2/3 数据作为训练集, 1/3 数据作为测试集
156+ train_features , test_features , train_labels , test_labels = train_test_split (
157+ imgs , labels , test_size = 0.33 , random_state = 23323 )
163158
164- testset = testset1
165- testset .extend (testset2 )
166- random .shuffle (testset )
159+ train_features = rebuild_features (train_features )
160+ test_features = rebuild_features (test_features )
167161
168- testset_features = rebuild_features ( map ( lambda x : x [ 1 :], testset ) )
169- testset_labels = map ( lambda x : x [ 0 ], testset )
162+ time_2 = time . time ( )
163+ print 'read data cost ' , time_2 - time_1 , ' second' , ' \n '
170164
171- # 测试
172- testset_predicts = met .predict (testset_features )
173- accuracy_score = float (len (filter (lambda x :x == True ,[testset_labels [i ]== testset_predicts [i ] for i in xrange (len (testset_predicts ))])))/ float (len (testset_predicts ))
174- print "The accruacy socre is " , accuracy_score
165+ print 'Start training'
166+ met = MaxEnt ()
167+ met .train (train_features , train_labels )
175168
169+ time_3 = time .time ()
170+ print 'training cost ' , time_3 - time_2 , ' second' , '\n '
176171
172+ print 'Start predicting'
173+ test_predict = met .predict (test_features )
174+ time_4 = time .time ()
175+ print 'predicting cost ' , time_4 - time_3 , ' second' , '\n '
177176
177+ score = accuracy_score (test_labels , test_predict )
178+ print "The accruacy socre is " , score
0 commit comments