# @Date:   05-11-16
# @Last modified by:   WenDesi
# @Last modified time: 09-11-16
77
88
9+ import  pandas  as  pd 
10+ import  numpy  as  np 
11+ 
12+ import  time 
913import  math 
1014import  random 
1115
1216from  collections  import  defaultdict 
1317
18+ from  sklearn .cross_validation  import  train_test_split 
19+ from  sklearn .metrics  import  accuracy_score 
1420
1521
1622class  MaxEnt (object ):
@@ -21,9 +27,10 @@ def init_params(self, X, Y):
2127
2228        self .cal_Pxy_Px (X , Y )
2329
24-         self .N  =  len (X )
25-         self .n  =  len (self .Pxy )
26-         self .M  =  2.0 
30+         self .N  =  len (X )                 # 训练集大小 
31+         self .n  =  len (self .Pxy )          # 书中(x,y)对数 
32+         self .M  =  10000.0                 # 书91页那个M,但实际操作中并没有用那个值 
33+         # 可认为是学习速率 
2734
2835        self .build_dict ()
2936        self .cal_EPxy ()
@@ -49,29 +56,37 @@ def cal_Pxy_Px(self, X, Y):
4956                self .Px [x ] +=  1 
5057
def cal_EPxy(self):
    '''
    Compute the expectation shown at the bottom of page 82 of the book
    this file follows, and store it in ``self.EPxy``.

    For every feature id ``i`` in ``range(self.n)`` the stored value is
    ``self.Pxy[(x, y)] / self.N`` where ``(x, y) = self.id2xy[i]``.

    Reads ``self.n``, ``self.N``, ``self.id2xy`` and ``self.Pxy``.
    Replaces ``self.EPxy`` with a fresh ``defaultdict(float)``, so ids
    that were never filled read back as 0.0. Returns nothing.
    '''
    self.EPxy = defaultdict(float)
    total = float(self.N)
    # range() rather than xrange() so the method runs on Python 2 and 3.
    for pair_id in range(self.n):
        (x, y) = self.id2xy[pair_id]
        # NOTE(review): Pxy presumably holds raw (x, y) occurrence counts,
        # which makes this a relative frequency -- confirm in cal_Pxy_Px.
        self.EPxy[pair_id] = float(self.Pxy[(x, y)]) / total
def cal_pyx(self, X, y):
    '''
    Return the unnormalised score of label ``y`` for one sample ``X``.

    Sums ``self.w[id]`` over every element ``x`` of ``X`` for which
    ``self.fxy(x, y)`` is truthy, where ``id = self.xy2id[(x, y)]``,
    and exponentiates the sum.

    :param X: iterable of feature values of a single sample
    :param y: candidate label
    :return: tuple ``(math.exp(weight_sum), y)``; normalisation is left
             to the caller
    '''
    weight_sum = 0.0
    for x in X:
        # Pairs for which fxy is falsy contribute nothing; they are also
        # never looked up in xy2id.
        if self.fxy(x, y):
            weight_sum += self.w[self.xy2id[(x, y)]]
    return (math.exp(weight_sum), y)
6474
def cal_probality(self, X):
    '''
    Compute formula 6.22 (page 85 of the book this file follows) for one
    sample: the normalised score of every label in ``self.Y_``.

    :param X: iterable of feature values of a single sample
    :return: list of ``(probability, label)`` tuples, one per label and
             in the iteration order of ``self.Y_``; the probabilities
             are the ``self.cal_pyx`` scores divided by their sum
    '''
    scored = [self.cal_pyx(X, y) for y in self.Y_]
    # Z is the normalising constant: the sum of all unnormalised scores.
    Z = sum([score for score, _ in scored])
    return [(score / Z, label) for score, label in scored]
7082
7183    def  cal_EPx (self ):
84+         ''' 
85+         计算书83页最上面那个期望 
86+         ''' 
7287        self .EPx  =  [0.0  for  i  in  xrange (self .n )]
7388
74-         for  i ,X  in  enumerate (self .X_ ):
89+         for  i ,  X  in  enumerate (self .X_ ):
7590            Pyxs  =  self .cal_probality (X )
7691
7792            for  x  in  X :
@@ -98,8 +113,8 @@ def train(self, X, Y):
98113                sigma  =  1  /  self .M  *  math .log (self .EPxy [i ] /  self .EPx [i ])
99114                sigmas .append (sigma )
100115
101-             if  len (filter (lambda  x : abs (x ) >=  0.01 , sigmas )) ==  0 :
102-                 break 
116+             #  if len(filter(lambda x: abs(x) >= 0.01, sigmas)) == 0:
117+             #      break
103118
104119            self .w  =  [self .w [i ] +  sigmas [i ] for  i  in  xrange (self .n )]
105120
@@ -111,67 +126,53 @@ def predict(self, testset):
111126        return  results 
112127
113128
114- def  build_dataset (label ,original_posins ,radius ,size ):
115-     datasets  =  []
116-     dim  =  len (original_posins )
117- 
118-     for  i  in  xrange (size ):
119-         dataset  =  [label ]
120-         for  j  in  xrange (dim ):
121-             point  =  random .randint (0 ,2 * radius )- radius + original_posins [j ]
122-             dataset .append (point )
123-         datasets .append (dataset )
124- 
125-     return  datasets 
126- 
127- 
128- 
def rebuild_features(features):
    '''
    Tag every feature value with its position in the sample.

    Turns each sample ``(a0, a1, a2, ...)`` into
    ``['0_a0', '1_a1', '2_a2', ...]``, so equal values at different
    positions become distinct strings.

    :param features: iterable of samples, each an iterable of values
    :return: list of lists of ``'<index>_<value>'`` strings, in the same
             order as the input
    '''
    return [
        [str(index) + '_' + str(value) for index, value in enumerate(sample)]
        for sample in features
    ]
137141
138142
139- 
140- 
141- 
142- 
if __name__ == "__main__":
    # Script entry point: load the binary data set from CSV, train a
    # MaxEnt model on part of it, predict the rest and report accuracy,
    # printing the wall-clock time of each stage.
    #
    # Every print is a single-argument print(...) call so the script runs
    # on Python 2 and 3. The doubled spaces in the format strings
    # reproduce what the comma-separated Python 2 print statements emitted.

    print('Start read data')

    time_1 = time.time()

    raw_data = pd.read_csv('../data/train_binary.csv', header=0)
    data = raw_data.values

    # Column 0 is used as the label, every remaining column as a feature.
    imgs = data[0::, 1::]
    labels = data[::, 0]

    # Use about 2/3 of the data for training and 1/3 for testing.
    train_features, test_features, train_labels, test_labels = train_test_split(
        imgs, labels, test_size=0.33, random_state=23323)

    # Make feature values position-specific: (a0, a1, ...) -> (0_a0, 1_a1, ...)
    train_features = rebuild_features(train_features)
    test_features = rebuild_features(test_features)

    time_2 = time.time()
    print('read data cost  %s  second \n' % (time_2 - time_1))

    print('Start training')
    met = MaxEnt()
    met.train(train_features, train_labels)

    time_3 = time.time()
    print('training cost  %s  second \n' % (time_3 - time_2))

    print('Start predicting')
    test_predict = met.predict(test_features)
    time_4 = time.time()
    print('predicting cost  %s  second \n' % (time_4 - time_3))

    score = accuracy_score(test_labels, test_predict)
    print("The accruacy socre is  %s" % score)
0 commit comments