11"""
22Actor-Critic with continuous action using TD-error as the Advantage, Reinforcement Learning.
33
-The cart pole example (based on https://github.com/dennybritz/reinforcement-learning/blob/master/PolicyGradient/CliffWalk%20Actor%20Critic%20Solution.ipynb)
+The Pendulum example (based on https://github.com/dennybritz/reinforcement-learning/blob/master/PolicyGradient/Continuous%20MountainCar%20Actor%20Critic%20Solution.ipynb)

 Cannot converge!!!

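The docstring above names the key idea of this change: the one-step TD error stands in for the advantage when scaling the policy gradient. The short standalone sketch below (plain NumPy with made-up numbers, not part of the commit) spells out the quantities the Actor and Critic below exchange.

import numpy as np

gamma = 0.9                                    # same discount as GAMMA further down
r, v_s, v_s_next = -0.1, 1.2, 1.5              # reward and critic estimates V(s), V(s_)
td_error = r + gamma * v_s_next - v_s          # delta = r + gamma*V(s_) - V(s), the advantage proxy

mu, sigma, a = 0.3, 0.5, 0.1                   # Gaussian policy parameters and a sampled action
log_prob = -0.5 * np.log(2 * np.pi * sigma ** 2) - (a - mu) ** 2 / (2 * sigma ** 2)
exp_v = log_prob * td_error                    # the objective the Actor maximizes (entropy bonus omitted)
print(td_error, exp_v)

With these numbers the TD error is positive (0.05), so maximizing exp_v nudges the policy to make the sampled action more likely.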
@@ -21,18 +21,17 @@


 class Actor(object):
-    def __init__(self, sess, n_features, action_range, lr=0.0001):
+    def __init__(self, sess, n_features, action_bound, lr=0.0001):
         self.sess = sess
-        with tf.name_scope('inputs'):
-            self.state = tf.placeholder(tf.float32, [n_features, ], "state")
-            state = tf.expand_dims(self.state, axis=0)
-            self.act = tf.placeholder(tf.float32, name="act")
-            self.advantage = tf.placeholder(tf.float32, name="adv")    # TD_error
+
+        self.s = tf.placeholder(tf.float32, [1, n_features], "state")
+        self.a = tf.placeholder(tf.float32, None, name="act")
+        self.td_error = tf.placeholder(tf.float32, None, name="td_error")    # TD_error

         l1 = tf.layers.dense(
-            inputs=state,
+            inputs=self.s,
             units=30,    # number of hidden units
-            activation=None,
+            activation=tf.nn.relu,
             kernel_initializer=tf.random_normal_initializer(0., .1),    # weights
             bias_initializer=tf.constant_initializer(0.1),    # biases
             name='l1'
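One detail of the placeholder change in this hunk: the new Actor takes the state as a [1, n_features] batch built outside the graph with np.newaxis, instead of a flat [n_features] placeholder expanded with tf.expand_dims inside it. A minimal illustration (plain NumPy, a made-up 3-feature observation):

import numpy as np

s = np.array([0.1, -0.2, 0.05], dtype=np.float32)    # a raw observation of shape (n_features,)
print(s[np.newaxis, :].shape)                        # (1, 3), matching the placeholder shape [1, n_features]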
@@ -50,78 +49,83 @@ def __init__(self, sess, n_features, action_range, lr=0.0001):
         sigma = tf.layers.dense(
             inputs=l1,
             units=1,    # output units
-            activation=tf.nn.relu,    # get action probabilities
+            activation=tf.nn.sigmoid,    # squash the std of the Gaussian policy into (0, 1)
             kernel_initializer=tf.random_normal_initializer(0., .1),    # weights
             bias_initializer=tf.constant_initializer(1.),    # biases
             name='sigma'
         )
-
-        self.mu, self.sigma = tf.squeeze(mu*2), tf.squeeze(sigma+1e-2)
+        global_step = tf.Variable(0, trainable=False)
+        # self.e = epsilon = tf.train.exponential_decay(2., global_step, 1000, 0.9)
+        self.mu, self.sigma = tf.squeeze(mu*2), tf.squeeze(sigma+0.1)
         self.normal_dist = tf.contrib.distributions.Normal(self.mu, self.sigma)

-        self.action = tf.clip_by_value(self.normal_dist.sample(1), action_range[0], action_range[1])
+        self.action = tf.clip_by_value(self.normal_dist.sample(1), action_bound[0], action_bound[1])

-        with tf.name_scope('loss'):
-            neg_log_prob = -self.normal_dist.log_prob(self.act)    # loss without advantage
-            self.loss = neg_log_prob * self.advantage    # advantage (TD_error) guided loss
+        with tf.name_scope('exp_v'):
+            log_prob = self.normal_dist.log_prob(self.a)    # log probability of the chosen action
+            self.exp_v = log_prob * self.td_error    # advantage (TD_error) guided loss
             # Add cross entropy cost to encourage exploration
-            self.loss -= 1e-1 * self.normal_dist.entropy()
+            self.exp_v += self.normal_dist.entropy()

         with tf.name_scope('train'):
-            self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss)
+            self.train_op = tf.train.AdamOptimizer(lr).minimize(-self.exp_v, global_step)    # maximize exp_v by minimizing -exp_v

-    def update(self, s, a, adv):
-        feed_dict = {self.state: s, self.act: a, self.advantage: adv}
-        _, loss = self.sess.run([self.train_op, self.loss], feed_dict)
-        return loss
+    def learn(self, s, a, td):
+        s = s[np.newaxis, :]
+        feed_dict = {self.s: s, self.a: a, self.td_error: td}
+        _, exp_v = self.sess.run([self.train_op, self.exp_v], feed_dict)
+        return exp_v

     def choose_action(self, s):
-        return self.sess.run([self.action, self.mu, self.sigma], {self.state: s})    # get probabilities for all actions
+        s = s[np.newaxis, :]
+        return self.sess.run(self.action, {self.s: s})    # sample a clipped continuous action


 class Critic(object):
     def __init__(self, sess, n_features, lr=0.01):
         self.sess = sess
         with tf.name_scope('inputs'):
-            self.state = tf.placeholder(tf.float32, [n_features, ], "state")
-            state = tf.expand_dims(self.state, axis=0)
-            self.target = tf.placeholder(dtype=tf.float32, name="target")    # TD target=r+gamma*V_next
+            self.s = tf.placeholder(tf.float32, [1, n_features], "state")
+            self.v_ = tf.placeholder(tf.float32, [1, 1], name="v_next")
+            self.r = tf.placeholder(tf.float32, name='r')

         with tf.variable_scope('Critic'):
             l1 = tf.layers.dense(
-                inputs=state,
+                inputs=self.s,
                 units=30,    # number of hidden units
-                activation=None,
+                activation=tf.nn.relu,
                 kernel_initializer=tf.random_normal_initializer(0., .1),    # weights
                 bias_initializer=tf.constant_initializer(0.1),    # biases
                 name='l1'
             )

-            self.eval = tf.layers.dense(
+            self.v = tf.layers.dense(
                 inputs=l1,
                 units=1,    # output units
                 activation=None,
                 kernel_initializer=tf.random_normal_initializer(0., .1),    # weights
                 bias_initializer=tf.constant_initializer(0.1),    # biases
-                name='l2'
+                name='V'
             )

-        with tf.name_scope('loss'):
-            self.loss = tf.reduce_mean(tf.squared_difference(self.target, self.eval))    # TD_error = (r+gamma*V_next) - V_eval
-        with tf.name_scope('train'):
-            self.train_op = tf.train.RMSPropOptimizer(lr).minimize(self.loss)
+        with tf.variable_scope('squared_TD_error'):
+            self.td_error = tf.reduce_mean(self.r + GAMMA * self.v_ - self.v)
+            self.loss = tf.square(self.td_error)    # TD_error = (r+gamma*V_next) - V_eval
+        with tf.variable_scope('train'):
+            self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss)

-    def update(self, s, target):
-        _, loss = self.sess.run([self.train_op, self.loss], {self.state: s, self.target: target})
-        return loss
+    def learn(self, s, r, s_):
+        s, s_ = s[np.newaxis, :], s_[np.newaxis, :]

-    def evaluate(self, s):
-        return self.sess.run(self.eval, {self.state: s})[0, 0]    # return a float
+        v_ = self.sess.run(self.v, {self.s: s_})
+        td_error, _ = self.sess.run([self.td_error, self.train_op],
+                                    {self.s: s, self.v_: v_, self.r: r})
+        return td_error


 OUTPUT_GRAPH = False
 MAX_EPISODE = 3000
-EPISODE_TIME_THRESHOLD = 300
+MAX_EP_STEPS = 300
 DISPLAY_REWARD_THRESHOLD = -550    # renders environment if total episode reward is greater than this threshold
 RENDER = False    # rendering wastes time
 GAMMA = 0.9
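A note on the new Critic.learn above: V(s_) is computed in a first sess.run and then fed back through the v_ placeholder as a plain number, so minimizing the squared TD error only moves V(s) toward r + GAMMA * V(s_); no gradient flows into the bootstrapped target. A rough sketch of that target computation (plain NumPy, illustrative values only):

import numpy as np

GAMMA = 0.9
r = -0.05                                       # reward after scaling (r /= 10 in the loop below)
v_next = np.array([[1.4]], dtype=np.float32)    # V(s_) from the first sess.run, shape [1, 1]
td_target = r + GAMMA * v_next                  # constant target that V(s) is regressed toward
print(td_target)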
@@ -131,10 +135,13 @@ def evaluate(self, s):
 env = gym.make('Pendulum-v0')
 env.seed(1)    # reproducible

+N_S = env.observation_space.shape[0]
+A_BOUND = env.action_space.high
+
 sess = tf.Session()

-actor = Actor(sess, n_features=env.observation_space.shape[0], action_range=[env.action_space.low[0], env.action_space.high[0]], lr=LR_A)
-critic = Critic(sess, n_features=env.observation_space.shape[0], lr=LR_C)
+actor = Actor(sess, n_features=N_S, lr=LR_A, action_bound=[-A_BOUND, A_BOUND])
+critic = Critic(sess, n_features=N_S, lr=LR_C)

 sess.run(tf.global_variables_initializer())

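A quick sanity check of the bounds wired into action_bound above (assumes a gym version that still ships Pendulum-v0; not part of the commit): the environment has a 3-dimensional observation and a single torque action limited to [-2, 2], so A_BOUND is the array [2.] and the clip range becomes [-2., 2.].

import gym

env = gym.make('Pendulum-v0')
print(env.observation_space.shape[0])                  # 3 -> N_S
print(env.action_space.low, env.action_space.high)     # [-2.] [2.] -> [-A_BOUND, A_BOUND]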
@@ -148,22 +155,18 @@ def evaluate(self, s):
     while True:
         # if RENDER:
         env.render()
-        a, mu, sigma = actor.choose_action(s)
+        a = actor.choose_action(s)

         s_, r, done, info = env.step(a)
         r /= 10
-        TD_target = r + GAMMA * critic.evaluate(s_)    # r + gamma * V_next
-        TD_eval = critic.evaluate(s)    # V_now
-        TD_error = TD_target - TD_eval

-        actor.update(s=s, a=a, adv=TD_error)
-        critic.update(s=s, target=TD_target)
+        td_error = critic.learn(s, r, s_)    # gradient = grad[r + gamma * V(s_) - V(s)]
+        actor.learn(s, a, td_error)    # true_gradient = grad[logPi(s,a) * td_error]

         s = s_
         t += 1
-        # print(reward)
         ep_rs.append(r)
-        if t > EPISODE_TIME_THRESHOLD:
+        if t > MAX_EP_STEPS:
             ep_rs_sum = sum(ep_rs)
             if 'running_reward' not in globals():
                 running_reward = ep_rs_sum