
Commit bffeefe

add DDPG
1 parent 780dcd9 commit bffeefe

6 files changed: +314 -453 lines changed


Reinforcement_learning_TUT/7_Policy_gradient_softmax/run_CartPole.py

Lines changed: 0 additions & 6 deletions
@@ -40,12 +40,6 @@
 
         observation_, reward, done, info = env.step(action)
 
-        # x, x_dot, theta, theta_dot = observation_
-        # # the smaller theta and closer to center the better
-        # r1 = (env.x_threshold - abs(x))/env.x_threshold - 0.5
-        # r2 = (env.theta_threshold_radians - abs(theta))/env.theta_threshold_radians - 0.5
-        # reward = r1 + r2
-
         RL.store_transition(observation, action, reward)
 
         if done:

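The six deleted lines were a commented-out reward-shaping experiment for CartPole: instead of the environment's flat +1 per step, the reward is recomputed from how centred the cart is and how upright the pole is. A minimal sketch of that shaping as a standalone helper, assuming a gym CartPole-v0 env (which exposes x_threshold and theta_threshold_radians); the helper name is illustrative, not part of the repo:

def shaped_reward(env, observation_):
    # the smaller theta and the closer to centre, the better
    x, x_dot, theta, theta_dot = observation_
    r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.5                              # cart-position term
    r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5  # pole-angle term
    return r1 + r2  # would replace the default reward, e.g. reward = shaped_reward(env, observation_)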
Reinforcement_learning_TUT/8_Actor_Critic_Advantage/AC_CartPole.py

Lines changed: 15 additions & 7 deletions
@@ -74,7 +74,9 @@ def __init__(self, sess, n_features, lr=0.01):
             l1 = tf.layers.dense(
                 inputs=self.state,
                 units=20,  # number of hidden units
-                activation=tf.nn.relu,
+                activation=tf.nn.relu,  # None
+                # have to be linear to make sure the convergence of actor.
+                # But linear approximator seems hardly learns the correct Q.
                 kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
                 bias_initializer=tf.constant_initializer(0.1),  # biases
                 name='l1'
@@ -104,28 +106,34 @@ def update(self, s, r, s_):
         return td_error, loss
 
 
+
+
+# Superparameters
 OUTPUT_GRAPH = False
+MAX_EPISODE = 3000
 DISPLAY_REWARD_THRESHOLD = 200  # renders environment if total episode reward is greater then this threshold
-EPISODE_TIME_THRESHOLD = 1000
+MAX_EP_STEPS = 1000   # maximum time step in one episode
 RENDER = False  # rendering wastes time
-GAMMA = 0.9
+GAMMA = 0.9     # reward discount in TD error
+LR_A = 0.001    # learning rate for actor
+LR_C = 0.01     # learning rate for critic
 
 env = gym.make('CartPole-v0')
 env.seed(1)  # reproducible
 
 sess = tf.Session()
 
 with tf.variable_scope('Actor'):
-    actor = Actor(sess, n_features=env.observation_space.shape[0], n_actions=env.action_space.n, lr=0.001)
+    actor = Actor(sess, n_features=env.observation_space.shape[0], n_actions=env.action_space.n, lr=LR_A)
 with tf.variable_scope('Critic'):
-    critic = Critic(sess, n_features=env.observation_space.shape[0], lr=0.01)   # we need a good teacher, so the teacher should learn faster than the actor
+    critic = Critic(sess, n_features=env.observation_space.shape[0], lr=LR_C)   # we need a good teacher, so the teacher should learn faster than the actor
 
 sess.run(tf.global_variables_initializer())
 
 if OUTPUT_GRAPH:
     tf.summary.FileWriter("logs/", sess.graph)
 
-for i_episode in range(3000):
+for i_episode in range(MAX_EPISODE):
     s = env.reset()
     t = 0
     track_r = []
@@ -146,7 +154,7 @@ def update(self, s, r, s_):
         s = s_
         t += 1
 
-        if done or t >= EPISODE_TIME_THRESHOLD:
+        if done or t >= MAX_EP_STEPS:
             ep_rs_sum = sum(track_r)
 
             if 'running_reward' not in globals():

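For context on the new GAMMA, LR_A and LR_C constants: this script trains a one-step actor-critic, where the critic's TD error also serves as the actor's advantage. A sketch of the quantities those constants control, assuming the standard one-step actor-critic objective (which the update(s, r, s_) / td_error interface above suggests):

\delta_t = r_t + \gamma V(s_{t+1}) - V(s_t)
L_{\text{critic}} = \delta_t^{2}, \qquad L_{\text{actor}} = -\log \pi(a_t \mid s_t)\,\delta_t

Here \gamma is GAMMA = 0.9, the critic is stepped with LR_C = 0.01 and the actor with LR_A = 0.001, matching the inline comment that the "teacher" (critic) should learn faster than the actor.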
Reinforcement_learning_TUT/8_Actor_Critic_Advantage/AC_continue_Pendulum.py

Lines changed: 50 additions & 45 deletions
@@ -21,7 +21,8 @@
 
 
 class Actor(object):
-    def __init__(self, n_features, action_range, lr=0.0001):
+    def __init__(self, sess, n_features, action_range, lr=0.0001):
+        self.sess = sess
         with tf.name_scope('inputs'):
             self.state = tf.placeholder(tf.float32, [n_features, ], "state")
             state = tf.expand_dims(self.state, axis=0)
@@ -79,7 +80,8 @@ def choose_action(self, s):
 
 
 class Critic(object):
-    def __init__(self, n_features, lr=0.01):
+    def __init__(self, sess, n_features, lr=0.01):
+        self.sess = sess
         with tf.name_scope('inputs'):
             self.state = tf.placeholder(tf.float32, [n_features, ], "state")
             state = tf.expand_dims(self.state, axis=0)
@@ -118,53 +120,56 @@ def evaluate(self, s):
 
 
 OUTPUT_GRAPH = False
+MAX_EPISODE = 3000
 EPISODE_TIME_THRESHOLD = 300
 DISPLAY_REWARD_THRESHOLD = -550  # renders environment if total episode reward is greater then this threshold
 RENDER = False  # rendering wastes time
 GAMMA = 0.9
+LR_A = 0.001    # learning rate for actor
+LR_C = 0.01     # learning rate for critic
 
 env = gym.make('Pendulum-v0')
-# env.seed(1)  # reproducible
-
-actor = Actor(n_features=env.observation_space.shape[0], action_range=[env.action_space.low[0], env.action_space.high[0]], lr=0.001)
-critic = Critic(n_features=env.observation_space.shape[0], lr=0.002)
-
-with tf.Session() as sess:
-    if OUTPUT_GRAPH:
-        tf.summary.FileWriter("logs/", sess.graph)
-
-    actor.sess, critic.sess = sess, sess  # define the tf session
-    tf.global_variables_initializer().run()
-
-    for i_episode in range(3000):
-        observation = env.reset()
-        t = 0
-        ep_rs = []
-        while True:
-            # if RENDER:
-            env.render()
-            action, mu, sigma = actor.choose_action(observation)
-
-            observation_, reward, done, info = env.step(action)
-            reward /= 10
-            TD_target = reward + GAMMA * critic.evaluate(observation_)  # r + gamma * V_next
-            TD_eval = critic.evaluate(observation)    # V_now
-            TD_error = TD_target - TD_eval
-
-            actor.update(s=observation, a=action, adv=TD_error)
-            critic.update(s=observation, target=TD_target)
-
-            observation = observation_
-            t += 1
-            # print(reward)
-            ep_rs.append(reward)
-            if t > EPISODE_TIME_THRESHOLD:
-                ep_rs_sum = sum(ep_rs)
-                if 'running_reward' not in globals():
-                    running_reward = ep_rs_sum
-                else:
-                    running_reward = running_reward * 0.9 + ep_rs_sum * 0.1
-                if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True  # rendering
-                print("episode:", i_episode, " reward:", int(running_reward))
-                break
+env.seed(1)  # reproducible
+
+sess = tf.Session()
+
+actor = Actor(sess, n_features=env.observation_space.shape[0], action_range=[env.action_space.low[0], env.action_space.high[0]], lr=LR_A)
+critic = Critic(sess, n_features=env.observation_space.shape[0], lr=LR_C)
+
+sess.run(tf.global_variables_initializer())
+
+if OUTPUT_GRAPH:
+    tf.summary.FileWriter("logs/", sess.graph)
+
+for i_episode in range(MAX_EPISODE):
+    s = env.reset()
+    t = 0
+    ep_rs = []
+    while True:
+        # if RENDER:
+        env.render()
+        a, mu, sigma = actor.choose_action(s)
+
+        s_, r, done, info = env.step(a)
+        r /= 10
+        TD_target = r + GAMMA * critic.evaluate(s_)  # r + gamma * V_next
+        TD_eval = critic.evaluate(s)    # V_now
+        TD_error = TD_target - TD_eval
+
+        actor.update(s=s, a=a, adv=TD_error)
+        critic.update(s=s, target=TD_target)
+
+        s = s_
+        t += 1
+        # print(reward)
+        ep_rs.append(r)
+        if t > EPISODE_TIME_THRESHOLD:
+            ep_rs_sum = sum(ep_rs)
+            if 'running_reward' not in globals():
+                running_reward = ep_rs_sum
+            else:
+                running_reward = running_reward * 0.9 + ep_rs_sum * 0.1
+            if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True  # rendering
+            print("episode:", i_episode, " reward:", int(running_reward))
+            break

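The main structural change in AC_continue_Pendulum.py is that the shared tf.Session is now passed into the Actor and Critic constructors instead of being patched onto actor.sess / critic.sess after construction, and the with tf.Session() block around the training loop is gone. A minimal sketch of that session-injection pattern with a toy class (TF1-style API; the class and variable names below are illustrative, not from the repo):

import tensorflow as tf

class ValueNet(object):
    # Toy stand-in for the Actor/Critic classes: the graph is built in __init__,
    # and every op runs through the session injected by the caller.
    def __init__(self, sess, n_features, scope):
        self.sess = sess                             # shared session, injected
        with tf.variable_scope(scope):
            self.s = tf.placeholder(tf.float32, [1, n_features], "state")
            self.v = tf.layers.dense(self.s, 1)      # tiny value head, shapes are illustrative

    def evaluate(self, s):
        return self.sess.run(self.v, {self.s: s[None, :]})

sess = tf.Session()                                  # one session shared by every network
actor_like = ValueNet(sess, n_features=3, scope='Actor')
critic_like = ValueNet(sess, n_features=3, scope='Critic')
sess.run(tf.global_variables_initializer())          # initialise only after all graphs are built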