
Commit bffeefe

add DDPG
1 parent 780dcd9 commit bffeefe

6 files changed: +314 -453 lines changed


Reinforcement_learning_TUT/7_Policy_gradient_softmax/run_CartPole.py

Lines changed: 0 additions & 6 deletions
@@ -40,12 +40,6 @@
 
         observation_, reward, done, info = env.step(action)
 
-        # x, x_dot, theta, theta_dot = observation_
-        # # the smaller theta and closer to center the better
-        # r1 = (env.x_threshold - abs(x))/env.x_threshold - 0.5
-        # r2 = (env.theta_threshold_radians - abs(theta))/env.theta_threshold_radians - 0.5
-        # reward = r1 + r2
-
         RL.store_transition(observation, action, reward)
 
         if done:

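The six deleted lines were a commented-out reward-shaping experiment for CartPole: instead of the environment's flat +1 per step, the reward is recomputed from how centred the cart is and how upright the pole is. A minimal sketch of that shaping as a standalone helper, assuming a gym CartPole-v0 env (which exposes x_threshold and theta_threshold_radians); the helper name is illustrative, not part of the repo:

def shaped_reward(env, observation_):
    # the smaller theta and the closer to centre, the better
    x, x_dot, theta, theta_dot = observation_
    r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.5                              # cart-position term
    r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5  # pole-angle term
    return r1 + r2  # would replace the default reward, e.g. reward = shaped_reward(env, observation_)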
Reinforcement_learning_TUT/8_Actor_Critic_Advantage/AC_CartPole.py

Lines changed: 15 additions & 7 deletions
@@ -74,7 +74,9 @@ def __init__(self, sess, n_features, lr=0.01):
             l1 = tf.layers.dense(
                 inputs=self.state,
                 units=20,  # number of hidden units
-                activation=tf.nn.relu,
+                activation=tf.nn.relu,  # None
+                # have to be linear to make sure the convergence of actor.
+                # But linear approximator seems hardly learns the correct Q.
                 kernel_initializer=tf.random_normal_initializer(0., .1),  # weights
                 bias_initializer=tf.constant_initializer(0.1),  # biases
                 name='l1'
@@ -104,28 +106,34 @@ def update(self, s, r, s_):
         return td_error, loss
 
 
+
+
+# Superparameters
 OUTPUT_GRAPH = False
+MAX_EPISODE = 3000
 DISPLAY_REWARD_THRESHOLD = 200  # renders environment if total episode reward is greater then this threshold
-EPISODE_TIME_THRESHOLD = 1000
+MAX_EP_STEPS = 1000   # maximum time step in one episode
 RENDER = False  # rendering wastes time
-GAMMA = 0.9
+GAMMA = 0.9     # reward discount in TD error
+LR_A = 0.001    # learning rate for actor
+LR_C = 0.01     # learning rate for critic
 
 env = gym.make('CartPole-v0')
 env.seed(1)  # reproducible
 
 sess = tf.Session()
 
 with tf.variable_scope('Actor'):
-    actor = Actor(sess, n_features=env.observation_space.shape[0], n_actions=env.action_space.n, lr=0.001)
+    actor = Actor(sess, n_features=env.observation_space.shape[0], n_actions=env.action_space.n, lr=LR_A)
 with tf.variable_scope('Critic'):
-    critic = Critic(sess, n_features=env.observation_space.shape[0], lr=0.01)   # we need a good teacher, so the teacher should learn faster than the actor
+    critic = Critic(sess, n_features=env.observation_space.shape[0], lr=LR_C)   # we need a good teacher, so the teacher should learn faster than the actor
 
 sess.run(tf.global_variables_initializer())
 
 if OUTPUT_GRAPH:
     tf.summary.FileWriter("logs/", sess.graph)
 
-for i_episode in range(3000):
+for i_episode in range(MAX_EPISODE):
     s = env.reset()
     t = 0
     track_r = []
@@ -146,7 +154,7 @@ def update(self, s, r, s_):
         s = s_
         t += 1
 
-        if done or t >= EPISODE_TIME_THRESHOLD:
+        if done or t >= MAX_EP_STEPS:
             ep_rs_sum = sum(track_r)
 
             if 'running_reward' not in globals():

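For context on the new GAMMA, LR_A and LR_C constants: this script trains a one-step actor-critic, where the critic's TD error also serves as the actor's advantage. A sketch of the quantities those constants control, assuming the standard one-step actor-critic objective (which the update(s, r, s_) / td_error interface above suggests):

\delta_t = r_t + \gamma V(s_{t+1}) - V(s_t)
L_{\text{critic}} = \delta_t^{2}, \qquad L_{\text{actor}} = -\log \pi(a_t \mid s_t)\,\delta_t

Here \gamma is GAMMA = 0.9, the critic is stepped with LR_C = 0.01 and the actor with LR_A = 0.001, matching the inline comment that the "teacher" (critic) should learn faster than the actor.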
Reinforcement_learning_TUT/8_Actor_Critic_Advantage/AC_continue_Pendulum.py

Lines changed: 50 additions & 45 deletions
@@ -21,7 +21,8 @@
 
 
 class Actor(object):
-    def __init__(self, n_features, action_range, lr=0.0001):
+    def __init__(self, sess, n_features, action_range, lr=0.0001):
+        self.sess = sess
         with tf.name_scope('inputs'):
             self.state = tf.placeholder(tf.float32, [n_features, ], "state")
             state = tf.expand_dims(self.state, axis=0)
@@ -79,7 +80,8 @@ def choose_action(self, s):
 
 
 class Critic(object):
-    def __init__(self, n_features, lr=0.01):
+    def __init__(self, sess, n_features, lr=0.01):
+        self.sess = sess
         with tf.name_scope('inputs'):
             self.state = tf.placeholder(tf.float32, [n_features, ], "state")
             state = tf.expand_dims(self.state, axis=0)
@@ -118,53 +120,56 @@ def evaluate(self, s):
 
 
 OUTPUT_GRAPH = False
+MAX_EPISODE = 3000
 EPISODE_TIME_THRESHOLD = 300
 DISPLAY_REWARD_THRESHOLD = -550  # renders environment if total episode reward is greater then this threshold
 RENDER = False  # rendering wastes time
 GAMMA = 0.9
+LR_A = 0.001    # learning rate for actor
+LR_C = 0.01     # learning rate for critic
 
 env = gym.make('Pendulum-v0')
-# env.seed(1)  # reproducible
-
-actor = Actor(n_features=env.observation_space.shape[0], action_range=[env.action_space.low[0], env.action_space.high[0]], lr=0.001)
-critic = Critic(n_features=env.observation_space.shape[0], lr=0.002)
-
-with tf.Session() as sess:
-    if OUTPUT_GRAPH:
-        tf.summary.FileWriter("logs/", sess.graph)
-
-    actor.sess, critic.sess = sess, sess  # define the tf session
-    tf.global_variables_initializer().run()
-
-    for i_episode in range(3000):
-        observation = env.reset()
-        t = 0
-        ep_rs = []
-        while True:
-            # if RENDER:
-            env.render()
-            action, mu, sigma = actor.choose_action(observation)
-
-            observation_, reward, done, info = env.step(action)
-            reward /= 10
-            TD_target = reward + GAMMA * critic.evaluate(observation_)  # r + gamma * V_next
-            TD_eval = critic.evaluate(observation)    # V_now
-            TD_error = TD_target - TD_eval
-
-            actor.update(s=observation, a=action, adv=TD_error)
-            critic.update(s=observation, target=TD_target)
-
-            observation = observation_
-            t += 1
-            # print(reward)
-            ep_rs.append(reward)
-            if t > EPISODE_TIME_THRESHOLD:
-                ep_rs_sum = sum(ep_rs)
-                if 'running_reward' not in globals():
-                    running_reward = ep_rs_sum
-                else:
-                    running_reward = running_reward * 0.9 + ep_rs_sum * 0.1
-                if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True  # rendering
-                print("episode:", i_episode, " reward:", int(running_reward))
-                break
+env.seed(1)  # reproducible
+
+sess = tf.Session()
+
+actor = Actor(sess, n_features=env.observation_space.shape[0], action_range=[env.action_space.low[0], env.action_space.high[0]], lr=LR_A)
+critic = Critic(sess, n_features=env.observation_space.shape[0], lr=LR_C)
+
+sess.run(tf.global_variables_initializer())
+
+if OUTPUT_GRAPH:
+    tf.summary.FileWriter("logs/", sess.graph)
+
+for i_episode in range(MAX_EPISODE):
+    s = env.reset()
+    t = 0
+    ep_rs = []
+    while True:
+        # if RENDER:
+        env.render()
+        a, mu, sigma = actor.choose_action(s)
+
+        s_, r, done, info = env.step(a)
+        r /= 10
+        TD_target = r + GAMMA * critic.evaluate(s_)  # r + gamma * V_next
+        TD_eval = critic.evaluate(s)    # V_now
+        TD_error = TD_target - TD_eval
+
+        actor.update(s=s, a=a, adv=TD_error)
+        critic.update(s=s, target=TD_target)
+
+        s = s_
+        t += 1
+        # print(reward)
+        ep_rs.append(r)
+        if t > EPISODE_TIME_THRESHOLD:
+            ep_rs_sum = sum(ep_rs)
+            if 'running_reward' not in globals():
+                running_reward = ep_rs_sum
+            else:
+                running_reward = running_reward * 0.9 + ep_rs_sum * 0.1
+            if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True  # rendering
+            print("episode:", i_episode, " reward:", int(running_reward))
+            break

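The main structural change in AC_continue_Pendulum.py is that the shared tf.Session is now passed into the Actor and Critic constructors instead of being patched onto actor.sess / critic.sess after construction, and the with tf.Session() block around the training loop is gone. A minimal sketch of that session-injection pattern with a toy class (TF1-style API; the class and variable names below are illustrative, not from the repo):

import tensorflow as tf

class ValueNet(object):
    # Toy stand-in for the Actor/Critic classes: the graph is built in __init__,
    # and every op runs through the session injected by the caller.
    def __init__(self, sess, n_features, scope):
        self.sess = sess                             # shared session, injected
        with tf.variable_scope(scope):
            self.s = tf.placeholder(tf.float32, [1, n_features], "state")
            self.v = tf.layers.dense(self.s, 1)      # tiny value head, shapes are illustrative

    def evaluate(self, s):
        return self.sess.run(self.v, {self.s: s[None, :]})

sess = tf.Session()                                  # one session shared by every network
actor_like = ValueNet(sess, n_features=3, scope='Actor')
critic_like = ValueNet(sess, n_features=3, scope='Critic')
sess.run(tf.global_variables_initializer())          # initialise only after all graphs are built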