
Commit 2048750 (1 parent c0bc82e)
Commit message: edit

2 files changed: +2 lines, -177 lines

Reinforcement_learning_TUT/7_Policy_gradient_softmax/RL_brain.py

Lines changed: 2 additions & 0 deletions
@@ -73,6 +73,8 @@ def _build_net(self):
             neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=all_act, labels=self.tf_acts)   # this is negative log of chosen action
             # or in this way:
             # neg_log_prob = tf.reduce_sum(-tf.log(self.all_act_prob)*tf.one_hot(self.tf_acts, self.n_actions), axis=1)
+
+            # to maximize total reward (log_p * R) is to minimize -(log_p * R), and TensorFlow only has minimize(loss)
             loss = tf.reduce_mean(neg_log_prob * self.tf_vt)  # reward guided loss

         with tf.name_scope('train'):
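The two ways of computing `neg_log_prob` mentioned in the diff are mathematically the same thing: sparse softmax cross entropy with the chosen action as the label equals the negative log of the softmax probability of that action. A minimal NumPy sketch (not the repo's TensorFlow graph; the logits, actions, and returns below are made-up illustration values) showing the equivalence and the reward-guided loss:

```python
import numpy as np

def softmax(logits):
    # numerically stable softmax over the last axis
    z = logits - logits.max(axis=1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=1, keepdims=True)

all_act = np.array([[2.0, 0.5, -1.0],
                    [0.1, 1.2, 0.3]])   # logits: 2 timesteps, 3 actions (made up)
tf_acts = np.array([0, 2])              # action chosen at each timestep
tf_vt = np.array([1.5, -0.5])           # discounted, normalized returns

probs = softmax(all_act)

# Way 1: sparse softmax cross entropy = -log(prob of the chosen action)
neg_log_prob_1 = -np.log(probs[np.arange(len(tf_acts)), tf_acts])

# Way 2: the commented-out one-hot formulation from the diff
one_hot = np.eye(probs.shape[1])[tf_acts]
neg_log_prob_2 = np.sum(-np.log(probs) * one_hot, axis=1)

assert np.allclose(neg_log_prob_1, neg_log_prob_2)

# Reward-guided loss from the diff: minimizing mean(-log_p * R) maximizes
# mean(log_p * R), since optimizers only expose minimize(loss)
loss = np.mean(neg_log_prob_1 * tf_vt)
```

Way 1 is preferred in practice because it is computed from the logits in one numerically stable step, while Way 2 takes a log of already-softmaxed probabilities.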

Reinforcement_learning_TUT/8_Actor_Critic_Advantage/eligibility.py

Lines changed: 0 additions & 177 deletions
This file was deleted.
