From 7fd8eadbe656b0c23cc6e438c6d50611a8fb6457 Mon Sep 17 00:00:00 2001 From: Matthias Plappert Date: Thu, 19 Sep 2019 13:46:59 -0700 Subject: [PATCH] Separate reward and cost --- gym/envs/mujoco/ant.py | 6 ++++-- gym/envs/mujoco/half_cheetah.py | 6 +++--- gym/envs/mujoco/hopper.py | 5 +++-- gym/envs/mujoco/reacher.py | 6 +++--- gym/envs/mujoco/walker2d.py | 4 ++-- 5 files changed, 15 insertions(+), 12 deletions(-) diff --git a/gym/envs/mujoco/ant.py b/gym/envs/mujoco/ant.py index 550fb645a55..64c852dc192 100644 --- a/gym/envs/mujoco/ant.py +++ b/gym/envs/mujoco/ant.py @@ -12,7 +12,7 @@ def step(self, a): self.do_simulation(a, self.frame_skip) xposafter = self.get_body_com("torso")[0] forward_reward = (xposafter - xposbefore)/self.dt - ctrl_cost = .5 * np.square(a).sum() + ctrl_cost = .1 * np.square(a).sum() contact_cost = 0.5 * 1e-3 * np.sum( np.square(np.clip(self.sim.data.cfrc_ext, -1, 1))) survive_reward = 1.0 @@ -26,7 +26,9 @@ def step(self, a): reward_forward=forward_reward, reward_ctrl=-ctrl_cost, reward_contact=-contact_cost, - reward_survive=survive_reward) + reward_survive=survive_reward, + reward=forward_reward+survive_reward, + cost=ctrl_cost+contact_cost) def _get_obs(self): return np.concatenate([ diff --git a/gym/envs/mujoco/half_cheetah.py b/gym/envs/mujoco/half_cheetah.py index ea9761c5610..99e0fab9508 100644 --- a/gym/envs/mujoco/half_cheetah.py +++ b/gym/envs/mujoco/half_cheetah.py @@ -12,11 +12,11 @@ def step(self, action): self.do_simulation(action, self.frame_skip) xposafter = self.sim.data.qpos[0] ob = self._get_obs() - reward_ctrl = - 0.1 * np.square(action).sum() + cost_ctrl = 0.1 * np.square(action).sum() reward_run = (xposafter - xposbefore)/self.dt - reward = reward_ctrl + reward_run + reward = reward_run - cost_ctrl done = False - return ob, reward, done, dict(reward_run=reward_run, reward_ctrl=reward_ctrl) + return ob, reward, done, dict(reward=reward_run, cost=cost_ctrl) def _get_obs(self): return np.concatenate([ diff --git 
a/gym/envs/mujoco/hopper.py b/gym/envs/mujoco/hopper.py index be826a409af..c445586cdbe 100644 --- a/gym/envs/mujoco/hopper.py +++ b/gym/envs/mujoco/hopper.py @@ -14,12 +14,13 @@ def step(self, a): alive_bonus = 1.0 reward = (posafter - posbefore) / self.dt reward += alive_bonus - reward -= 1e-3 * np.square(a).sum() + cost_ctrl = 0.1 * np.square(a).sum() + #reward -= 1e-3 * np.square(a).sum() s = self.state_vector() done = not (np.isfinite(s).all() and (np.abs(s[2:]) < 100).all() and (height > .7) and (abs(ang) < .2)) ob = self._get_obs() - return ob, reward, done, {} + return ob, reward, done, dict(reward=reward, cost=cost_ctrl) def _get_obs(self): return np.concatenate([ diff --git a/gym/envs/mujoco/reacher.py b/gym/envs/mujoco/reacher.py index dad343c5bbf..fcb97f76001 100644 --- a/gym/envs/mujoco/reacher.py +++ b/gym/envs/mujoco/reacher.py @@ -10,12 +10,12 @@ def __init__(self): def step(self, a): vec = self.get_body_com("fingertip")-self.get_body_com("target") reward_dist = - np.linalg.norm(vec) - reward_ctrl = - np.square(a).sum() - reward = reward_dist + reward_ctrl + cost_ctrl = 0.1 * np.square(a).sum() + reward = reward_dist - cost_ctrl self.do_simulation(a, self.frame_skip) ob = self._get_obs() done = False - return ob, reward, done, dict(reward_dist=reward_dist, reward_ctrl=reward_ctrl) + return ob, reward, done, dict(reward=reward_dist, cost=cost_ctrl) def viewer_setup(self): self.viewer.cam.trackbodyid = 0 diff --git a/gym/envs/mujoco/walker2d.py b/gym/envs/mujoco/walker2d.py index 805f2dd591a..79735db9cc8 100644 --- a/gym/envs/mujoco/walker2d.py +++ b/gym/envs/mujoco/walker2d.py @@ -15,11 +15,11 @@ def step(self, a): alive_bonus = 1.0 reward = ((posafter - posbefore) / self.dt) reward += alive_bonus - reward -= 1e-3 * np.square(a).sum() + cost_ctrl = 0.1 * np.square(a).sum() done = not (height > 0.8 and height < 2.0 and ang > -1.0 and ang < 1.0) ob = self._get_obs() - return ob, reward, done, {} + return ob, reward, done, dict(reward=reward, 
cost=cost_ctrl) def _get_obs(self): qpos = self.sim.data.qpos