From 993df10fad75aa7be60f6e1c7a10568cb5720ec7 Mon Sep 17 00:00:00 2001
From: Fabian
Date: Tue, 12 Jul 2022 14:18:01 +0200
Subject: [PATCH] hopper throw seeding fixed

---
 .../alr/mujoco/hopper_throw/hopper_throw.py   | 21 +++++++++--------
 .../hopper_throw/hopper_throw_in_basket.py    | 23 ++++++++++---------
 2 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/alr_envs/alr/mujoco/hopper_throw/hopper_throw.py b/alr_envs/alr/mujoco/hopper_throw/hopper_throw.py
index 7ae33d1..c2503c4 100644
--- a/alr_envs/alr/mujoco/hopper_throw/hopper_throw.py
+++ b/alr_envs/alr/mujoco/hopper_throw/hopper_throw.py
@@ -27,7 +27,7 @@ class ALRHopperThrowEnv(HopperEnv):
                  healthy_z_range=(0.7, float('inf')),
                  healthy_angle_range=(-float('inf'), float('inf')),
                  reset_noise_scale=5e-3,
-                 context = True,
+                 context=True,
                  exclude_current_positions_from_observation=True,
                  max_episode_steps=250):
         xml_file = os.path.join(os.path.dirname(__file__), "assets", xml_file)
@@ -40,10 +40,10 @@ class ALRHopperThrowEnv(HopperEnv):
                          exclude_current_positions_from_observation)
 
     def step(self, action):
-
         self.current_step += 1
         self.do_simulation(action, self.frame_skip)
-        ball_pos_after = self.get_body_com("ball")[0] #abs(self.get_body_com("ball")[0]) # use x and y to get point and use euclid distance as reward?
+        ball_pos_after = self.get_body_com("ball")[
+            0]  # abs(self.get_body_com("ball")[0]) # use x and y to get point and use euclid distance as reward?
         ball_pos_after_y = self.get_body_com("ball")[2]
 
         # done = self.done TODO We should use this, not sure why there is no other termination; ball_landed should be enough, because we only look at the throw itself? - Paul and Marc
@@ -57,7 +57,7 @@ class ALRHopperThrowEnv(HopperEnv):
 
         if self.current_step >= self.max_episode_steps or done:
             distance_reward = -np.linalg.norm(ball_pos_after - self.goal) if self.context else \
-                              self._forward_reward_weight * ball_pos_after
+                self._forward_reward_weight * ball_pos_after
             healthy_reward = 0 if self.context else self.healthy_reward * self.current_step
 
             rewards = distance_reward + healthy_reward
@@ -67,8 +67,8 @@ class ALRHopperThrowEnv(HopperEnv):
         info = {
             'ball_pos': ball_pos_after,
             'ball_pos_y': ball_pos_after_y,
-            '_steps' : self.current_step,
-            'goal' : self.goal,
+            '_steps': self.current_step,
+            'goal': self.goal,
         }
 
         return observation, reward, done, info
@@ -78,7 +78,7 @@ class ALRHopperThrowEnv(HopperEnv):
 
     def reset(self):
         self.current_step = 0
-        self.goal = self.goal = np.random.uniform(2.0, 6.0, 1) # 0.5 8.0
+        self.goal = self.goal = self.np_random.uniform(2.0, 6.0, 1)  # 0.5 8.0
         return super().reset()
 
     # overwrite reset_model to make it deterministic
@@ -86,14 +86,15 @@ class ALRHopperThrowEnv(HopperEnv):
         noise_low = -self._reset_noise_scale
         noise_high = self._reset_noise_scale
 
-        qpos = self.init_qpos # + self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nq)
-        qvel = self.init_qvel # + self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nv)
+        qpos = self.init_qpos  # + self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nq)
+        qvel = self.init_qvel  # + self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nv)
 
         self.set_state(qpos, qvel)
 
         observation = self._get_obs()
         return observation
 
+
 if __name__ == '__main__':
     render_mode = "human"  # "human" or "partial" or "final"
     env = ALRHopperThrowEnv()
@@ -110,4 +111,4 @@ if __name__ == '__main__':
             print('After ', i, ' steps, done: ', d)
             env.reset()
 
-    env.close()
\ No newline at end of file
+    env.close()
diff --git a/alr_envs/alr/mujoco/hopper_throw/hopper_throw_in_basket.py b/alr_envs/alr/mujoco/hopper_throw/hopper_throw_in_basket.py
index 74a5b21..6827bf8 100644
--- a/alr_envs/alr/mujoco/hopper_throw/hopper_throw_in_basket.py
+++ b/alr_envs/alr/mujoco/hopper_throw/hopper_throw_in_basket.py
@@ -3,7 +3,6 @@
 from gym.envs.mujoco.hopper_v3 import HopperEnv
 import numpy as np
 
-
 MAX_EPISODE_STEPS_HOPPERTHROWINBASKET = 250
 
 
@@ -33,7 +32,7 @@ class ALRHopperThrowInBasketEnv(HopperEnv):
                  context=True,
                  penalty=0.0,
                  exclude_current_positions_from_observation=True,
-                 max_episode_steps = 250):
+                 max_episode_steps=250):
         self.hit_basket_reward = hit_basket_reward
         self.current_step = 0
         self.max_episode_steps = max_episode_steps
@@ -57,7 +56,8 @@ class ALRHopperThrowInBasketEnv(HopperEnv):
 
         basket_center = (basket_pos[0] + 0.5, basket_pos[1], basket_pos[2])
         is_in_basket_x = ball_pos[0] >= basket_pos[0] and ball_pos[0] <= basket_pos[0] + self.basket_size
-        is_in_basket_y = ball_pos[1] >= basket_pos[1] - (self.basket_size/2) and ball_pos[1] <= basket_pos[1] + (self.basket_size/2)
+        is_in_basket_y = ball_pos[1] >= basket_pos[1] - (self.basket_size / 2) and ball_pos[1] <= basket_pos[1] + (
+                self.basket_size / 2)
         is_in_basket_z = ball_pos[2] < 0.1
         is_in_basket = is_in_basket_x and is_in_basket_y and is_in_basket_z
         if is_in_basket: self.ball_in_basket = True
@@ -77,15 +77,16 @@ class ALRHopperThrowInBasketEnv(HopperEnv):
                 if not self.context:
                     rewards += self.hit_basket_reward
                 else:
-                    dist = np.linalg.norm(ball_pos-basket_center)
+                    dist = np.linalg.norm(ball_pos - basket_center)
                     if self.context:
                         rewards = -10 * dist
                     else:
-                        rewards -= (dist*dist)
+                        rewards -= (dist * dist)
             else:
                 # penalty not needed
-                rewards += ((action[:2] > 0) * self.penalty).sum() if self.current_step < 10 else 0 #too much of a penalty?
-
+                rewards += ((action[
+                             :2] > 0) * self.penalty).sum() if self.current_step < 10 else 0  # too much of a penalty?
+
         observation = self._get_obs()
         reward = rewards - costs
         info = {
@@ -106,7 +107,7 @@ class ALRHopperThrowInBasketEnv(HopperEnv):
         self.ball_in_basket = False
         if self.context:
             basket_id = self.sim.model.body_name2id("basket_ground")
-            self.basket_x = np.random.uniform(3, 7, 1)
+            self.basket_x = self.np_random.uniform(3, 7, 1)
             self.sim.model.body_pos[basket_id] = [self.basket_x, 0, 0]
         return super().reset()
 
@@ -115,8 +116,8 @@ class ALRHopperThrowInBasketEnv(HopperEnv):
         noise_low = -self._reset_noise_scale
         noise_high = self._reset_noise_scale
 
-        qpos = self.init_qpos # + self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nq)
-        qvel = self.init_qvel # + self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nv)
+        qpos = self.init_qpos  # + self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nq)
+        qvel = self.init_qvel  # + self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nv)
 
         self.set_state(qpos, qvel)
 
@@ -140,4 +141,4 @@ if __name__ == '__main__':
             print('After ', i, ' steps, done: ', d)
             env.reset()
 
-    env.close()
\ No newline at end of file
+    env.close()
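
The functional change in both files is the switch from the global np.random.uniform to the environment-local self.np_random.uniform, so the sampled goal (and basket position) follows the seed given to the environment. A minimal sketch, not part of the patch, of how the fix can be checked, assuming the pre-0.26 gym API where env.seed() re-seeds self.np_random and assuming the import path matches the file locations in the diff:

from alr_envs.alr.mujoco.hopper_throw.hopper_throw import ALRHopperThrowEnv

env = ALRHopperThrowEnv()

# Two resets with the same seed should now draw the same goal, because
# reset() samples it via self.np_random.uniform(2.0, 6.0, 1).
env.seed(42)
env.reset()
goal_a = env.goal.copy()

env.seed(42)
env.reset()
goal_b = env.goal.copy()

# Before the patch the goal came from the global np.random stream and
# would generally differ between the two draws despite the identical seed.
assert (goal_a == goal_b).all()
env.close()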