From 7ffe94dcfd68d420a5af1da449cb82e0116e1dbd Mon Sep 17 00:00:00 2001 From: Onur Date: Fri, 8 Apr 2022 17:32:53 +0200 Subject: [PATCH] working bp version, tested with CMORE on a smaller context with 1 seed --- alr_envs/alr/__init__.py | 1 + alr_envs/alr/mujoco/beerpong/beerpong.py | 7 ++++++- .../alr/mujoco/beerpong/beerpong_reward_staged.py | 14 +++++++------- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/alr_envs/alr/__init__.py b/alr_envs/alr/__init__.py index 7e7bca1..38abbf1 100644 --- a/alr_envs/alr/__init__.py +++ b/alr_envs/alr/__init__.py @@ -391,6 +391,7 @@ for _v in _versions: "duration": 1, "post_traj_time": 2, "policy_type": "motor", + # "weights_scale": 0.15, "weights_scale": 1, "zero_start": True, "zero_goal": False, diff --git a/alr_envs/alr/mujoco/beerpong/beerpong.py b/alr_envs/alr/mujoco/beerpong/beerpong.py index 886b924..d885e78 100644 --- a/alr_envs/alr/mujoco/beerpong/beerpong.py +++ b/alr_envs/alr/mujoco/beerpong/beerpong.py @@ -10,6 +10,10 @@ from alr_envs.alr.mujoco.beerpong.beerpong_reward_staged import BeerPongReward CUP_POS_MIN = np.array([-0.32, -2.2]) CUP_POS_MAX = np.array([0.32, -1.2]) +# smaller context space -> Easier task +# CUP_POS_MIN = np.array([-0.16, -2.2]) +# CUP_POS_MAX = np.array([0.16, -1.7]) + class ALRBeerBongEnv(MujocoEnv, utils.EzPickle): def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, @@ -36,7 +40,8 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle): self.ball_site_id = 0 self.ball_id = 11 - self._release_step = 175 # time step of ball release + # self._release_step = 175 # time step of ball release + self._release_step = 130 # time step of ball release self.sim_time = 3 # seconds self.ep_length = 600 # based on 3 seconds with dt = 0.005 int(self.sim_time / self.dt) diff --git a/alr_envs/alr/mujoco/beerpong/beerpong_reward_staged.py b/alr_envs/alr/mujoco/beerpong/beerpong_reward_staged.py index 6edc7ee..c9ed451 100644 --- a/alr_envs/alr/mujoco/beerpong/beerpong_reward_staged.py +++ b/alr_envs/alr/mujoco/beerpong/beerpong_reward_staged.py @@ -85,7 +85,7 @@ class BeerPongReward: # if not self.ball_table_contact: # self.ball_table_contact = self._check_collision_single_objects(env.sim, self.ball_collision_id, # self.table_collision_id) - + # # self._is_collided = self._check_collision_with_itself(env.sim, self.robot_collision_ids) # if env._steps == env.ep_length - 1 or self._is_collided: # min_dist = np.min(self.dists) @@ -115,7 +115,7 @@ class BeerPongReward: # crash = False # ################################################################################################################ - # ##################### Reward function which does not force to bounce once on the table (tanh) ################ + ##################### Reward function which does not force to bounce once on the table (tanh) ################ # self._check_contacts(env.sim) # self._is_collided = self._check_collision_with_itself(env.sim, self.robot_collision_ids) # if env._steps == env.ep_length - 1 or self._is_collided: @@ -142,9 +142,9 @@ class BeerPongReward: # reward = - 1e-2 * action_cost # success = False # crash = False - # ################################################################################################################ + ################################################################################################################ - # ##################### Reward function which does not force to bounce once on the table (quad dist) ############ + # # ##################### Reward function which does not force to bounce once on the table (quad dist) ############ self._check_contacts(env.sim) self._is_collided = self._check_collision_with_itself(env.sim, self.robot_collision_ids) if env._steps == env.ep_length - 1 or self._is_collided: @@ -162,12 +162,12 @@ class BeerPongReward: reward = rew_offset - min_dist_coeff * min_dist**2 - final_dist_coeff * final_dist**2 - \ 1e-4*np.mean(action_cost) + # 1e-7*np.mean(action_cost) success = self.ball_in_cup - crash = self._is_collided else: - reward = - 1e-2 * action_cost + # reward = - 1e-2 * action_cost + reward = - 1e-4 * action_cost success = False - crash = False # ################################################################################################################ infos = {}