From eb7dd3a18fc98f8007cfb22bc07f7ace0193031f Mon Sep 17 00:00:00 2001 From: Onur Date: Thu, 7 Apr 2022 18:49:44 +0200 Subject: [PATCH] fixing beerpong rewards --- alr_envs/alr/__init__.py | 28 +---- alr_envs/alr/mujoco/beerpong/beerpong.py | 18 +-- .../mujoco/beerpong/beerpong_reward_staged.py | 104 +++++++++++++++--- 3 files changed, 94 insertions(+), 56 deletions(-) diff --git a/alr_envs/alr/__init__.py b/alr_envs/alr/__init__.py index 931e3bb..7e7bca1 100644 --- a/alr_envs/alr/__init__.py +++ b/alr_envs/alr/__init__.py @@ -429,30 +429,4 @@ for _v, cd in enumerate(ctxt_dim): } } ) - ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append(_env_id) - -# register( -# id='TableTennisProMP-v2', -# entry_point='alr_envs.utils.make_env_helpers:make_promp_env_helper', -# kwargs={ -# "name": "alr_envs:TableTennis2DCtxt-v1", -# "wrappers": [mujoco.table_tennis.MPWrapper], -# "mp_kwargs": { -# "num_dof": 7, -# "num_basis": 2, -# "duration": 1.25, -# "post_traj_time": 4.5, -# #"width": 0.01, -# #"off": 0.01, -# "policy_type": "motor", -# "weights_scale": 1.0, -# "zero_start": True, -# "zero_goal": False, -# "policy_kwargs": { -# "p_gains": 0.5*np.array([1.0, 4.0, 2.0, 4.0, 1.0, 4.0, 1.0]), -# "d_gains": 0.5*np.array([0.1, 0.4, 0.2, 0.4, 0.1, 0.4, 0.1]) -# } -# } -# } -# ) -# ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append("TableTennisProMP-v2") + ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append(_env_id) \ No newline at end of file diff --git a/alr_envs/alr/mujoco/beerpong/beerpong.py b/alr_envs/alr/mujoco/beerpong/beerpong.py index 99d1a23..886b924 100644 --- a/alr_envs/alr/mujoco/beerpong/beerpong.py +++ b/alr_envs/alr/mujoco/beerpong/beerpong.py @@ -7,21 +7,16 @@ from gym.envs.mujoco import MujocoEnv from alr_envs.alr.mujoco.beerpong.beerpong_reward_staged import BeerPongReward -# CUP_POS_MIN = np.array([-0.32, -2.2]) -# CUP_POS_MAX = np.array([0.32, -1.2]) - -CUP_POS_MIN = np.array([-1.42, -4.05]) -CUP_POS_MAX = np.array([1.42, -1.25]) +CUP_POS_MIN = np.array([-0.32, -2.2]) +CUP_POS_MAX = np.array([0.32, -1.2]) class ALRBeerBongEnv(MujocoEnv, utils.EzPickle): def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None): - if cup_goal_pos is None: - cup_goal_pos = [-0.3, -1.2, 0.840] - elif len(cup_goal_pos)==2: - cup_goal_pos = np.array(cup_goal_pos) - cup_goal_pos = np.insert(cup_goal_pos, 2, 0.80) + cup_goal_pos = np.array(cup_goal_pos if cup_goal_pos is not None else [-0.3, -1.2, 0.840]) + if cup_goal_pos.shape[0]==2: + cup_goal_pos = np.insert(cup_goal_pos, 2, 0.840) self.cup_goal_pos = np.array(cup_goal_pos) self._steps = 0 @@ -52,7 +47,6 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle): else: self.noise_std = 0 - reward_function = BeerPongReward self.reward_function = reward_function() @@ -94,7 +88,7 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle): start_pos[7::] = self.sim.data.site_xpos[self.ball_site_id, :].copy() self.set_state(start_pos, init_vel) if self.rndm_goal: - xy = np.random.uniform(CUP_POS_MIN, CUP_POS_MAX) + xy = self.np_random.uniform(CUP_POS_MIN, CUP_POS_MAX) xyz = np.zeros(3) xyz[:2] = xy xyz[-1] = 0.840 diff --git a/alr_envs/alr/mujoco/beerpong/beerpong_reward_staged.py b/alr_envs/alr/mujoco/beerpong/beerpong_reward_staged.py index 40b181b..6edc7ee 100644 --- a/alr_envs/alr/mujoco/beerpong/beerpong_reward_staged.py +++ b/alr_envs/alr/mujoco/beerpong/beerpong_reward_staged.py @@ -54,6 +54,7 @@ class BeerPongReward: self.ball_table_contact = False self.ball_wall_contact = False self.ball_cup_contact = False + self.ball_in_cup = False self.noisy_bp = noisy self._t_min_final_dist = -1 @@ -80,39 +81,94 @@ class BeerPongReward: action_cost = np.sum(np.square(action)) self.action_costs.append(action_cost) + # ##################### Reward function which forces to bounce once on the table (tanh) ######################## + # if not self.ball_table_contact: + # self.ball_table_contact = self._check_collision_single_objects(env.sim, self.ball_collision_id, + # self.table_collision_id) - if not self.ball_table_contact: - self.ball_table_contact = self._check_collision_single_objects(env.sim, self.ball_collision_id, - self.table_collision_id) + # self._is_collided = self._check_collision_with_itself(env.sim, self.robot_collision_ids) + # if env._steps == env.ep_length - 1 or self._is_collided: + # min_dist = np.min(self.dists) + # final_dist = self.dists_final[-1] + # + # ball_in_cup = self._check_collision_single_objects(env.sim, self.ball_collision_id, + # self.cup_table_collision_id) + # + # # encourage bounce before falling into cup + # if not ball_in_cup: + # if not self.ball_table_contact: + # reward = 0.2 * (1 - np.tanh(0.5*min_dist)) + 0.1 * (1 - np.tanh(0.5*final_dist)) + # else: + # reward = (1 - np.tanh(0.5*min_dist)) + 0.5 * (1 - np.tanh(0.5*final_dist)) + # else: + # if not self.ball_table_contact: + # reward = 2 * (1 - np.tanh(0.5*final_dist)) + 1 * (1 - np.tanh(0.5*min_dist)) + 1 + # else: + # reward = 2 * (1 - np.tanh(0.5*final_dist)) + 1 * (1 - np.tanh(0.5*min_dist)) + 3 + # + # # reward = - 1 * cost - self.collision_penalty * int(self._is_collided) + # success = ball_in_cup + # crash = self._is_collided + # else: + # reward = - 1e-2 * action_cost + # success = False + # crash = False + # ################################################################################################################ + # ##################### Reward function which does not force to bounce once on the table (tanh) ################ + # self._check_contacts(env.sim) + # self._is_collided = self._check_collision_with_itself(env.sim, self.robot_collision_ids) + # if env._steps == env.ep_length - 1 or self._is_collided: + # min_dist = np.min(self.dists) + # final_dist = self.dists_final[-1] + # + # # encourage bounce before falling into cup + # if not self.ball_in_cup: + # if not self.ball_table_contact and not self.ball_cup_contact and not self.ball_wall_contact: + # min_dist_coeff, final_dist_coeff, rew_offset = 0.2, 0.1, 0 + # # reward = 0.2 * (1 - np.tanh(0.5*min_dist)) + 0.1 * (1 - np.tanh(0.5*final_dist)) + # else: + # min_dist_coeff, final_dist_coeff, rew_offset = 1, 0.5, 0 + # # reward = (1 - np.tanh(0.5*min_dist)) + 0.5 * (1 - np.tanh(0.5*final_dist)) + # else: + # min_dist_coeff, final_dist_coeff, rew_offset = 1, 2, 3 + # # reward = 2 * (1 - np.tanh(0.5*final_dist)) + 1 * (1 - np.tanh(0.5*min_dist)) + 3 + # + # reward = final_dist_coeff * (1 - np.tanh(0.5 * final_dist)) + min_dist_coeff * (1 - np.tanh(0.5 * min_dist)) \ + # + rew_offset + # success = self.ball_in_cup + # crash = self._is_collided + # else: + # reward = - 1e-2 * action_cost + # success = False + # crash = False + # ################################################################################################################ + + # ##################### Reward function which does not force to bounce once on the table (quad dist) ############ + self._check_contacts(env.sim) self._is_collided = self._check_collision_with_itself(env.sim, self.robot_collision_ids) if env._steps == env.ep_length - 1 or self._is_collided: - min_dist = np.min(self.dists) final_dist = self.dists_final[-1] - ball_in_cup = self._check_collision_single_objects(env.sim, self.ball_collision_id, - self.cup_table_collision_id) - # encourage bounce before falling into cup - if not ball_in_cup: - if not self.ball_table_contact: - reward = 0.2 * (1 - np.tanh(0.5*min_dist)) + 0.1 * (1 - np.tanh(0.5*final_dist)) + if not self.ball_in_cup: + if not self.ball_table_contact and not self.ball_cup_contact and not self.ball_wall_contact: + min_dist_coeff, final_dist_coeff, rew_offset = 1, 0.5, -4 else: - reward = (1 - np.tanh(0.5*min_dist)) + 0.5 * (1 - np.tanh(0.5*final_dist)) + min_dist_coeff, final_dist_coeff, rew_offset = 1, 0.5, -2 else: - if not self.ball_table_contact: - reward = 2 * (1 - np.tanh(0.5*final_dist)) + 1 * (1 - np.tanh(0.5*min_dist)) + 1 - else: - reward = 2 * (1 - np.tanh(0.5*final_dist)) + 1 * (1 - np.tanh(0.5*min_dist)) + 3 + min_dist_coeff, final_dist_coeff, rew_offset = 0, 1, 0 - # reward = - 1 * cost - self.collision_penalty * int(self._is_collided) - success = ball_in_cup + reward = rew_offset - min_dist_coeff * min_dist**2 - final_dist_coeff * final_dist**2 - \ + 1e-4*np.mean(action_cost) + success = self.ball_in_cup crash = self._is_collided else: reward = - 1e-2 * action_cost success = False crash = False + # ################################################################################################################ infos = {} infos["success"] = success @@ -124,6 +180,20 @@ class BeerPongReward: return reward, infos + def _check_contacts(self, sim): + if not self.ball_table_contact: + self.ball_table_contact = self._check_collision_single_objects(sim, self.ball_collision_id, + self.table_collision_id) + if not self.ball_cup_contact: + self.ball_cup_contact = self._check_collision_with_set_of_objects(sim, self.ball_collision_id, + self.cup_collision_ids) + if not self.ball_wall_contact: + self.ball_wall_contact = self._check_collision_single_objects(sim, self.ball_collision_id, + self.wall_collision_id) + if not self.ball_in_cup: + self.ball_in_cup = self._check_collision_single_objects(sim, self.ball_collision_id, + self.cup_table_collision_id) + def _check_collision_single_objects(self, sim, id_1, id_2): for coni in range(0, sim.data.ncon): con = sim.data.contact[coni]