fixing beerpong rewards

Onur 2022-04-07 18:49:44 +02:00
parent 04b6b314cf
commit eb7dd3a18f
3 changed files with 94 additions and 56 deletions

View File

@@ -430,29 +430,3 @@ for _v, cd in enumerate(ctxt_dim):
         }
     )
     ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append(_env_id)
-
-# register(
-#     id='TableTennisProMP-v2',
-#     entry_point='alr_envs.utils.make_env_helpers:make_promp_env_helper',
-#     kwargs={
-#         "name": "alr_envs:TableTennis2DCtxt-v1",
-#         "wrappers": [mujoco.table_tennis.MPWrapper],
-#         "mp_kwargs": {
-#             "num_dof": 7,
-#             "num_basis": 2,
-#             "duration": 1.25,
-#             "post_traj_time": 4.5,
-#             #"width": 0.01,
-#             #"off": 0.01,
-#             "policy_type": "motor",
-#             "weights_scale": 1.0,
-#             "zero_start": True,
-#             "zero_goal": False,
-#             "policy_kwargs": {
-#                 "p_gains": 0.5*np.array([1.0, 4.0, 2.0, 4.0, 1.0, 4.0, 1.0]),
-#                 "d_gains": 0.5*np.array([0.1, 0.4, 0.2, 0.4, 0.1, 0.4, 0.1])
-#             }
-#         }
-#     }
-# )
-# ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append("TableTennisProMP-v2")

View File

@@ -7,21 +7,16 @@ from gym.envs.mujoco import MujocoEnv
 from alr_envs.alr.mujoco.beerpong.beerpong_reward_staged import BeerPongReward
 
-# CUP_POS_MIN = np.array([-0.32, -2.2])
-# CUP_POS_MAX = np.array([0.32, -1.2])
-CUP_POS_MIN = np.array([-1.42, -4.05])
-CUP_POS_MAX = np.array([1.42, -1.25])
+CUP_POS_MIN = np.array([-0.32, -2.2])
+CUP_POS_MAX = np.array([0.32, -1.2])
 
 
 class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
     def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False,
                  rndm_goal=False, cup_goal_pos=None):
-        if cup_goal_pos is None:
-            cup_goal_pos = [-0.3, -1.2, 0.840]
-        elif len(cup_goal_pos)==2:
-            cup_goal_pos = np.array(cup_goal_pos)
-            cup_goal_pos = np.insert(cup_goal_pos, 2, 0.80)
+        cup_goal_pos = np.array(cup_goal_pos if cup_goal_pos is not None else [-0.3, -1.2, 0.840])
+        if cup_goal_pos.shape[0]==2:
+            cup_goal_pos = np.insert(cup_goal_pos, 2, 0.840)
         self.cup_goal_pos = np.array(cup_goal_pos)
 
         self._steps = 0
@@ -52,7 +47,6 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
         else:
             self.noise_std = 0
-
         reward_function = BeerPongReward
         self.reward_function = reward_function()
@@ -94,7 +88,7 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
         start_pos[7::] = self.sim.data.site_xpos[self.ball_site_id, :].copy()
         self.set_state(start_pos, init_vel)
         if self.rndm_goal:
-            xy = np.random.uniform(CUP_POS_MIN, CUP_POS_MAX)
+            xy = self.np_random.uniform(CUP_POS_MIN, CUP_POS_MAX)
             xyz = np.zeros(3)
             xyz[:2] = xy
             xyz[-1] = 0.840
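The two changed blocks above together determine where the cup goal comes from: the constructor now normalizes any user-supplied cup_goal_pos to a 3D point with the table height as z, and the reset path samples the goal from the environment's seeded np_random instead of the global np.random. A minimal standalone sketch of that logic (the function names and the rng argument are illustrative, not part of the env API):

import numpy as np

CUP_POS_MIN = np.array([-0.32, -2.2])   # x/y sampling bounds from the diff
CUP_POS_MAX = np.array([0.32, -1.2])
TABLE_Z = 0.840                         # cup base height used in reset()

def normalize_cup_goal(cup_goal_pos=None):
    # mirrors the new __init__ logic: accept None, (x, y) or (x, y, z)
    cup_goal_pos = np.array(cup_goal_pos if cup_goal_pos is not None else [-0.3, -1.2, TABLE_Z])
    if cup_goal_pos.shape[0] == 2:
        cup_goal_pos = np.insert(cup_goal_pos, 2, TABLE_Z)
    return cup_goal_pos

def sample_cup_goal(rng):
    # mirrors the rndm_goal branch in reset(): x/y from the seeded generator, fixed z
    xyz = np.zeros(3)
    xyz[:2] = rng.uniform(CUP_POS_MIN, CUP_POS_MAX)
    xyz[-1] = TABLE_Z
    return xyz

print(normalize_cup_goal([-0.2, -1.5]))           # [-0.2  -1.5   0.84]
print(sample_cup_goal(np.random.default_rng(0)))  # reproducible given a seed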

View File

@@ -54,6 +54,7 @@ class BeerPongReward:
         self.ball_table_contact = False
         self.ball_wall_contact = False
         self.ball_cup_contact = False
+        self.ball_in_cup = False
         self.noisy_bp = noisy
         self._t_min_final_dist = -1
@@ -80,39 +81,94 @@ class BeerPongReward:
         action_cost = np.sum(np.square(action))
         self.action_costs.append(action_cost)
 
-        if not self.ball_table_contact:
-            self.ball_table_contact = self._check_collision_single_objects(env.sim, self.ball_collision_id,
-                                                                            self.table_collision_id)
+        # ##################### Reward function which forces to bounce once on the table (tanh) ########################
+        # if not self.ball_table_contact:
+        #     self.ball_table_contact = self._check_collision_single_objects(env.sim, self.ball_collision_id,
+        #                                                                     self.table_collision_id)
+        # self._is_collided = self._check_collision_with_itself(env.sim, self.robot_collision_ids)
+        # if env._steps == env.ep_length - 1 or self._is_collided:
+        #     min_dist = np.min(self.dists)
+        #     final_dist = self.dists_final[-1]
+        #
+        #     ball_in_cup = self._check_collision_single_objects(env.sim, self.ball_collision_id,
+        #                                                        self.cup_table_collision_id)
+        #
+        #     # encourage bounce before falling into cup
+        #     if not ball_in_cup:
+        #         if not self.ball_table_contact:
+        #             reward = 0.2 * (1 - np.tanh(0.5*min_dist)) + 0.1 * (1 - np.tanh(0.5*final_dist))
+        #         else:
+        #             reward = (1 - np.tanh(0.5*min_dist)) + 0.5 * (1 - np.tanh(0.5*final_dist))
+        #     else:
+        #         if not self.ball_table_contact:
+        #             reward = 2 * (1 - np.tanh(0.5*final_dist)) + 1 * (1 - np.tanh(0.5*min_dist)) + 1
+        #         else:
+        #             reward = 2 * (1 - np.tanh(0.5*final_dist)) + 1 * (1 - np.tanh(0.5*min_dist)) + 3
+        #
+        #     # reward = - 1 * cost - self.collision_penalty * int(self._is_collided)
+        #     success = ball_in_cup
+        #     crash = self._is_collided
+        # else:
+        #     reward = - 1e-2 * action_cost
+        #     success = False
+        #     crash = False
+        # ################################################################################################################
+        # ##################### Reward function which does not force to bounce once on the table (tanh) ################
+        # self._check_contacts(env.sim)
+        # self._is_collided = self._check_collision_with_itself(env.sim, self.robot_collision_ids)
+        # if env._steps == env.ep_length - 1 or self._is_collided:
+        #     min_dist = np.min(self.dists)
+        #     final_dist = self.dists_final[-1]
+        #
+        #     # encourage bounce before falling into cup
+        #     if not self.ball_in_cup:
+        #         if not self.ball_table_contact and not self.ball_cup_contact and not self.ball_wall_contact:
+        #             min_dist_coeff, final_dist_coeff, rew_offset = 0.2, 0.1, 0
+        #             # reward = 0.2 * (1 - np.tanh(0.5*min_dist)) + 0.1 * (1 - np.tanh(0.5*final_dist))
+        #         else:
+        #             min_dist_coeff, final_dist_coeff, rew_offset = 1, 0.5, 0
+        #             # reward = (1 - np.tanh(0.5*min_dist)) + 0.5 * (1 - np.tanh(0.5*final_dist))
+        #     else:
+        #         min_dist_coeff, final_dist_coeff, rew_offset = 1, 2, 3
+        #         # reward = 2 * (1 - np.tanh(0.5*final_dist)) + 1 * (1 - np.tanh(0.5*min_dist)) + 3
+        #
+        #     reward = final_dist_coeff * (1 - np.tanh(0.5 * final_dist)) + min_dist_coeff * (1 - np.tanh(0.5 * min_dist)) \
+        #              + rew_offset
+        #     success = self.ball_in_cup
+        #     crash = self._is_collided
+        # else:
+        #     reward = - 1e-2 * action_cost
+        #     success = False
+        #     crash = False
+        # ################################################################################################################
+        # ##################### Reward function which does not force to bounce once on the table (quad dist) ############
+        self._check_contacts(env.sim)
         self._is_collided = self._check_collision_with_itself(env.sim, self.robot_collision_ids)
         if env._steps == env.ep_length - 1 or self._is_collided:
             min_dist = np.min(self.dists)
             final_dist = self.dists_final[-1]
 
-            ball_in_cup = self._check_collision_single_objects(env.sim, self.ball_collision_id,
-                                                               self.cup_table_collision_id)
-
             # encourage bounce before falling into cup
-            if not ball_in_cup:
-                if not self.ball_table_contact:
-                    reward = 0.2 * (1 - np.tanh(0.5*min_dist)) + 0.1 * (1 - np.tanh(0.5*final_dist))
-                else:
-                    reward = (1 - np.tanh(0.5*min_dist)) + 0.5 * (1 - np.tanh(0.5*final_dist))
-            else:
-                if not self.ball_table_contact:
-                    reward = 2 * (1 - np.tanh(0.5*final_dist)) + 1 * (1 - np.tanh(0.5*min_dist)) + 1
-                else:
-                    reward = 2 * (1 - np.tanh(0.5*final_dist)) + 1 * (1 - np.tanh(0.5*min_dist)) + 3
-
-            # reward = - 1 * cost - self.collision_penalty * int(self._is_collided)
-            success = ball_in_cup
+            if not self.ball_in_cup:
+                if not self.ball_table_contact and not self.ball_cup_contact and not self.ball_wall_contact:
+                    min_dist_coeff, final_dist_coeff, rew_offset = 1, 0.5, -4
+                else:
+                    min_dist_coeff, final_dist_coeff, rew_offset = 1, 0.5, -2
+            else:
+                min_dist_coeff, final_dist_coeff, rew_offset = 0, 1, 0
+
+            reward = rew_offset - min_dist_coeff * min_dist**2 - final_dist_coeff * final_dist**2 - \
+                     1e-4*np.mean(action_cost)
+            success = self.ball_in_cup
             crash = self._is_collided
         else:
             reward = - 1e-2 * action_cost
             success = False
             crash = False
+        # ################################################################################################################
 
         infos = {}
         infos["success"] = success
@@ -124,6 +180,20 @@ class BeerPongReward:
 
         return reward, infos
 
+    def _check_contacts(self, sim):
+        if not self.ball_table_contact:
+            self.ball_table_contact = self._check_collision_single_objects(sim, self.ball_collision_id,
+                                                                            self.table_collision_id)
+        if not self.ball_cup_contact:
+            self.ball_cup_contact = self._check_collision_with_set_of_objects(sim, self.ball_collision_id,
+                                                                              self.cup_collision_ids)
+        if not self.ball_wall_contact:
+            self.ball_wall_contact = self._check_collision_single_objects(sim, self.ball_collision_id,
+                                                                           self.wall_collision_id)
+        if not self.ball_in_cup:
+            self.ball_in_cup = self._check_collision_single_objects(sim, self.ball_collision_id,
+                                                                    self.cup_table_collision_id)
+
     def _check_collision_single_objects(self, sim, id_1, id_2):
         for coni in range(0, sim.data.ncon):
             con = sim.data.contact[coni]
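For reference, the active "quad dist" branch added above reads as a small standalone function: the contact flags collected by _check_contacts select a coefficient set, and the reward is a negative quadratic in the minimal and final ball-cup distances plus a small action penalty. A hedged sketch under those assumptions (the function name and scalar arguments are illustrative; in the env, min_dist and final_dist come from self.dists and self.dists_final):

import numpy as np

def staged_quad_reward(min_dist, final_dist, action_cost,
                       ball_in_cup, ball_table_contact, ball_cup_contact, ball_wall_contact):
    # end-of-episode reward following the new "quad dist" branch in the diff
    if not ball_in_cup:
        if not (ball_table_contact or ball_cup_contact or ball_wall_contact):
            # ball touched neither table, cup nor wall: largest constant penalty
            min_dist_coeff, final_dist_coeff, rew_offset = 1, 0.5, -4
        else:
            min_dist_coeff, final_dist_coeff, rew_offset = 1, 0.5, -2
    else:
        # ball ended up in the cup: only the final distance is penalized
        min_dist_coeff, final_dist_coeff, rew_offset = 0, 1, 0
    return (rew_offset
            - min_dist_coeff * min_dist ** 2
            - final_dist_coeff * final_dist ** 2
            - 1e-4 * np.mean(action_cost))

# a miss that at least touched the cup vs. a clean hit
print(staged_quad_reward(0.3, 0.5, 1.0, False, False, True, False))  # ~ -2.2151
print(staged_quad_reward(0.0, 0.05, 1.0, True, True, False, False))  # ~ -0.0026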