fixing beerpong rewards

parent 04b6b314cf
commit eb7dd3a18f
@@ -429,30 +429,4 @@ for _v, cd in enumerate(ctxt_dim):
             }
         }
     )
-    ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append(_env_id)
-
-# register(
-#     id='TableTennisProMP-v2',
-#     entry_point='alr_envs.utils.make_env_helpers:make_promp_env_helper',
-#     kwargs={
-#         "name": "alr_envs:TableTennis2DCtxt-v1",
-#         "wrappers": [mujoco.table_tennis.MPWrapper],
-#         "mp_kwargs": {
-#             "num_dof": 7,
-#             "num_basis": 2,
-#             "duration": 1.25,
-#             "post_traj_time": 4.5,
-#             #"width": 0.01,
-#             #"off": 0.01,
-#             "policy_type": "motor",
-#             "weights_scale": 1.0,
-#             "zero_start": True,
-#             "zero_goal": False,
-#             "policy_kwargs": {
-#                 "p_gains": 0.5*np.array([1.0, 4.0, 2.0, 4.0, 1.0, 4.0, 1.0]),
-#                 "d_gains": 0.5*np.array([0.1, 0.4, 0.2, 0.4, 0.1, 0.4, 0.1])
-#             }
-#         }
-#     }
-# )
-# ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append("TableTennisProMP-v2")
     ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append(_env_id)
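The hunk above drops a stale duplicate registration block. The pattern it keeps is the usual gym one: build an env id per context dimension, register it, and record the id in the ProMP lookup table. A minimal self-contained sketch of that pattern (the id scheme and entry point here are illustrative stand-ins, not the repo's real ones):

from gym.envs.registration import register

# Illustrative stand-ins for the repo's registry and context dimensions.
ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS = {"DMP": [], "ProMP": []}
ctxt_dim = [2, 4]

for _v, cd in enumerate(ctxt_dim):
    _env_id = f'MyEnvProMP-c{cd}-v{_v}'  # hypothetical id scheme
    register(
        id=_env_id,
        entry_point='my_pkg.envs:MyEnv',  # hypothetical entry point, resolved lazily by gym.make
        kwargs={'context_dim': cd},
    )
    # Record the id so callers can enumerate every ProMP variant.
    ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS['ProMP'].append(_env_id)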
@@ -7,21 +7,16 @@ from gym.envs.mujoco import MujocoEnv

 from alr_envs.alr.mujoco.beerpong.beerpong_reward_staged import BeerPongReward


-# CUP_POS_MIN = np.array([-0.32, -2.2])
-# CUP_POS_MAX = np.array([0.32, -1.2])
-
-CUP_POS_MIN = np.array([-1.42, -4.05])
-CUP_POS_MAX = np.array([1.42, -1.25])
+CUP_POS_MIN = np.array([-0.32, -2.2])
+CUP_POS_MAX = np.array([0.32, -1.2])


 class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
     def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False,
                  rndm_goal=False, cup_goal_pos=None):
-        if cup_goal_pos is None:
-            cup_goal_pos = [-0.3, -1.2, 0.840]
-        elif len(cup_goal_pos)==2:
-            cup_goal_pos = np.array(cup_goal_pos)
-            cup_goal_pos = np.insert(cup_goal_pos, 2, 0.80)
+        cup_goal_pos = np.array(cup_goal_pos if cup_goal_pos is not None else [-0.3, -1.2, 0.840])
+        if cup_goal_pos.shape[0]==2:
+            cup_goal_pos = np.insert(cup_goal_pos, 2, 0.840)
         self.cup_goal_pos = np.array(cup_goal_pos)

         self._steps = 0
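Besides simplifying the branching, the constructor change fixes a padding bug: the old 2D branch appended z = 0.80 instead of the 0.840 cup height used everywhere else. A standalone sketch of the new normalization, runnable outside the env (names and values follow the diff; the function wrapper is just for illustration):

import numpy as np

def normalize_cup_goal(cup_goal_pos=None):
    # Default goal from the diff; z = 0.840 is the cup height on the table.
    cup_goal_pos = np.array(cup_goal_pos if cup_goal_pos is not None else [-0.3, -1.2, 0.840])
    # A 2D (x, y) goal gets the height appended as its z coordinate.
    if cup_goal_pos.shape[0] == 2:
        cup_goal_pos = np.insert(cup_goal_pos, 2, 0.840)
    return cup_goal_pos

assert normalize_cup_goal().shape == (3,)
assert normalize_cup_goal([0.1, -1.5])[2] == 0.840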
@@ -52,7 +47,6 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
         else:
             self.noise_std = 0

-
         reward_function = BeerPongReward
         self.reward_function = reward_function()

@@ -94,7 +88,7 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
         start_pos[7::] = self.sim.data.site_xpos[self.ball_site_id, :].copy()
         self.set_state(start_pos, init_vel)
         if self.rndm_goal:
-            xy = np.random.uniform(CUP_POS_MIN, CUP_POS_MAX)
+            xy = self.np_random.uniform(CUP_POS_MIN, CUP_POS_MAX)
             xyz = np.zeros(3)
             xyz[:2] = xy
             xyz[-1] = 0.840
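Swapping np.random.uniform for self.np_random.uniform matters for reproducibility: gym environments carry a per-instance np_random generator that env.seed() controls, whereas the global np.random stream ignores the env's seed. A minimal sketch of the difference using gym's seeding helper directly (outside the env class):

import numpy as np
from gym.utils import seeding

CUP_POS_MIN = np.array([-0.32, -2.2])
CUP_POS_MAX = np.array([0.32, -1.2])

# Per-instance generators, as env.seed() would set up.
rng_a, _ = seeding.np_random(42)
rng_b, _ = seeding.np_random(42)

# Same seed, same sampled goal -- a guarantee the global
# np.random stream cannot give once other code touches it.
assert np.allclose(rng_a.uniform(CUP_POS_MIN, CUP_POS_MAX),
                   rng_b.uniform(CUP_POS_MIN, CUP_POS_MAX))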
@@ -54,6 +54,7 @@ class BeerPongReward:
         self.ball_table_contact = False
         self.ball_wall_contact = False
         self.ball_cup_contact = False
+        self.ball_in_cup = False
         self.noisy_bp = noisy
         self._t_min_final_dist = -1

@@ -80,39 +81,94 @@ class BeerPongReward:

         action_cost = np.sum(np.square(action))
         self.action_costs.append(action_cost)
-        if not self.ball_table_contact:
-            self.ball_table_contact = self._check_collision_single_objects(env.sim, self.ball_collision_id,
-                                                                           self.table_collision_id)
+        # ##################### Reward function which forces to bounce once on the table (tanh) ########################
+        # if not self.ball_table_contact:
+        #     self.ball_table_contact = self._check_collision_single_objects(env.sim, self.ball_collision_id,
+        #                                                                    self.table_collision_id)
+        # self._is_collided = self._check_collision_with_itself(env.sim, self.robot_collision_ids)
+        # if env._steps == env.ep_length - 1 or self._is_collided:
+        #     min_dist = np.min(self.dists)
+        #     final_dist = self.dists_final[-1]
+        #
+        #     ball_in_cup = self._check_collision_single_objects(env.sim, self.ball_collision_id,
+        #                                                        self.cup_table_collision_id)
+        #
+        #     # encourage bounce before falling into cup
+        #     if not ball_in_cup:
+        #         if not self.ball_table_contact:
+        #             reward = 0.2 * (1 - np.tanh(0.5*min_dist)) + 0.1 * (1 - np.tanh(0.5*final_dist))
+        #         else:
+        #             reward = (1 - np.tanh(0.5*min_dist)) + 0.5 * (1 - np.tanh(0.5*final_dist))
+        #     else:
+        #         if not self.ball_table_contact:
+        #             reward = 2 * (1 - np.tanh(0.5*final_dist)) + 1 * (1 - np.tanh(0.5*min_dist)) + 1
+        #         else:
+        #             reward = 2 * (1 - np.tanh(0.5*final_dist)) + 1 * (1 - np.tanh(0.5*min_dist)) + 3
+        #
+        #     # reward = - 1 * cost - self.collision_penalty * int(self._is_collided)
+        #     success = ball_in_cup
+        #     crash = self._is_collided
+        # else:
+        #     reward = - 1e-2 * action_cost
+        #     success = False
+        #     crash = False
+        # ################################################################################################################
+
+        # ##################### Reward function which does not force to bounce once on the table (tanh) ################
+        # self._check_contacts(env.sim)
+        # self._is_collided = self._check_collision_with_itself(env.sim, self.robot_collision_ids)
+        # if env._steps == env.ep_length - 1 or self._is_collided:
+        #     min_dist = np.min(self.dists)
+        #     final_dist = self.dists_final[-1]
+        #
+        #     # encourage bounce before falling into cup
+        #     if not self.ball_in_cup:
+        #         if not self.ball_table_contact and not self.ball_cup_contact and not self.ball_wall_contact:
+        #             min_dist_coeff, final_dist_coeff, rew_offset = 0.2, 0.1, 0
+        #             # reward = 0.2 * (1 - np.tanh(0.5*min_dist)) + 0.1 * (1 - np.tanh(0.5*final_dist))
+        #         else:
+        #             min_dist_coeff, final_dist_coeff, rew_offset = 1, 0.5, 0
+        #             # reward = (1 - np.tanh(0.5*min_dist)) + 0.5 * (1 - np.tanh(0.5*final_dist))
+        #     else:
+        #         min_dist_coeff, final_dist_coeff, rew_offset = 1, 2, 3
+        #         # reward = 2 * (1 - np.tanh(0.5*final_dist)) + 1 * (1 - np.tanh(0.5*min_dist)) + 3
+        #
+        #     reward = final_dist_coeff * (1 - np.tanh(0.5 * final_dist)) + min_dist_coeff * (1 - np.tanh(0.5 * min_dist)) \
+        #              + rew_offset
+        #     success = self.ball_in_cup
+        #     crash = self._is_collided
+        # else:
+        #     reward = - 1e-2 * action_cost
+        #     success = False
+        #     crash = False
+        # ################################################################################################################
+
+        # ##################### Reward function which does not force to bounce once on the table (quad dist) ############
+        self._check_contacts(env.sim)
         self._is_collided = self._check_collision_with_itself(env.sim, self.robot_collision_ids)
         if env._steps == env.ep_length - 1 or self._is_collided:
+
             min_dist = np.min(self.dists)
             final_dist = self.dists_final[-1]

-            ball_in_cup = self._check_collision_single_objects(env.sim, self.ball_collision_id,
-                                                               self.cup_table_collision_id)
-
-            # encourage bounce before falling into cup
-            if not ball_in_cup:
-                if not self.ball_table_contact:
-                    reward = 0.2 * (1 - np.tanh(0.5*min_dist)) + 0.1 * (1 - np.tanh(0.5*final_dist))
+            if not self.ball_in_cup:
+                if not self.ball_table_contact and not self.ball_cup_contact and not self.ball_wall_contact:
+                    min_dist_coeff, final_dist_coeff, rew_offset = 1, 0.5, -4
                 else:
-                    reward = (1 - np.tanh(0.5*min_dist)) + 0.5 * (1 - np.tanh(0.5*final_dist))
+                    min_dist_coeff, final_dist_coeff, rew_offset = 1, 0.5, -2
             else:
-                if not self.ball_table_contact:
-                    reward = 2 * (1 - np.tanh(0.5*final_dist)) + 1 * (1 - np.tanh(0.5*min_dist)) + 1
-                else:
-                    reward = 2 * (1 - np.tanh(0.5*final_dist)) + 1 * (1 - np.tanh(0.5*min_dist)) + 3
+                min_dist_coeff, final_dist_coeff, rew_offset = 0, 1, 0

-            # reward = - 1 * cost - self.collision_penalty * int(self._is_collided)
-            success = ball_in_cup
+            reward = rew_offset - min_dist_coeff * min_dist**2 - final_dist_coeff * final_dist**2 - \
+                     1e-4*np.mean(action_cost)
+            success = self.ball_in_cup
             crash = self._is_collided
         else:
             reward = - 1e-2 * action_cost
             success = False
             crash = False
+        # ################################################################################################################

         infos = {}
         infos["success"] = success
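The active branch above replaces the tanh shaping with a staged quadratic penalty: the offset encodes which contacts occurred (a miss with no contact at all is worst), and both distance terms are squared. A self-contained sketch of just that scoring rule, with the contact flags collapsed into plain arguments (constants are taken from the diff; the function wrapper is illustrative):

import numpy as np

def staged_quad_reward(min_dist, final_dist, action_cost, ball_in_cup, any_contact):
    # Coefficient stages mirror the diff's three cases.
    if not ball_in_cup:
        if not any_contact:
            # Never touched table, cup, or wall: largest offset penalty.
            min_dist_coeff, final_dist_coeff, rew_offset = 1, 0.5, -4
        else:
            min_dist_coeff, final_dist_coeff, rew_offset = 1, 0.5, -2
    else:
        # Success: only the (squared) final distance still matters.
        min_dist_coeff, final_dist_coeff, rew_offset = 0, 1, 0
    return (rew_offset
            - min_dist_coeff * min_dist ** 2
            - final_dist_coeff * final_dist ** 2
            - 1e-4 * np.mean(action_cost))

# A clean hit beats a near miss, which beats a wild miss:
assert staged_quad_reward(0.0, 0.05, 1.0, True, True) > \
       staged_quad_reward(0.1, 0.2, 1.0, False, True) > \
       staged_quad_reward(0.5, 1.0, 1.0, False, False)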
@@ -124,6 +180,20 @@ class BeerPongReward:

         return reward, infos

+    def _check_contacts(self, sim):
+        if not self.ball_table_contact:
+            self.ball_table_contact = self._check_collision_single_objects(sim, self.ball_collision_id,
+                                                                           self.table_collision_id)
+        if not self.ball_cup_contact:
+            self.ball_cup_contact = self._check_collision_with_set_of_objects(sim, self.ball_collision_id,
+                                                                              self.cup_collision_ids)
+        if not self.ball_wall_contact:
+            self.ball_wall_contact = self._check_collision_single_objects(sim, self.ball_collision_id,
+                                                                          self.wall_collision_id)
+        if not self.ball_in_cup:
+            self.ball_in_cup = self._check_collision_single_objects(sim, self.ball_collision_id,
+                                                                    self.cup_table_collision_id)
+
     def _check_collision_single_objects(self, sim, id_1, id_2):
         for coni in range(0, sim.data.ncon):
             con = sim.data.contact[coni]
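The new _check_contacts helper latches each flag the first time its contact fires, delegating to the pairwise scan whose first lines appear as context above. The hunk cuts off inside that scan; a hedged completion of the standard mujoco_py pattern it starts (the geom-pair test is the usual one, assumed rather than shown in the diff):

def check_collision_single_objects(sim, id_1, id_2):
    # mujoco_py exposes the active contacts in sim.data.contact[:sim.data.ncon].
    for coni in range(0, sim.data.ncon):
        con = sim.data.contact[coni]
        # A contact matches if its two geoms are exactly the requested
        # pair, in either order.
        if (con.geom1 == id_1 and con.geom2 == id_2) or \
           (con.geom1 == id_2 and con.geom2 == id_1):
            return True
    return False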