Working beer pong (bp) version, tested with CMORE on a smaller context with 1 seed

2022-04-08 17:32:53 +02:00 · 2022-04-08 17:32:53 +02:00 · 7ffe94dcfd
commit 7ffe94dcfd
parent eb7dd3a18f
3 changed files with 14 additions and 8 deletions
--- a/alr_envs/alr/__init__.py
+++ b/alr_envs/alr/__init__.py
@@ -391,6 +391,7 @@ for _v in _versions:
                "duration": 1,
                "post_traj_time": 2,
                "policy_type": "motor",
+                # "weights_scale": 0.15,
                "weights_scale": 1,
                "zero_start": True,
                "zero_goal": False,
--- a/alr_envs/alr/mujoco/beerpong/beerpong.py
+++ b/alr_envs/alr/mujoco/beerpong/beerpong.py
@@ -10,6 +10,10 @@ from alr_envs.alr.mujoco.beerpong.beerpong_reward_staged import BeerPongReward
 CUP_POS_MIN = np.array([-0.32, -2.2])
 CUP_POS_MAX = np.array([0.32, -1.2])

+# smaller context space -> Easier task
+# CUP_POS_MIN = np.array([-0.16, -2.2])
+# CUP_POS_MAX = np.array([0.16, -1.7])
+

 class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
    def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False,
@@ -36,7 +40,8 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
        self.ball_site_id = 0
        self.ball_id = 11

-        self._release_step = 175  # time step of ball release
+        # self._release_step = 175  # time step of ball release
+        self._release_step = 130  # time step of ball release

        self.sim_time = 3  # seconds
        self.ep_length = 600  # based on 3 seconds with dt = 0.005 int(self.sim_time / self.dt)
--- a/alr_envs/alr/mujoco/beerpong/beerpong_reward_staged.py
+++ b/alr_envs/alr/mujoco/beerpong/beerpong_reward_staged.py
@@ -85,7 +85,7 @@ class BeerPongReward:
        # if not self.ball_table_contact:
        #     self.ball_table_contact = self._check_collision_single_objects(env.sim, self.ball_collision_id,
        #                                                                        self.table_collision_id)
-
+        #
        # self._is_collided = self._check_collision_with_itself(env.sim, self.robot_collision_ids)
        # if env._steps == env.ep_length - 1 or self._is_collided:
        #     min_dist = np.min(self.dists)
@@ -115,7 +115,7 @@ class BeerPongReward:
        #     crash = False
        # ################################################################################################################

-        # ##################### Reward function which does not force to bounce once on the table (tanh) ################
+        ##################### Reward function which does not force to bounce once on the table (tanh) ################
        # self._check_contacts(env.sim)
        # self._is_collided = self._check_collision_with_itself(env.sim, self.robot_collision_ids)
        # if env._steps == env.ep_length - 1 or self._is_collided:
@@ -142,9 +142,9 @@ class BeerPongReward:
        #     reward = - 1e-2 * action_cost
        #     success = False
        #     crash = False
-        # ################################################################################################################
+        ################################################################################################################

-        # ##################### Reward function which does not force to bounce once on the table (quad dist) ############
+        # # ##################### Reward function which does not force to bounce once on the table (quad dist) ############
        self._check_contacts(env.sim)
        self._is_collided = self._check_collision_with_itself(env.sim, self.robot_collision_ids)
        if env._steps == env.ep_length - 1 or self._is_collided:
@@ -162,12 +162,12 @@ class BeerPongReward:

            reward = rew_offset - min_dist_coeff * min_dist**2 - final_dist_coeff * final_dist**2 - \
                     1e-4*np.mean(action_cost)
+                     # 1e-7*np.mean(action_cost)
            success = self.ball_in_cup
-            crash = self._is_collided
        else:
-            reward = - 1e-2 * action_cost
+            # reward = - 1e-2 * action_cost
+            reward = - 1e-4 * action_cost
            success = False
-            crash = False
        # ################################################################################################################

        infos = {}