From fb9efb20e1dcb9c677e0e1fafa6f2ee761d4ed19 Mon Sep 17 00:00:00 2001
From: Dominik Roth <dominik.roth.dev@gmail.com>
Date: Thu, 24 Aug 2023 11:09:47 +0200
Subject: [PATCH] Updates the custom fancy envs to the versions used for the
 MP3 paper. (According to Hongyi only BoxPushing changed between current
 master and MP3 version)

---
 .../mujoco/box_pushing/box_pushing_env.py     | 110 ++++++++++++++++--
 .../envs/mujoco/box_pushing/mp_wrapper.py     |  11 ++
 2 files changed, 110 insertions(+), 11 deletions(-)

diff --git a/fancy_gym/envs/mujoco/box_pushing/box_pushing_env.py b/fancy_gym/envs/mujoco/box_pushing/box_pushing_env.py
index 275bba1..7ecd214 100644
--- a/fancy_gym/envs/mujoco/box_pushing/box_pushing_env.py
+++ b/fancy_gym/envs/mujoco/box_pushing/box_pushing_env.py
@@ -26,7 +26,7 @@ class BoxPushingEnvBase(MujocoEnv, utils.EzPickle):
     3. time-spatial-depend sparse reward
     """
 
-    def __init__(self, frame_skip: int = 10):
+    def __init__(self, frame_skip: int = 10, random_init: bool = False):
         utils.EzPickle.__init__(**locals())
         self._steps = 0
         self.init_qpos_box_pushing = np.array([0., 0., 0., -1.5, 0., 1.5, 0., 0., 0., 0.6, 0.45, 0.0, 1., 0., 0., 0.])
@@ -39,6 +39,7 @@ class BoxPushingEnvBase(MujocoEnv, utils.EzPickle):
         self._desired_rod_quat = desired_rod_quat
 
         self._episode_energy = 0.
+        self.random_init = random_init
         MujocoEnv.__init__(self,
                            model_path=os.path.join(os.path.dirname(__file__), "assets", "box_pushing.xml"),
                            frame_skip=self.frame_skip,
@@ -93,7 +94,7 @@ class BoxPushingEnvBase(MujocoEnv, utils.EzPickle):
     def reset_model(self):
         # rest box to initial position
         self.set_state(self.init_qpos_box_pushing, self.init_qvel_box_pushing)
-        box_init_pos = np.array([0.4, 0.3, -0.01, 0.0, 0.0, 0.0, 1.0])
+        box_init_pos = self.sample_context() if self.random_init else np.array([0.4, 0.3, -0.01, 0.0, 0.0, 0.0, 1.0])
         self.data.joint("box_joint").qpos = box_init_pos
 
         # set target position
@@ -160,6 +161,9 @@ class BoxPushingEnvBase(MujocoEnv, utils.EzPickle):
             penalty -= v_coeff * abs(np.sum(q_dot_error[q_dot_error > 0.]))
         return penalty
 
+    def _get_box_vel(self):
+        return self.data.body("box_0").cvel.copy()
+
     def get_body_jacp(self, name):
         id = mujoco.mj_name2id(self.model, 1, name)
         jacp = np.zeros((3, self.model.nv))
@@ -281,8 +285,8 @@ class BoxPushingEnvBase(MujocoEnv, utils.EzPickle):
         return q
 
 class BoxPushingDense(BoxPushingEnvBase):
-    def __init__(self, frame_skip: int = 10):
-        super(BoxPushingDense, self).__init__(frame_skip=frame_skip)
+    def __init__(self, frame_skip: int = 10, random_init: bool = False):
+        super(BoxPushingDense, self).__init__(frame_skip=frame_skip, random_init=random_init)
     def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat,
                     rod_tip_pos, rod_quat, qpos, qvel, action):
         joint_penalty = self._joint_limit_violate_penalty(qpos,
@@ -304,14 +308,14 @@ class BoxPushingDense(BoxPushingEnvBase):
         return reward
 
 class BoxPushingTemporalSparse(BoxPushingEnvBase):
-    def __init__(self, frame_skip: int = 10):
-        super(BoxPushingTemporalSparse, self).__init__(frame_skip=frame_skip)
+    def __init__(self, frame_skip: int = 10, random_init: bool = False):
+        super(BoxPushingTemporalSparse, self).__init__(frame_skip=frame_skip, random_init=random_init)
 
     def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat,
                     rod_tip_pos, rod_quat, qpos, qvel, action):
         reward = 0.
         joint_penalty = self._joint_limit_violate_penalty(qpos, qvel, enable_pos_limit=True, enable_vel_limit=True)
-        energy_cost = -0.0005 * np.sum(np.square(action))
+        energy_cost = -0.02 * np.sum(np.square(action))
         tcp_box_dist_reward = -2 * np.clip(np.linalg.norm(box_pos - rod_tip_pos), 0.05, 100)
         reward += joint_penalty + tcp_box_dist_reward + energy_cost
         rod_inclined_angle = rotation_distance(rod_quat, desired_rod_quat)
@@ -327,20 +331,23 @@ class BoxPushingTemporalSparse(BoxPushingEnvBase):
         box_goal_pos_dist_reward = -3.5 * box_goal_dist * 100
         box_goal_rot_dist_reward = -rotation_distance(box_quat, target_quat) / np.pi * 100
 
-        reward += box_goal_pos_dist_reward + box_goal_rot_dist_reward
+        ep_end_joint_vel = -50. * np.linalg.norm(qvel)
+
+        reward += box_goal_pos_dist_reward + box_goal_rot_dist_reward + ep_end_joint_vel
 
         return reward
 
+
 class BoxPushingTemporalSpatialSparse(BoxPushingEnvBase):
 
-    def __init__(self, frame_skip: int = 10):
-        super(BoxPushingTemporalSpatialSparse, self).__init__(frame_skip=frame_skip)
+    def __init__(self, frame_skip: int = 10, random_init: bool = False):
+        super(BoxPushingTemporalSpatialSparse, self).__init__(frame_skip=frame_skip, random_init=random_init)
 
     def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat,
                     rod_tip_pos, rod_quat, qpos, qvel, action):
         reward = 0.
         joint_penalty = self._joint_limit_violate_penalty(qpos, qvel, enable_pos_limit=True, enable_vel_limit=True)
-        energy_cost = -0.0005 * np.sum(np.square(action))
+        energy_cost = -0.02 * np.sum(np.square(action))
         tcp_box_dist_reward = -2 * np.clip(np.linalg.norm(box_pos - rod_tip_pos), 0.05, 100)
         reward += joint_penalty + tcp_box_dist_reward + energy_cost
         rod_inclined_angle = rotation_distance(rod_quat, desired_rod_quat)
@@ -360,3 +367,84 @@ class BoxPushingTemporalSpatialSparse(BoxPushingEnvBase):
             reward += box_goal_pos_dist_reward + box_goal_rot_dist_reward
 
         return reward
+
+class BoxPushingTemporalSpatialSparse2(BoxPushingEnvBase):
+
+    def __init__(self, frame_skip: int = 10):
+        super(BoxPushingTemporalSpatialSparse2, self).__init__(frame_skip=frame_skip)
+
+    def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat,
+                    rod_tip_pos, rod_quat, qpos, qvel, action):
+        reward = 0.
+        joint_penalty = self._joint_limit_violate_penalty(qpos, qvel, enable_pos_limit=True, enable_vel_limit=True)
+        energy_cost = -0.0005 * np.sum(np.square(action))
+        tcp_box_dist_reward = -2 * np.clip(np.linalg.norm(box_pos - rod_tip_pos), 0.05, 100)
+
+        reward += joint_penalty + energy_cost + tcp_box_dist_reward
+
+        rod_inclined_angle = rotation_distance(rod_quat, desired_rod_quat)
+
+        if rod_inclined_angle > np.pi / 4:
+            reward -= rod_inclined_angle / (np.pi)
+
+        if not episode_end:
+            return reward
+
+        # Force the robot to stop at the end
+        reward += -50. * np.linalg.norm(qvel)
+
+        box_goal_dist = np.linalg.norm(box_pos - target_pos)
+
+        if box_goal_dist < 0.1:
+            box_goal_pos_dist_reward = np.clip(- 350. * box_goal_dist, -200, 0)
+            box_goal_rot_dist_reward = np.clip(- rotation_distance(box_quat, target_quat)/np.pi * 100., -100, 0)
+            reward += box_goal_pos_dist_reward + box_goal_rot_dist_reward
+        else:
+            reward -= 300.
+
+        return reward
+
+
+class BoxPushingBruceSparse(BoxPushingEnvBase):
+    def __init__(self, frame_skip: int = 10):
+        super(BoxPushingBruceSparse, self).__init__(frame_skip=frame_skip)
+
+    def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat,
+                    rod_tip_pos, rod_quat, qpos, qvel, action):
+        reward = 0.
+        joint_penalty = self._joint_limit_violate_penalty(qpos, qvel, enable_pos_limit=True, enable_vel_limit=True)
+        energy_cost = -0.0005 * np.sum(np.square(action))
+        reward += joint_penalty + energy_cost
+
+        if not episode_end:
+            return reward
+
+        box_goal_dist = np.linalg.norm(box_pos - target_pos)
+
+        box_goal_pos_dist_reward = -3.5 * box_goal_dist * 100
+        box_goal_rot_dist_reward = -rotation_distance(box_quat, target_quat) / np.pi * 100
+
+        reward += box_goal_pos_dist_reward + box_goal_rot_dist_reward + self._get_end_vel_penalty()
+
+        return reward
+
+    def _get_end_vel_penalty(self):
+        rot_coeff = 150.
+        pos_coeff = 150.
+        box_rot_pos_vel = self._get_box_vel()
+        box_rot_vel = box_rot_pos_vel[:3]
+        box_pos_vel = box_rot_pos_vel[3:]
+        return -rot_coeff * np.linalg.norm(box_rot_vel) - pos_coeff * np.linalg.norm(box_pos_vel)
+
+
+
+if __name__=="__main__":
+    import fancy_gym
+    env = fancy_gym.make("BoxPushingDenseProDMP-v0", seed=0)
+    env.reset()
+    for i in range(1000):
+        env.render(mode="human")
+        obs, rew, done, info = env.step(env.action_space.sample())
+        if done:
+            env.reset()
+            # print(f"box_end_velocity: {info['box_end_vel']}")
diff --git a/fancy_gym/envs/mujoco/box_pushing/mp_wrapper.py b/fancy_gym/envs/mujoco/box_pushing/mp_wrapper.py
index 09b2d65..8da6855 100644
--- a/fancy_gym/envs/mujoco/box_pushing/mp_wrapper.py
+++ b/fancy_gym/envs/mujoco/box_pushing/mp_wrapper.py
@@ -10,6 +10,17 @@ class MPWrapper(RawInterfaceWrapper):
     # Random x goal + random init pos
     @property
     def context_mask(self):
+        if self.random_init:
+            return np.hstack([
+                [True] * 7,  # joints position
+                [False] * 7,  # joints velocity
+                [True] * 3,  # position of box
+                [True] * 4,  # orientation of box
+                [True] * 3,  # position of target
+                [True] * 4,  # orientation of target
+                # [True] * 1,  # time
+            ])
+
         return np.hstack([
             [False] * 7,  # joints position
             [False] * 7,  # joints velocity