From fb9efb20e1dcb9c677e0e1fafa6f2ee761d4ed19 Mon Sep 17 00:00:00 2001 From: Dominik Roth Date: Thu, 24 Aug 2023 11:09:47 +0200 Subject: [PATCH] Updates the custom fancy envs to the versions used for the MP3 paper. (According to Hongyi only BoxPushing changed between current master and MP3 version) --- .../mujoco/box_pushing/box_pushing_env.py | 110 ++++++++++++++++-- .../envs/mujoco/box_pushing/mp_wrapper.py | 11 ++ 2 files changed, 110 insertions(+), 11 deletions(-) diff --git a/fancy_gym/envs/mujoco/box_pushing/box_pushing_env.py b/fancy_gym/envs/mujoco/box_pushing/box_pushing_env.py index 275bba1..7ecd214 100644 --- a/fancy_gym/envs/mujoco/box_pushing/box_pushing_env.py +++ b/fancy_gym/envs/mujoco/box_pushing/box_pushing_env.py @@ -26,7 +26,7 @@ class BoxPushingEnvBase(MujocoEnv, utils.EzPickle): 3. time-spatial-depend sparse reward """ - def __init__(self, frame_skip: int = 10): + def __init__(self, frame_skip: int = 10, random_init: bool = False): utils.EzPickle.__init__(**locals()) self._steps = 0 self.init_qpos_box_pushing = np.array([0., 0., 0., -1.5, 0., 1.5, 0., 0., 0., 0.6, 0.45, 0.0, 1., 0., 0., 0.]) @@ -39,6 +39,7 @@ class BoxPushingEnvBase(MujocoEnv, utils.EzPickle): self._desired_rod_quat = desired_rod_quat self._episode_energy = 0. + self.random_init = random_init MujocoEnv.__init__(self, model_path=os.path.join(os.path.dirname(__file__), "assets", "box_pushing.xml"), frame_skip=self.frame_skip, @@ -93,7 +94,7 @@ class BoxPushingEnvBase(MujocoEnv, utils.EzPickle): def reset_model(self): # rest box to initial position self.set_state(self.init_qpos_box_pushing, self.init_qvel_box_pushing) - box_init_pos = np.array([0.4, 0.3, -0.01, 0.0, 0.0, 0.0, 1.0]) + box_init_pos = self.sample_context() if self.random_init else np.array([0.4, 0.3, -0.01, 0.0, 0.0, 0.0, 1.0]) self.data.joint("box_joint").qpos = box_init_pos # set target position @@ -160,6 +161,9 @@ class BoxPushingEnvBase(MujocoEnv, utils.EzPickle): penalty -= v_coeff * abs(np.sum(q_dot_error[q_dot_error > 0.])) return penalty + def _get_box_vel(self): + return self.data.body("box_0").cvel.copy() + def get_body_jacp(self, name): id = mujoco.mj_name2id(self.model, 1, name) jacp = np.zeros((3, self.model.nv)) @@ -281,8 +285,8 @@ class BoxPushingEnvBase(MujocoEnv, utils.EzPickle): return q class BoxPushingDense(BoxPushingEnvBase): - def __init__(self, frame_skip: int = 10): - super(BoxPushingDense, self).__init__(frame_skip=frame_skip) + def __init__(self, frame_skip: int = 10, random_init: bool = False): + super(BoxPushingDense, self).__init__(frame_skip=frame_skip, random_init=random_init) def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat, rod_tip_pos, rod_quat, qpos, qvel, action): joint_penalty = self._joint_limit_violate_penalty(qpos, @@ -304,14 +308,14 @@ class BoxPushingDense(BoxPushingEnvBase): return reward class BoxPushingTemporalSparse(BoxPushingEnvBase): - def __init__(self, frame_skip: int = 10): - super(BoxPushingTemporalSparse, self).__init__(frame_skip=frame_skip) + def __init__(self, frame_skip: int = 10, random_init: bool = False): + super(BoxPushingTemporalSparse, self).__init__(frame_skip=frame_skip, random_init=random_init) def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat, rod_tip_pos, rod_quat, qpos, qvel, action): reward = 0. joint_penalty = self._joint_limit_violate_penalty(qpos, qvel, enable_pos_limit=True, enable_vel_limit=True) - energy_cost = -0.0005 * np.sum(np.square(action)) + energy_cost = -0.02 * np.sum(np.square(action)) tcp_box_dist_reward = -2 * np.clip(np.linalg.norm(box_pos - rod_tip_pos), 0.05, 100) reward += joint_penalty + tcp_box_dist_reward + energy_cost rod_inclined_angle = rotation_distance(rod_quat, desired_rod_quat) @@ -327,20 +331,23 @@ class BoxPushingTemporalSparse(BoxPushingEnvBase): box_goal_pos_dist_reward = -3.5 * box_goal_dist * 100 box_goal_rot_dist_reward = -rotation_distance(box_quat, target_quat) / np.pi * 100 - reward += box_goal_pos_dist_reward + box_goal_rot_dist_reward + ep_end_joint_vel = -50. * np.linalg.norm(qvel) + + reward += box_goal_pos_dist_reward + box_goal_rot_dist_reward + ep_end_joint_vel return reward + class BoxPushingTemporalSpatialSparse(BoxPushingEnvBase): - def __init__(self, frame_skip: int = 10): - super(BoxPushingTemporalSpatialSparse, self).__init__(frame_skip=frame_skip) + def __init__(self, frame_skip: int = 10, random_init: bool = False): + super(BoxPushingTemporalSpatialSparse, self).__init__(frame_skip=frame_skip, random_init=random_init) def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat, rod_tip_pos, rod_quat, qpos, qvel, action): reward = 0. joint_penalty = self._joint_limit_violate_penalty(qpos, qvel, enable_pos_limit=True, enable_vel_limit=True) - energy_cost = -0.0005 * np.sum(np.square(action)) + energy_cost = -0.02 * np.sum(np.square(action)) tcp_box_dist_reward = -2 * np.clip(np.linalg.norm(box_pos - rod_tip_pos), 0.05, 100) reward += joint_penalty + tcp_box_dist_reward + energy_cost rod_inclined_angle = rotation_distance(rod_quat, desired_rod_quat) @@ -360,3 +367,84 @@ class BoxPushingTemporalSpatialSparse(BoxPushingEnvBase): reward += box_goal_pos_dist_reward + box_goal_rot_dist_reward return reward + +class BoxPushingTemporalSpatialSparse2(BoxPushingEnvBase): + + def __init__(self, frame_skip: int = 10): + super(BoxPushingTemporalSpatialSparse2, self).__init__(frame_skip=frame_skip) + + def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat, + rod_tip_pos, rod_quat, qpos, qvel, action): + reward = 0. + joint_penalty = self._joint_limit_violate_penalty(qpos, qvel, enable_pos_limit=True, enable_vel_limit=True) + energy_cost = -0.0005 * np.sum(np.square(action)) + tcp_box_dist_reward = -2 * np.clip(np.linalg.norm(box_pos - rod_tip_pos), 0.05, 100) + + reward += joint_penalty + energy_cost + tcp_box_dist_reward + + rod_inclined_angle = rotation_distance(rod_quat, desired_rod_quat) + + if rod_inclined_angle > np.pi / 4: + reward -= rod_inclined_angle / (np.pi) + + if not episode_end: + return reward + + # Force the robot to stop at the end + reward += -50. * np.linalg.norm(qvel) + + box_goal_dist = np.linalg.norm(box_pos - target_pos) + + if box_goal_dist < 0.1: + box_goal_pos_dist_reward = np.clip(- 350. * box_goal_dist, -200, 0) + box_goal_rot_dist_reward = np.clip(- rotation_distance(box_quat, target_quat)/np.pi * 100., -100, 0) + reward += box_goal_pos_dist_reward + box_goal_rot_dist_reward + else: + reward -= 300. + + return reward + + +class BoxPushingBruceSparse(BoxPushingEnvBase): + def __init__(self, frame_skip: int = 10): + super(BoxPushingBruceSparse, self).__init__(frame_skip=frame_skip) + + def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat, + rod_tip_pos, rod_quat, qpos, qvel, action): + reward = 0. + joint_penalty = self._joint_limit_violate_penalty(qpos, qvel, enable_pos_limit=True, enable_vel_limit=True) + energy_cost = -0.0005 * np.sum(np.square(action)) + reward += joint_penalty + energy_cost + + if not episode_end: + return reward + + box_goal_dist = np.linalg.norm(box_pos - target_pos) + + box_goal_pos_dist_reward = -3.5 * box_goal_dist * 100 + box_goal_rot_dist_reward = -rotation_distance(box_quat, target_quat) / np.pi * 100 + + reward += box_goal_pos_dist_reward + box_goal_rot_dist_reward + self._get_end_vel_penalty() + + return reward + + def _get_end_vel_penalty(self): + rot_coeff = 150. + pos_coeff = 150. + box_rot_pos_vel = self._get_box_vel() + box_rot_vel = box_rot_pos_vel[:3] + box_pos_vel = box_rot_pos_vel[3:] + return -rot_coeff * np.linalg.norm(box_rot_vel) - pos_coeff * np.linalg.norm(box_pos_vel) + + + +if __name__=="__main__": + import fancy_gym + env = fancy_gym.make("BoxPushingDenseProDMP-v0", seed=0) + env.reset() + for i in range(1000): + env.render(mode="human") + obs, rew, done, info = env.step(env.action_space.sample()) + if done: + env.reset() + # print(f"box_end_velocity: {info['box_end_vel']}") diff --git a/fancy_gym/envs/mujoco/box_pushing/mp_wrapper.py b/fancy_gym/envs/mujoco/box_pushing/mp_wrapper.py index 09b2d65..8da6855 100644 --- a/fancy_gym/envs/mujoco/box_pushing/mp_wrapper.py +++ b/fancy_gym/envs/mujoco/box_pushing/mp_wrapper.py @@ -10,6 +10,17 @@ class MPWrapper(RawInterfaceWrapper): # Random x goal + random init pos @property def context_mask(self): + if self.random_init: + return np.hstack([ + [True] * 7, # joints position + [False] * 7, # joints velocity + [True] * 3, # position of box + [True] * 4, # orientation of box + [True] * 3, # position of target + [True] * 4, # orientation of target + # [True] * 1, # time + ]) + return np.hstack([ [False] * 7, # joints position [False] * 7, # joints velocity