From bd7e811a645eadc1b21611887e942e120cdd7033 Mon Sep 17 00:00:00 2001 From: "hongyi.zhou" Date: Mon, 3 Jul 2023 17:19:41 +0200 Subject: [PATCH 1/9] fix tau bound and init bound bug --- fancy_gym/black_box/black_box_wrapper.py | 14 ++++++++++++-- fancy_gym/black_box/raw_interface_wrapper.py | 10 ++++++++-- fancy_gym/envs/__init__.py | 4 ++-- fancy_gym/envs/mujoco/table_tennis/mp_wrapper.py | 9 +++++---- .../envs/mujoco/table_tennis/table_tennis_env.py | 10 +++++----- 5 files changed, 32 insertions(+), 15 deletions(-) diff --git a/fancy_gym/black_box/black_box_wrapper.py b/fancy_gym/black_box/black_box_wrapper.py index 0c108d7..7c33428 100644 --- a/fancy_gym/black_box/black_box_wrapper.py +++ b/fancy_gym/black_box/black_box_wrapper.py @@ -55,6 +55,14 @@ class BlackBoxWrapper(gym.ObservationWrapper): # self.traj_gen.set_mp_times(self.time_steps) self.traj_gen.set_duration(self.duration, self.dt) + # check + self.tau_bound = [-np.inf, np.inf] + self.delay_bound = [-np.inf, np.inf] + if hasattr(self.traj_gen.phase_gn, "tau_bound"): + self.tau_bound = self.traj_gen.phase_gn.tau_bound + if hasattr(self.traj_gen.phase_gn, "delay_bound"): + self.delay_bound = self.traj_gen.phase_gn.delay_bound + # reward computation self.reward_aggregation = reward_aggregation @@ -139,7 +147,8 @@ class BlackBoxWrapper(gym.ObservationWrapper): position, velocity = self.get_trajectory(action) position, velocity = self.env.set_episode_arguments(action, position, velocity) - traj_is_valid, position, velocity = self.env.preprocessing_and_validity_callback(action, position, velocity) + traj_is_valid, position, velocity = self.env.preprocessing_and_validity_callback(action, position, velocity, + self.tau_bound, self.delay_bound) trajectory_length = len(position) rewards = np.zeros(shape=(trajectory_length,)) @@ -153,7 +162,8 @@ class BlackBoxWrapper(gym.ObservationWrapper): if not traj_is_valid: obs, trajectory_return, done, infos = self.env.invalid_traj_callback(action, position, velocity, - self.return_context_observation) + self.return_context_observation, + self.tau_bound, self.delay_bound) return self.observation(obs), trajectory_return, done, infos self.plan_steps += 1 diff --git a/fancy_gym/black_box/raw_interface_wrapper.py b/fancy_gym/black_box/raw_interface_wrapper.py index 7647924..c8f7273 100644 --- a/fancy_gym/black_box/raw_interface_wrapper.py +++ b/fancy_gym/black_box/raw_interface_wrapper.py @@ -52,7 +52,8 @@ class RawInterfaceWrapper(gym.Wrapper): """ return self.env.dt - def preprocessing_and_validity_callback(self, action: np.ndarray, pos_traj: np.ndarray, vel_traj: np.ndarray) \ + def preprocessing_and_validity_callback(self, action: np.ndarray, pos_traj: np.ndarray, vel_traj: np.ndarray, + tau_bound: list = None, delay_bound: list = None ) \ -> Tuple[bool, np.ndarray, np.ndarray]: """ Used to preprocess the action and check if the desired trajectory is valid. @@ -61,6 +62,8 @@ class RawInterfaceWrapper(gym.Wrapper): specified, else only traj_gen parameters pos_traj: a vector instance of the raw position trajectory vel_traj: a vector instance of the raw velocity trajectory + tau_bound: a list of two elements, the lower and upper bound of the trajectory length scaling factor + delay_bound: a list of two elements, the lower and upper bound of the time to wait before execute Returns: validity flag: bool, True if the raw trajectory is valid, False if not pos_traj: a vector instance of the preprocessed position trajectory @@ -97,7 +100,8 @@ class RawInterfaceWrapper(gym.Wrapper): """ return True - def invalid_traj_callback(self, action: np.ndarray, pos_traj: np.ndarray, vel_traj: np.ndarray) -> Tuple[np.ndarray, float, bool, dict]: + def invalid_traj_callback(self, action: np.ndarray, pos_traj: np.ndarray, vel_traj: np.ndarray, + tau_bound: list, delay_bound: list) -> Tuple[np.ndarray, float, bool, dict]: """ Used to return a artificial return from the env if the desired trajectory is invalid. Args: @@ -105,6 +109,8 @@ class RawInterfaceWrapper(gym.Wrapper): specified, else only traj_gen parameters pos_traj: a vector instance of the raw position trajectory vel_traj: a vector instance of the raw velocity trajectory + tau_bound: a list of two elements, the lower and upper bound of the trajectory length scaling factor + delay_bound: a list of two elements, the lower and upper bound of the time to wait before execute Returns: obs: artificial observation if the trajectory is invalid, by default a zero vector reward: artificial reward if the trajectory is invalid, by default 0 diff --git a/fancy_gym/envs/__init__.py b/fancy_gym/envs/__init__.py index c23e879..b5bc154 100644 --- a/fancy_gym/envs/__init__.py +++ b/fancy_gym/envs/__init__.py @@ -557,8 +557,8 @@ for _v in _versions: kwargs_dict_tt_promp['name'] = _v kwargs_dict_tt_promp['controller_kwargs']['p_gains'] = 0.5 * np.array([1.0, 4.0, 2.0, 4.0, 1.0, 4.0, 1.0]) kwargs_dict_tt_promp['controller_kwargs']['d_gains'] = 0.5 * np.array([0.1, 0.4, 0.2, 0.4, 0.1, 0.4, 0.1]) - kwargs_dict_tt_promp['phase_generator_kwargs']['learn_tau'] = False - kwargs_dict_tt_promp['phase_generator_kwargs']['learn_delay'] = False + kwargs_dict_tt_promp['phase_generator_kwargs']['learn_tau'] = True + kwargs_dict_tt_promp['phase_generator_kwargs']['learn_delay'] = True kwargs_dict_tt_promp['phase_generator_kwargs']['tau_bound'] = [0.8, 1.5] kwargs_dict_tt_promp['phase_generator_kwargs']['delay_bound'] = [0.05, 0.15] kwargs_dict_tt_promp['basis_generator_kwargs']['num_basis'] = 3 diff --git a/fancy_gym/envs/mujoco/table_tennis/mp_wrapper.py b/fancy_gym/envs/mujoco/table_tennis/mp_wrapper.py index e33ed6c..3370047 100644 --- a/fancy_gym/envs/mujoco/table_tennis/mp_wrapper.py +++ b/fancy_gym/envs/mujoco/table_tennis/mp_wrapper.py @@ -29,15 +29,16 @@ class TT_MPWrapper(RawInterfaceWrapper): def current_vel(self) -> Union[float, int, np.ndarray, Tuple]: return self.data.qvel[:7].copy() - def preprocessing_and_validity_callback(self, action, pos_traj, vel_traj): - return self.check_traj_validity(action, pos_traj, vel_traj) + def preprocessing_and_validity_callback(self, action: np.ndarray, pos_traj: np.ndarray, vel_traj: np.ndarray, + tau_bound: list, delay_bound:list): + return self.check_traj_validity(action, pos_traj, vel_traj, tau_bound, delay_bound) def set_episode_arguments(self, action, pos_traj, vel_traj): return pos_traj, vel_traj def invalid_traj_callback(self, action: np.ndarray, pos_traj: np.ndarray, vel_traj: np.ndarray, - return_contextual_obs: bool) -> Tuple[np.ndarray, float, bool, dict]: - return self.get_invalid_traj_step_return(action, pos_traj, return_contextual_obs) + return_contextual_obs: bool, tau_bound:list, delay_bound:list) -> Tuple[np.ndarray, float, bool, dict]: + return self.get_invalid_traj_step_return(action, pos_traj, return_contextual_obs, tau_bound, delay_bound) class TTVelObs_MPWrapper(TT_MPWrapper): diff --git a/fancy_gym/envs/mujoco/table_tennis/table_tennis_env.py b/fancy_gym/envs/mujoco/table_tennis/table_tennis_env.py index 7fb5e9f..3f86463 100644 --- a/fancy_gym/envs/mujoco/table_tennis/table_tennis_env.py +++ b/fancy_gym/envs/mujoco/table_tennis/table_tennis_env.py @@ -5,7 +5,7 @@ from gym import utils, spaces from gym.envs.mujoco import MujocoEnv from fancy_gym.envs.mujoco.table_tennis.table_tennis_utils import is_init_state_valid, magnus_force -from fancy_gym.envs.mujoco.table_tennis.table_tennis_utils import jnt_pos_low, jnt_pos_high, delay_bound, tau_bound +from fancy_gym.envs.mujoco.table_tennis.table_tennis_utils import jnt_pos_low, jnt_pos_high import mujoco @@ -225,7 +225,7 @@ class TableTennisEnv(MujocoEnv, utils.EzPickle): init_ball_state = self._generate_random_ball(random_pos=random_pos, random_vel=random_vel) return init_ball_state - def _get_traj_invalid_penalty(self, action, pos_traj): + def _get_traj_invalid_penalty(self, action, pos_traj, tau_bound, delay_bound): tau_invalid_penalty = 3 * (np.max([0, action[0] - tau_bound[1]]) + np.max([0, tau_bound[0] - action[0]])) delay_invalid_penalty = 3 * (np.max([0, action[1] - delay_bound[1]]) + np.max([0, delay_bound[0] - action[1]])) violate_high_bound_error = np.mean(np.maximum(pos_traj - jnt_pos_high, 0)) @@ -234,9 +234,9 @@ class TableTennisEnv(MujocoEnv, utils.EzPickle): violate_high_bound_error + violate_low_bound_error return -invalid_penalty - def get_invalid_traj_step_return(self, action, pos_traj, contextual_obs): + def get_invalid_traj_step_return(self, action, pos_traj, contextual_obs, tau_bound, delay_bound): obs = self._get_obs() if contextual_obs else np.concatenate([self._get_obs(), np.array([0])]) # 0 for invalid traj - penalty = self._get_traj_invalid_penalty(action, pos_traj) + penalty = self._get_traj_invalid_penalty(action, pos_traj, tau_bound, delay_bound) return obs, penalty, True, { "hit_ball": [False], "ball_returned_success": [False], @@ -247,7 +247,7 @@ class TableTennisEnv(MujocoEnv, utils.EzPickle): } @staticmethod - def check_traj_validity(action, pos_traj, vel_traj): + def check_traj_validity(action, pos_traj, vel_traj, tau_bound, delay_bound): time_invalid = action[0] > tau_bound[1] or action[0] < tau_bound[0] \ or action[1] > delay_bound[1] or action[1] < delay_bound[0] if time_invalid or np.any(pos_traj > jnt_pos_high) or np.any(pos_traj < jnt_pos_low): From a3df09379e20f33162e667131ed5118944f7618d Mon Sep 17 00:00:00 2001 From: Leo Kraft <56594417+leokraft@users.noreply.github.com> Date: Thu, 13 Jul 2023 19:18:43 +0200 Subject: [PATCH 2/9] Fix wrong argument order in tracking controller call --- fancy_gym/examples/pd_control_gain_tuning.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fancy_gym/examples/pd_control_gain_tuning.py b/fancy_gym/examples/pd_control_gain_tuning.py index 407bfa1..71a3ba4 100644 --- a/fancy_gym/examples/pd_control_gain_tuning.py +++ b/fancy_gym/examples/pd_control_gain_tuning.py @@ -31,8 +31,8 @@ ax = fig.add_subplot(1, 1, 1) img = ax.imshow(env.env.render(mode="rgb_array")) fig.show() -for t, pos_vel in enumerate(zip(pos, vel)): - actions = env.tracking_controller.get_action(pos_vel[0], pos_vel[1], env.current_vel, env.current_pos) +for t, (des_pos, des_vel) in enumerate(zip(pos, vel)): + actions = env.tracking_controller.get_action(des_pos, des_vel, env.current_pos, env.current_vel) actions = np.clip(actions, env.env.action_space.low, env.env.action_space.high) _, _, _, _ = env.env.step(actions) if t % 15 == 0: From fb9efb20e1dcb9c677e0e1fafa6f2ee761d4ed19 Mon Sep 17 00:00:00 2001 From: Dominik Roth Date: Thu, 24 Aug 2023 11:09:47 +0200 Subject: [PATCH 3/9] Updates the custom fancy envs to the versions used for the MP3 paper. (According to Hongyi only BoxPushing changed between current master and MP3 version) --- .../mujoco/box_pushing/box_pushing_env.py | 110 ++++++++++++++++-- .../envs/mujoco/box_pushing/mp_wrapper.py | 11 ++ 2 files changed, 110 insertions(+), 11 deletions(-) diff --git a/fancy_gym/envs/mujoco/box_pushing/box_pushing_env.py b/fancy_gym/envs/mujoco/box_pushing/box_pushing_env.py index 275bba1..7ecd214 100644 --- a/fancy_gym/envs/mujoco/box_pushing/box_pushing_env.py +++ b/fancy_gym/envs/mujoco/box_pushing/box_pushing_env.py @@ -26,7 +26,7 @@ class BoxPushingEnvBase(MujocoEnv, utils.EzPickle): 3. time-spatial-depend sparse reward """ - def __init__(self, frame_skip: int = 10): + def __init__(self, frame_skip: int = 10, random_init: bool = False): utils.EzPickle.__init__(**locals()) self._steps = 0 self.init_qpos_box_pushing = np.array([0., 0., 0., -1.5, 0., 1.5, 0., 0., 0., 0.6, 0.45, 0.0, 1., 0., 0., 0.]) @@ -39,6 +39,7 @@ class BoxPushingEnvBase(MujocoEnv, utils.EzPickle): self._desired_rod_quat = desired_rod_quat self._episode_energy = 0. + self.random_init = random_init MujocoEnv.__init__(self, model_path=os.path.join(os.path.dirname(__file__), "assets", "box_pushing.xml"), frame_skip=self.frame_skip, @@ -93,7 +94,7 @@ class BoxPushingEnvBase(MujocoEnv, utils.EzPickle): def reset_model(self): # rest box to initial position self.set_state(self.init_qpos_box_pushing, self.init_qvel_box_pushing) - box_init_pos = np.array([0.4, 0.3, -0.01, 0.0, 0.0, 0.0, 1.0]) + box_init_pos = self.sample_context() if self.random_init else np.array([0.4, 0.3, -0.01, 0.0, 0.0, 0.0, 1.0]) self.data.joint("box_joint").qpos = box_init_pos # set target position @@ -160,6 +161,9 @@ class BoxPushingEnvBase(MujocoEnv, utils.EzPickle): penalty -= v_coeff * abs(np.sum(q_dot_error[q_dot_error > 0.])) return penalty + def _get_box_vel(self): + return self.data.body("box_0").cvel.copy() + def get_body_jacp(self, name): id = mujoco.mj_name2id(self.model, 1, name) jacp = np.zeros((3, self.model.nv)) @@ -281,8 +285,8 @@ class BoxPushingEnvBase(MujocoEnv, utils.EzPickle): return q class BoxPushingDense(BoxPushingEnvBase): - def __init__(self, frame_skip: int = 10): - super(BoxPushingDense, self).__init__(frame_skip=frame_skip) + def __init__(self, frame_skip: int = 10, random_init: bool = False): + super(BoxPushingDense, self).__init__(frame_skip=frame_skip, random_init=random_init) def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat, rod_tip_pos, rod_quat, qpos, qvel, action): joint_penalty = self._joint_limit_violate_penalty(qpos, @@ -304,14 +308,14 @@ class BoxPushingDense(BoxPushingEnvBase): return reward class BoxPushingTemporalSparse(BoxPushingEnvBase): - def __init__(self, frame_skip: int = 10): - super(BoxPushingTemporalSparse, self).__init__(frame_skip=frame_skip) + def __init__(self, frame_skip: int = 10, random_init: bool = False): + super(BoxPushingTemporalSparse, self).__init__(frame_skip=frame_skip, random_init=random_init) def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat, rod_tip_pos, rod_quat, qpos, qvel, action): reward = 0. joint_penalty = self._joint_limit_violate_penalty(qpos, qvel, enable_pos_limit=True, enable_vel_limit=True) - energy_cost = -0.0005 * np.sum(np.square(action)) + energy_cost = -0.02 * np.sum(np.square(action)) tcp_box_dist_reward = -2 * np.clip(np.linalg.norm(box_pos - rod_tip_pos), 0.05, 100) reward += joint_penalty + tcp_box_dist_reward + energy_cost rod_inclined_angle = rotation_distance(rod_quat, desired_rod_quat) @@ -327,20 +331,23 @@ class BoxPushingTemporalSparse(BoxPushingEnvBase): box_goal_pos_dist_reward = -3.5 * box_goal_dist * 100 box_goal_rot_dist_reward = -rotation_distance(box_quat, target_quat) / np.pi * 100 - reward += box_goal_pos_dist_reward + box_goal_rot_dist_reward + ep_end_joint_vel = -50. * np.linalg.norm(qvel) + + reward += box_goal_pos_dist_reward + box_goal_rot_dist_reward + ep_end_joint_vel return reward + class BoxPushingTemporalSpatialSparse(BoxPushingEnvBase): - def __init__(self, frame_skip: int = 10): - super(BoxPushingTemporalSpatialSparse, self).__init__(frame_skip=frame_skip) + def __init__(self, frame_skip: int = 10, random_init: bool = False): + super(BoxPushingTemporalSpatialSparse, self).__init__(frame_skip=frame_skip, random_init=random_init) def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat, rod_tip_pos, rod_quat, qpos, qvel, action): reward = 0. joint_penalty = self._joint_limit_violate_penalty(qpos, qvel, enable_pos_limit=True, enable_vel_limit=True) - energy_cost = -0.0005 * np.sum(np.square(action)) + energy_cost = -0.02 * np.sum(np.square(action)) tcp_box_dist_reward = -2 * np.clip(np.linalg.norm(box_pos - rod_tip_pos), 0.05, 100) reward += joint_penalty + tcp_box_dist_reward + energy_cost rod_inclined_angle = rotation_distance(rod_quat, desired_rod_quat) @@ -360,3 +367,84 @@ class BoxPushingTemporalSpatialSparse(BoxPushingEnvBase): reward += box_goal_pos_dist_reward + box_goal_rot_dist_reward return reward + +class BoxPushingTemporalSpatialSparse2(BoxPushingEnvBase): + + def __init__(self, frame_skip: int = 10): + super(BoxPushingTemporalSpatialSparse2, self).__init__(frame_skip=frame_skip) + + def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat, + rod_tip_pos, rod_quat, qpos, qvel, action): + reward = 0. + joint_penalty = self._joint_limit_violate_penalty(qpos, qvel, enable_pos_limit=True, enable_vel_limit=True) + energy_cost = -0.0005 * np.sum(np.square(action)) + tcp_box_dist_reward = -2 * np.clip(np.linalg.norm(box_pos - rod_tip_pos), 0.05, 100) + + reward += joint_penalty + energy_cost + tcp_box_dist_reward + + rod_inclined_angle = rotation_distance(rod_quat, desired_rod_quat) + + if rod_inclined_angle > np.pi / 4: + reward -= rod_inclined_angle / (np.pi) + + if not episode_end: + return reward + + # Force the robot to stop at the end + reward += -50. * np.linalg.norm(qvel) + + box_goal_dist = np.linalg.norm(box_pos - target_pos) + + if box_goal_dist < 0.1: + box_goal_pos_dist_reward = np.clip(- 350. * box_goal_dist, -200, 0) + box_goal_rot_dist_reward = np.clip(- rotation_distance(box_quat, target_quat)/np.pi * 100., -100, 0) + reward += box_goal_pos_dist_reward + box_goal_rot_dist_reward + else: + reward -= 300. + + return reward + + +class BoxPushingBruceSparse(BoxPushingEnvBase): + def __init__(self, frame_skip: int = 10): + super(BoxPushingBruceSparse, self).__init__(frame_skip=frame_skip) + + def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat, + rod_tip_pos, rod_quat, qpos, qvel, action): + reward = 0. + joint_penalty = self._joint_limit_violate_penalty(qpos, qvel, enable_pos_limit=True, enable_vel_limit=True) + energy_cost = -0.0005 * np.sum(np.square(action)) + reward += joint_penalty + energy_cost + + if not episode_end: + return reward + + box_goal_dist = np.linalg.norm(box_pos - target_pos) + + box_goal_pos_dist_reward = -3.5 * box_goal_dist * 100 + box_goal_rot_dist_reward = -rotation_distance(box_quat, target_quat) / np.pi * 100 + + reward += box_goal_pos_dist_reward + box_goal_rot_dist_reward + self._get_end_vel_penalty() + + return reward + + def _get_end_vel_penalty(self): + rot_coeff = 150. + pos_coeff = 150. + box_rot_pos_vel = self._get_box_vel() + box_rot_vel = box_rot_pos_vel[:3] + box_pos_vel = box_rot_pos_vel[3:] + return -rot_coeff * np.linalg.norm(box_rot_vel) - pos_coeff * np.linalg.norm(box_pos_vel) + + + +if __name__=="__main__": + import fancy_gym + env = fancy_gym.make("BoxPushingDenseProDMP-v0", seed=0) + env.reset() + for i in range(1000): + env.render(mode="human") + obs, rew, done, info = env.step(env.action_space.sample()) + if done: + env.reset() + # print(f"box_end_velocity: {info['box_end_vel']}") diff --git a/fancy_gym/envs/mujoco/box_pushing/mp_wrapper.py b/fancy_gym/envs/mujoco/box_pushing/mp_wrapper.py index 09b2d65..8da6855 100644 --- a/fancy_gym/envs/mujoco/box_pushing/mp_wrapper.py +++ b/fancy_gym/envs/mujoco/box_pushing/mp_wrapper.py @@ -10,6 +10,17 @@ class MPWrapper(RawInterfaceWrapper): # Random x goal + random init pos @property def context_mask(self): + if self.random_init: + return np.hstack([ + [True] * 7, # joints position + [False] * 7, # joints velocity + [True] * 3, # position of box + [True] * 4, # orientation of box + [True] * 3, # position of target + [True] * 4, # orientation of target + # [True] * 1, # time + ]) + return np.hstack([ [False] * 7, # joints position [False] * 7, # joints velocity From 9fd6b7281bd8a5dfbde09d0b5985ca325dd18588 Mon Sep 17 00:00:00 2001 From: Dominik Roth Date: Thu, 24 Aug 2023 11:09:47 +0200 Subject: [PATCH 4/9] Updates the custom fancy envs to the versions used for the MP3 paper. (According to Hongyi only BoxPushing changed between current master and MP3 version) --- .../mujoco/box_pushing/box_pushing_env.py | 110 ++++++++++++++++-- .../envs/mujoco/box_pushing/mp_wrapper.py | 11 ++ 2 files changed, 110 insertions(+), 11 deletions(-) diff --git a/fancy_gym/envs/mujoco/box_pushing/box_pushing_env.py b/fancy_gym/envs/mujoco/box_pushing/box_pushing_env.py index 275bba1..7ecd214 100644 --- a/fancy_gym/envs/mujoco/box_pushing/box_pushing_env.py +++ b/fancy_gym/envs/mujoco/box_pushing/box_pushing_env.py @@ -26,7 +26,7 @@ class BoxPushingEnvBase(MujocoEnv, utils.EzPickle): 3. time-spatial-depend sparse reward """ - def __init__(self, frame_skip: int = 10): + def __init__(self, frame_skip: int = 10, random_init: bool = False): utils.EzPickle.__init__(**locals()) self._steps = 0 self.init_qpos_box_pushing = np.array([0., 0., 0., -1.5, 0., 1.5, 0., 0., 0., 0.6, 0.45, 0.0, 1., 0., 0., 0.]) @@ -39,6 +39,7 @@ class BoxPushingEnvBase(MujocoEnv, utils.EzPickle): self._desired_rod_quat = desired_rod_quat self._episode_energy = 0. + self.random_init = random_init MujocoEnv.__init__(self, model_path=os.path.join(os.path.dirname(__file__), "assets", "box_pushing.xml"), frame_skip=self.frame_skip, @@ -93,7 +94,7 @@ class BoxPushingEnvBase(MujocoEnv, utils.EzPickle): def reset_model(self): # rest box to initial position self.set_state(self.init_qpos_box_pushing, self.init_qvel_box_pushing) - box_init_pos = np.array([0.4, 0.3, -0.01, 0.0, 0.0, 0.0, 1.0]) + box_init_pos = self.sample_context() if self.random_init else np.array([0.4, 0.3, -0.01, 0.0, 0.0, 0.0, 1.0]) self.data.joint("box_joint").qpos = box_init_pos # set target position @@ -160,6 +161,9 @@ class BoxPushingEnvBase(MujocoEnv, utils.EzPickle): penalty -= v_coeff * abs(np.sum(q_dot_error[q_dot_error > 0.])) return penalty + def _get_box_vel(self): + return self.data.body("box_0").cvel.copy() + def get_body_jacp(self, name): id = mujoco.mj_name2id(self.model, 1, name) jacp = np.zeros((3, self.model.nv)) @@ -281,8 +285,8 @@ class BoxPushingEnvBase(MujocoEnv, utils.EzPickle): return q class BoxPushingDense(BoxPushingEnvBase): - def __init__(self, frame_skip: int = 10): - super(BoxPushingDense, self).__init__(frame_skip=frame_skip) + def __init__(self, frame_skip: int = 10, random_init: bool = False): + super(BoxPushingDense, self).__init__(frame_skip=frame_skip, random_init=random_init) def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat, rod_tip_pos, rod_quat, qpos, qvel, action): joint_penalty = self._joint_limit_violate_penalty(qpos, @@ -304,14 +308,14 @@ class BoxPushingDense(BoxPushingEnvBase): return reward class BoxPushingTemporalSparse(BoxPushingEnvBase): - def __init__(self, frame_skip: int = 10): - super(BoxPushingTemporalSparse, self).__init__(frame_skip=frame_skip) + def __init__(self, frame_skip: int = 10, random_init: bool = False): + super(BoxPushingTemporalSparse, self).__init__(frame_skip=frame_skip, random_init=random_init) def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat, rod_tip_pos, rod_quat, qpos, qvel, action): reward = 0. joint_penalty = self._joint_limit_violate_penalty(qpos, qvel, enable_pos_limit=True, enable_vel_limit=True) - energy_cost = -0.0005 * np.sum(np.square(action)) + energy_cost = -0.02 * np.sum(np.square(action)) tcp_box_dist_reward = -2 * np.clip(np.linalg.norm(box_pos - rod_tip_pos), 0.05, 100) reward += joint_penalty + tcp_box_dist_reward + energy_cost rod_inclined_angle = rotation_distance(rod_quat, desired_rod_quat) @@ -327,20 +331,23 @@ class BoxPushingTemporalSparse(BoxPushingEnvBase): box_goal_pos_dist_reward = -3.5 * box_goal_dist * 100 box_goal_rot_dist_reward = -rotation_distance(box_quat, target_quat) / np.pi * 100 - reward += box_goal_pos_dist_reward + box_goal_rot_dist_reward + ep_end_joint_vel = -50. * np.linalg.norm(qvel) + + reward += box_goal_pos_dist_reward + box_goal_rot_dist_reward + ep_end_joint_vel return reward + class BoxPushingTemporalSpatialSparse(BoxPushingEnvBase): - def __init__(self, frame_skip: int = 10): - super(BoxPushingTemporalSpatialSparse, self).__init__(frame_skip=frame_skip) + def __init__(self, frame_skip: int = 10, random_init: bool = False): + super(BoxPushingTemporalSpatialSparse, self).__init__(frame_skip=frame_skip, random_init=random_init) def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat, rod_tip_pos, rod_quat, qpos, qvel, action): reward = 0. joint_penalty = self._joint_limit_violate_penalty(qpos, qvel, enable_pos_limit=True, enable_vel_limit=True) - energy_cost = -0.0005 * np.sum(np.square(action)) + energy_cost = -0.02 * np.sum(np.square(action)) tcp_box_dist_reward = -2 * np.clip(np.linalg.norm(box_pos - rod_tip_pos), 0.05, 100) reward += joint_penalty + tcp_box_dist_reward + energy_cost rod_inclined_angle = rotation_distance(rod_quat, desired_rod_quat) @@ -360,3 +367,84 @@ class BoxPushingTemporalSpatialSparse(BoxPushingEnvBase): reward += box_goal_pos_dist_reward + box_goal_rot_dist_reward return reward + +class BoxPushingTemporalSpatialSparse2(BoxPushingEnvBase): + + def __init__(self, frame_skip: int = 10): + super(BoxPushingTemporalSpatialSparse2, self).__init__(frame_skip=frame_skip) + + def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat, + rod_tip_pos, rod_quat, qpos, qvel, action): + reward = 0. + joint_penalty = self._joint_limit_violate_penalty(qpos, qvel, enable_pos_limit=True, enable_vel_limit=True) + energy_cost = -0.0005 * np.sum(np.square(action)) + tcp_box_dist_reward = -2 * np.clip(np.linalg.norm(box_pos - rod_tip_pos), 0.05, 100) + + reward += joint_penalty + energy_cost + tcp_box_dist_reward + + rod_inclined_angle = rotation_distance(rod_quat, desired_rod_quat) + + if rod_inclined_angle > np.pi / 4: + reward -= rod_inclined_angle / (np.pi) + + if not episode_end: + return reward + + # Force the robot to stop at the end + reward += -50. * np.linalg.norm(qvel) + + box_goal_dist = np.linalg.norm(box_pos - target_pos) + + if box_goal_dist < 0.1: + box_goal_pos_dist_reward = np.clip(- 350. * box_goal_dist, -200, 0) + box_goal_rot_dist_reward = np.clip(- rotation_distance(box_quat, target_quat)/np.pi * 100., -100, 0) + reward += box_goal_pos_dist_reward + box_goal_rot_dist_reward + else: + reward -= 300. + + return reward + + +class BoxPushingBruceSparse(BoxPushingEnvBase): + def __init__(self, frame_skip: int = 10): + super(BoxPushingBruceSparse, self).__init__(frame_skip=frame_skip) + + def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat, + rod_tip_pos, rod_quat, qpos, qvel, action): + reward = 0. + joint_penalty = self._joint_limit_violate_penalty(qpos, qvel, enable_pos_limit=True, enable_vel_limit=True) + energy_cost = -0.0005 * np.sum(np.square(action)) + reward += joint_penalty + energy_cost + + if not episode_end: + return reward + + box_goal_dist = np.linalg.norm(box_pos - target_pos) + + box_goal_pos_dist_reward = -3.5 * box_goal_dist * 100 + box_goal_rot_dist_reward = -rotation_distance(box_quat, target_quat) / np.pi * 100 + + reward += box_goal_pos_dist_reward + box_goal_rot_dist_reward + self._get_end_vel_penalty() + + return reward + + def _get_end_vel_penalty(self): + rot_coeff = 150. + pos_coeff = 150. + box_rot_pos_vel = self._get_box_vel() + box_rot_vel = box_rot_pos_vel[:3] + box_pos_vel = box_rot_pos_vel[3:] + return -rot_coeff * np.linalg.norm(box_rot_vel) - pos_coeff * np.linalg.norm(box_pos_vel) + + + +if __name__=="__main__": + import fancy_gym + env = fancy_gym.make("BoxPushingDenseProDMP-v0", seed=0) + env.reset() + for i in range(1000): + env.render(mode="human") + obs, rew, done, info = env.step(env.action_space.sample()) + if done: + env.reset() + # print(f"box_end_velocity: {info['box_end_vel']}") diff --git a/fancy_gym/envs/mujoco/box_pushing/mp_wrapper.py b/fancy_gym/envs/mujoco/box_pushing/mp_wrapper.py index 09b2d65..8da6855 100644 --- a/fancy_gym/envs/mujoco/box_pushing/mp_wrapper.py +++ b/fancy_gym/envs/mujoco/box_pushing/mp_wrapper.py @@ -10,6 +10,17 @@ class MPWrapper(RawInterfaceWrapper): # Random x goal + random init pos @property def context_mask(self): + if self.random_init: + return np.hstack([ + [True] * 7, # joints position + [False] * 7, # joints velocity + [True] * 3, # position of box + [True] * 4, # orientation of box + [True] * 3, # position of target + [True] * 4, # orientation of target + # [True] * 1, # time + ]) + return np.hstack([ [False] * 7, # joints position [False] * 7, # joints velocity From 1c9e973bbd6a507a79c9e9c9c60869709351df56 Mon Sep 17 00:00:00 2001 From: "hongyi.zhou" Date: Mon, 15 May 2023 12:43:37 +0200 Subject: [PATCH 5/9] update .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 91a91dd..e6ea8d1 100644 --- a/.gitignore +++ b/.gitignore @@ -106,6 +106,7 @@ venv.bak/ # pycharm .DS_Store .idea +.vscode #configs /configs/db.cfg From d667cd0ff2323210af1696e5f16bfc57d302b10c Mon Sep 17 00:00:00 2001 From: "hongyi.zhou" Date: Mon, 15 May 2023 13:19:04 +0200 Subject: [PATCH 6/9] add random initialized box pushing --- fancy_gym/envs/__init__.py | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/fancy_gym/envs/__init__.py b/fancy_gym/envs/__init__.py index b5bc154..32bd8f8 100644 --- a/fancy_gym/envs/__init__.py +++ b/fancy_gym/envs/__init__.py @@ -237,6 +237,12 @@ for reward_type in ["Dense", "TemporalSparse", "TemporalSpatialSparse"]: entry_point='fancy_gym.envs.mujoco:BoxPushing{}'.format(reward_type), max_episode_steps=MAX_EPISODE_STEPS_BOX_PUSHING, ) + register( + id='BoxPushingRandomInit{}-v0'.format(reward_type), + entry_point='fancy_gym.envs.mujoco:BoxPushing{}'.format(reward_type), + max_episode_steps=MAX_EPISODE_STEPS_BOX_PUSHING, + kwargs={"random_init": True} + ) # Here we use the same reward as in BeerPong-v0, but now consider after the release, # only one time step, i.e. we simulate until the end of th episode @@ -500,7 +506,9 @@ for _v in _versions: # ######################################################################################################################## ## Box Pushing -_versions = ['BoxPushingDense-v0', 'BoxPushingTemporalSparse-v0', 'BoxPushingTemporalSpatialSparse-v0'] +_versions = ['BoxPushingDense-v0', 'BoxPushingTemporalSparse-v0', 'BoxPushingTemporalSpatialSparse-v0', + 'BoxPushingRandomInitDense-v0', 'BoxPushingRandomInitTemporalSparse-v0', + 'BoxPushingRandomInitTemporalSpatialSparse-v0'] for _v in _versions: _name = _v.split("-") _env_id = f'{_name[0]}ProMP-{_name[1]}' @@ -518,6 +526,27 @@ for _v in _versions: ) ALL_FANCY_MOVEMENT_PRIMITIVE_ENVIRONMENTS["ProMP"].append(_env_id) +for _v in _versions: + _name = _v.split("-") + _env_id = f'{_name[0]}ProDMP-{_name[1]}' + kwargs_dict_box_pushing_prodmp = deepcopy(DEFAULT_BB_DICT_ProDMP) + kwargs_dict_box_pushing_prodmp['wrappers'].append(mujoco.box_pushing.MPWrapper) + kwargs_dict_box_pushing_prodmp['name'] = _v + kwargs_dict_box_pushing_prodmp['controller_kwargs']['p_gains'] = 0.01 * np.array([120., 120., 120., 120., 50., 30., 10.]) + kwargs_dict_box_pushing_prodmp['controller_kwargs']['d_gains'] = 0.01 * np.array([10., 10., 10., 10., 6., 5., 3.]) + kwargs_dict_box_pushing_prodmp['trajectory_generator_kwargs']['weights_scale'] = 0.3 + kwargs_dict_box_pushing_prodmp['trajectory_generator_kwargs']['goal_scale'] = 0.3 + kwargs_dict_box_pushing_prodmp['trajectory_generator_kwargs']['auto_scale_basis'] = True + kwargs_dict_box_pushing_prodmp['basis_generator_kwargs']['num_basis'] = 4 + kwargs_dict_box_pushing_prodmp['basis_generator_kwargs']['basis_bandwidth_factor'] = 3 + kwargs_dict_box_pushing_prodmp['phase_generator_kwargs']['alpha_phase'] = 3 + register( + id=_env_id, + entry_point='fancy_gym.utils.make_env_helpers:make_bb_env_helper', + kwargs=kwargs_dict_box_pushing_prodmp + ) + ALL_FANCY_MOVEMENT_PRIMITIVE_ENVIRONMENTS["ProDMP"].append(_env_id) + for _v in _versions: _name = _v.split("-") _env_id = f'{_name[0]}ReplanProDMP-{_name[1]}' @@ -529,9 +558,7 @@ for _v in _versions: kwargs_dict_box_pushing_prodmp['trajectory_generator_kwargs']['weights_scale'] = 0.3 kwargs_dict_box_pushing_prodmp['trajectory_generator_kwargs']['goal_scale'] = 0.3 kwargs_dict_box_pushing_prodmp['trajectory_generator_kwargs']['auto_scale_basis'] = True - kwargs_dict_box_pushing_prodmp['trajectory_generator_kwargs']['goal_offset'] = 1.0 - kwargs_dict_box_pushing_prodmp['trajectory_generator_kwargs']['disable_goal'] = True - kwargs_dict_box_pushing_prodmp['basis_generator_kwargs']['num_basis'] = 5 + kwargs_dict_box_pushing_prodmp['basis_generator_kwargs']['num_basis'] = 4 kwargs_dict_box_pushing_prodmp['basis_generator_kwargs']['basis_bandwidth_factor'] = 3 kwargs_dict_box_pushing_prodmp['phase_generator_kwargs']['alpha_phase'] = 3 kwargs_dict_box_pushing_prodmp['black_box_kwargs']['max_planning_times'] = 4 From 025d83e2d374afce26c93292b4b86b1dd63e2995 Mon Sep 17 00:00:00 2001 From: "hongyi.zhou" Date: Tue, 5 Sep 2023 10:03:27 +0200 Subject: [PATCH 7/9] add random init options to box pushing --- .../envs/mujoco/box_pushing/box_pushing_env.py | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/fancy_gym/envs/mujoco/box_pushing/box_pushing_env.py b/fancy_gym/envs/mujoco/box_pushing/box_pushing_env.py index 7ecd214..e3c9160 100644 --- a/fancy_gym/envs/mujoco/box_pushing/box_pushing_env.py +++ b/fancy_gym/envs/mujoco/box_pushing/box_pushing_env.py @@ -405,9 +405,9 @@ class BoxPushingTemporalSpatialSparse2(BoxPushingEnvBase): return reward -class BoxPushingBruceSparse(BoxPushingEnvBase): +class BoxPushingNoConstraintSparse(BoxPushingEnvBase): def __init__(self, frame_skip: int = 10): - super(BoxPushingBruceSparse, self).__init__(frame_skip=frame_skip) + super(BoxPushingNoConstraintSparse, self).__init__(frame_skip=frame_skip) def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat, rod_tip_pos, rod_quat, qpos, qvel, action): @@ -435,16 +435,3 @@ class BoxPushingBruceSparse(BoxPushingEnvBase): box_rot_vel = box_rot_pos_vel[:3] box_pos_vel = box_rot_pos_vel[3:] return -rot_coeff * np.linalg.norm(box_rot_vel) - pos_coeff * np.linalg.norm(box_pos_vel) - - - -if __name__=="__main__": - import fancy_gym - env = fancy_gym.make("BoxPushingDenseProDMP-v0", seed=0) - env.reset() - for i in range(1000): - env.render(mode="human") - obs, rew, done, info = env.step(env.action_space.sample()) - if done: - env.reset() - # print(f"box_end_velocity: {info['box_end_vel']}") From 886e1368c1bb66cbc9352ac784a1028afef2b2e2 Mon Sep 17 00:00:00 2001 From: "hongyi.zhou" Date: Tue, 5 Sep 2023 10:04:30 +0200 Subject: [PATCH 8/9] add random init flag to all box pushing env --- fancy_gym/envs/mujoco/box_pushing/box_pushing_env.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fancy_gym/envs/mujoco/box_pushing/box_pushing_env.py b/fancy_gym/envs/mujoco/box_pushing/box_pushing_env.py index e3c9160..06f7e02 100644 --- a/fancy_gym/envs/mujoco/box_pushing/box_pushing_env.py +++ b/fancy_gym/envs/mujoco/box_pushing/box_pushing_env.py @@ -370,8 +370,8 @@ class BoxPushingTemporalSpatialSparse(BoxPushingEnvBase): class BoxPushingTemporalSpatialSparse2(BoxPushingEnvBase): - def __init__(self, frame_skip: int = 10): - super(BoxPushingTemporalSpatialSparse2, self).__init__(frame_skip=frame_skip) + def __init__(self, frame_skip: int = 10, random_init: bool = False): + super(BoxPushingTemporalSpatialSparse2, self).__init__(frame_skip=frame_skip, random_init=random_init) def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat, rod_tip_pos, rod_quat, qpos, qvel, action): @@ -406,8 +406,8 @@ class BoxPushingTemporalSpatialSparse2(BoxPushingEnvBase): class BoxPushingNoConstraintSparse(BoxPushingEnvBase): - def __init__(self, frame_skip: int = 10): - super(BoxPushingNoConstraintSparse, self).__init__(frame_skip=frame_skip) + def __init__(self, frame_skip: int = 10, random_init: bool = False): + super(BoxPushingNoConstraintSparse, self).__init__(frame_skip=frame_skip, random_init=random_init) def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat, rod_tip_pos, rod_quat, qpos, qvel, action): From 832102da5908a5e48ca75f8897300d7e711058af Mon Sep 17 00:00:00 2001 From: "hongyi.zhou" Date: Wed, 6 Sep 2023 12:56:43 +0200 Subject: [PATCH 9/9] use self.np_random for all the Quadrant --- fancy_gym/envs/mujoco/reacher/reacher.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fancy_gym/envs/mujoco/reacher/reacher.py b/fancy_gym/envs/mujoco/reacher/reacher.py index ccd0073..e55f13a 100644 --- a/fancy_gym/envs/mujoco/reacher/reacher.py +++ b/fancy_gym/envs/mujoco/reacher/reacher.py @@ -79,11 +79,11 @@ class ReacherEnv(MujocoEnv, utils.EzPickle): # I Quadrant # self.goal = self.np_random.uniform(low=0, high=self.n_links / 10, size=2) # II Quadrant - # self.goal = np.random.uniform(low=[-self.n_links / 10, 0], high=[0, self.n_links / 10], size=2) + # self.goal = self.np_random.uniform(low=[-self.n_links / 10, 0], high=[0, self.n_links / 10], size=2) # II + III Quadrant - # self.goal = np.random.uniform(low=-self.n_links / 10, high=[0, self.n_links / 10], size=2) + # self.goal = self.np_random.uniform(low=-self.n_links / 10, high=[0, self.n_links / 10], size=2) # I + II Quadrant - # self.goal = np.random.uniform(low=[-self.n_links / 10, 0], high=self.n_links, size=2) + # self.goal = self.np_random.uniform(low=[-self.n_links / 10, 0], high=self.n_links, size=2) if np.linalg.norm(self.goal) < self.n_links / 10: break