Merge remote-tracking branch 'original/legacy' into HEAD
This commit is contained in:
commit
34a16ea5fe
1
.gitignore
vendored
1
.gitignore
vendored
@ -106,6 +106,7 @@ venv.bak/
|
|||||||
# pycharm
|
# pycharm
|
||||||
.DS_Store
|
.DS_Store
|
||||||
.idea
|
.idea
|
||||||
|
.vscode
|
||||||
|
|
||||||
#configs
|
#configs
|
||||||
/configs/db.cfg
|
/configs/db.cfg
|
||||||
|
@ -55,6 +55,14 @@ class BlackBoxWrapper(gym.ObservationWrapper):
|
|||||||
# self.traj_gen.set_mp_times(self.time_steps)
|
# self.traj_gen.set_mp_times(self.time_steps)
|
||||||
self.traj_gen.set_duration(self.duration, self.dt)
|
self.traj_gen.set_duration(self.duration, self.dt)
|
||||||
|
|
||||||
|
# check
|
||||||
|
self.tau_bound = [-np.inf, np.inf]
|
||||||
|
self.delay_bound = [-np.inf, np.inf]
|
||||||
|
if hasattr(self.traj_gen.phase_gn, "tau_bound"):
|
||||||
|
self.tau_bound = self.traj_gen.phase_gn.tau_bound
|
||||||
|
if hasattr(self.traj_gen.phase_gn, "delay_bound"):
|
||||||
|
self.delay_bound = self.traj_gen.phase_gn.delay_bound
|
||||||
|
|
||||||
# reward computation
|
# reward computation
|
||||||
self.reward_aggregation = reward_aggregation
|
self.reward_aggregation = reward_aggregation
|
||||||
|
|
||||||
@ -139,7 +147,8 @@ class BlackBoxWrapper(gym.ObservationWrapper):
|
|||||||
|
|
||||||
position, velocity = self.get_trajectory(action)
|
position, velocity = self.get_trajectory(action)
|
||||||
position, velocity = self.env.set_episode_arguments(action, position, velocity)
|
position, velocity = self.env.set_episode_arguments(action, position, velocity)
|
||||||
traj_is_valid, position, velocity = self.env.preprocessing_and_validity_callback(action, position, velocity)
|
traj_is_valid, position, velocity = self.env.preprocessing_and_validity_callback(action, position, velocity,
|
||||||
|
self.tau_bound, self.delay_bound)
|
||||||
|
|
||||||
trajectory_length = len(position)
|
trajectory_length = len(position)
|
||||||
rewards = np.zeros(shape=(trajectory_length,))
|
rewards = np.zeros(shape=(trajectory_length,))
|
||||||
@ -153,7 +162,8 @@ class BlackBoxWrapper(gym.ObservationWrapper):
|
|||||||
|
|
||||||
if not traj_is_valid:
|
if not traj_is_valid:
|
||||||
obs, trajectory_return, done, infos = self.env.invalid_traj_callback(action, position, velocity,
|
obs, trajectory_return, done, infos = self.env.invalid_traj_callback(action, position, velocity,
|
||||||
self.return_context_observation)
|
self.return_context_observation,
|
||||||
|
self.tau_bound, self.delay_bound)
|
||||||
return self.observation(obs), trajectory_return, done, infos
|
return self.observation(obs), trajectory_return, done, infos
|
||||||
|
|
||||||
self.plan_steps += 1
|
self.plan_steps += 1
|
||||||
|
@ -52,7 +52,8 @@ class RawInterfaceWrapper(gym.Wrapper):
|
|||||||
"""
|
"""
|
||||||
return self.env.dt
|
return self.env.dt
|
||||||
|
|
||||||
def preprocessing_and_validity_callback(self, action: np.ndarray, pos_traj: np.ndarray, vel_traj: np.ndarray) \
|
def preprocessing_and_validity_callback(self, action: np.ndarray, pos_traj: np.ndarray, vel_traj: np.ndarray,
|
||||||
|
tau_bound: list = None, delay_bound: list = None ) \
|
||||||
-> Tuple[bool, np.ndarray, np.ndarray]:
|
-> Tuple[bool, np.ndarray, np.ndarray]:
|
||||||
"""
|
"""
|
||||||
Used to preprocess the action and check if the desired trajectory is valid.
|
Used to preprocess the action and check if the desired trajectory is valid.
|
||||||
@ -61,6 +62,8 @@ class RawInterfaceWrapper(gym.Wrapper):
|
|||||||
specified, else only traj_gen parameters
|
specified, else only traj_gen parameters
|
||||||
pos_traj: a vector instance of the raw position trajectory
|
pos_traj: a vector instance of the raw position trajectory
|
||||||
vel_traj: a vector instance of the raw velocity trajectory
|
vel_traj: a vector instance of the raw velocity trajectory
|
||||||
|
tau_bound: a list of two elements, the lower and upper bound of the trajectory length scaling factor
|
||||||
|
delay_bound: a list of two elements, the lower and upper bound of the time to wait before execute
|
||||||
Returns:
|
Returns:
|
||||||
validity flag: bool, True if the raw trajectory is valid, False if not
|
validity flag: bool, True if the raw trajectory is valid, False if not
|
||||||
pos_traj: a vector instance of the preprocessed position trajectory
|
pos_traj: a vector instance of the preprocessed position trajectory
|
||||||
@ -97,7 +100,8 @@ class RawInterfaceWrapper(gym.Wrapper):
|
|||||||
"""
|
"""
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def invalid_traj_callback(self, action: np.ndarray, pos_traj: np.ndarray, vel_traj: np.ndarray) -> Tuple[np.ndarray, float, bool, dict]:
|
def invalid_traj_callback(self, action: np.ndarray, pos_traj: np.ndarray, vel_traj: np.ndarray,
|
||||||
|
tau_bound: list, delay_bound: list) -> Tuple[np.ndarray, float, bool, dict]:
|
||||||
"""
|
"""
|
||||||
Used to return a artificial return from the env if the desired trajectory is invalid.
|
Used to return a artificial return from the env if the desired trajectory is invalid.
|
||||||
Args:
|
Args:
|
||||||
@ -105,6 +109,8 @@ class RawInterfaceWrapper(gym.Wrapper):
|
|||||||
specified, else only traj_gen parameters
|
specified, else only traj_gen parameters
|
||||||
pos_traj: a vector instance of the raw position trajectory
|
pos_traj: a vector instance of the raw position trajectory
|
||||||
vel_traj: a vector instance of the raw velocity trajectory
|
vel_traj: a vector instance of the raw velocity trajectory
|
||||||
|
tau_bound: a list of two elements, the lower and upper bound of the trajectory length scaling factor
|
||||||
|
delay_bound: a list of two elements, the lower and upper bound of the time to wait before execute
|
||||||
Returns:
|
Returns:
|
||||||
obs: artificial observation if the trajectory is invalid, by default a zero vector
|
obs: artificial observation if the trajectory is invalid, by default a zero vector
|
||||||
reward: artificial reward if the trajectory is invalid, by default 0
|
reward: artificial reward if the trajectory is invalid, by default 0
|
||||||
|
@ -237,6 +237,12 @@ for reward_type in ["Dense", "TemporalSparse", "TemporalSpatialSparse"]:
|
|||||||
entry_point='fancy_gym.envs.mujoco:BoxPushing{}'.format(reward_type),
|
entry_point='fancy_gym.envs.mujoco:BoxPushing{}'.format(reward_type),
|
||||||
max_episode_steps=MAX_EPISODE_STEPS_BOX_PUSHING,
|
max_episode_steps=MAX_EPISODE_STEPS_BOX_PUSHING,
|
||||||
)
|
)
|
||||||
|
register(
|
||||||
|
id='BoxPushingRandomInit{}-v0'.format(reward_type),
|
||||||
|
entry_point='fancy_gym.envs.mujoco:BoxPushing{}'.format(reward_type),
|
||||||
|
max_episode_steps=MAX_EPISODE_STEPS_BOX_PUSHING,
|
||||||
|
kwargs={"random_init": True}
|
||||||
|
)
|
||||||
|
|
||||||
# Here we use the same reward as in BeerPong-v0, but now consider after the release,
|
# Here we use the same reward as in BeerPong-v0, but now consider after the release,
|
||||||
# only one time step, i.e. we simulate until the end of th episode
|
# only one time step, i.e. we simulate until the end of th episode
|
||||||
@ -500,7 +506,9 @@ for _v in _versions:
|
|||||||
# ########################################################################################################################
|
# ########################################################################################################################
|
||||||
|
|
||||||
## Box Pushing
|
## Box Pushing
|
||||||
_versions = ['BoxPushingDense-v0', 'BoxPushingTemporalSparse-v0', 'BoxPushingTemporalSpatialSparse-v0']
|
_versions = ['BoxPushingDense-v0', 'BoxPushingTemporalSparse-v0', 'BoxPushingTemporalSpatialSparse-v0',
|
||||||
|
'BoxPushingRandomInitDense-v0', 'BoxPushingRandomInitTemporalSparse-v0',
|
||||||
|
'BoxPushingRandomInitTemporalSpatialSparse-v0']
|
||||||
for _v in _versions:
|
for _v in _versions:
|
||||||
_name = _v.split("-")
|
_name = _v.split("-")
|
||||||
_env_id = f'{_name[0]}ProMP-{_name[1]}'
|
_env_id = f'{_name[0]}ProMP-{_name[1]}'
|
||||||
@ -518,6 +526,27 @@ for _v in _versions:
|
|||||||
)
|
)
|
||||||
ALL_FANCY_MOVEMENT_PRIMITIVE_ENVIRONMENTS["ProMP"].append(_env_id)
|
ALL_FANCY_MOVEMENT_PRIMITIVE_ENVIRONMENTS["ProMP"].append(_env_id)
|
||||||
|
|
||||||
|
for _v in _versions:
|
||||||
|
_name = _v.split("-")
|
||||||
|
_env_id = f'{_name[0]}ProDMP-{_name[1]}'
|
||||||
|
kwargs_dict_box_pushing_prodmp = deepcopy(DEFAULT_BB_DICT_ProDMP)
|
||||||
|
kwargs_dict_box_pushing_prodmp['wrappers'].append(mujoco.box_pushing.MPWrapper)
|
||||||
|
kwargs_dict_box_pushing_prodmp['name'] = _v
|
||||||
|
kwargs_dict_box_pushing_prodmp['controller_kwargs']['p_gains'] = 0.01 * np.array([120., 120., 120., 120., 50., 30., 10.])
|
||||||
|
kwargs_dict_box_pushing_prodmp['controller_kwargs']['d_gains'] = 0.01 * np.array([10., 10., 10., 10., 6., 5., 3.])
|
||||||
|
kwargs_dict_box_pushing_prodmp['trajectory_generator_kwargs']['weights_scale'] = 0.3
|
||||||
|
kwargs_dict_box_pushing_prodmp['trajectory_generator_kwargs']['goal_scale'] = 0.3
|
||||||
|
kwargs_dict_box_pushing_prodmp['trajectory_generator_kwargs']['auto_scale_basis'] = True
|
||||||
|
kwargs_dict_box_pushing_prodmp['basis_generator_kwargs']['num_basis'] = 4
|
||||||
|
kwargs_dict_box_pushing_prodmp['basis_generator_kwargs']['basis_bandwidth_factor'] = 3
|
||||||
|
kwargs_dict_box_pushing_prodmp['phase_generator_kwargs']['alpha_phase'] = 3
|
||||||
|
register(
|
||||||
|
id=_env_id,
|
||||||
|
entry_point='fancy_gym.utils.make_env_helpers:make_bb_env_helper',
|
||||||
|
kwargs=kwargs_dict_box_pushing_prodmp
|
||||||
|
)
|
||||||
|
ALL_FANCY_MOVEMENT_PRIMITIVE_ENVIRONMENTS["ProDMP"].append(_env_id)
|
||||||
|
|
||||||
for _v in _versions:
|
for _v in _versions:
|
||||||
_name = _v.split("-")
|
_name = _v.split("-")
|
||||||
_env_id = f'{_name[0]}ReplanProDMP-{_name[1]}'
|
_env_id = f'{_name[0]}ReplanProDMP-{_name[1]}'
|
||||||
@ -529,9 +558,7 @@ for _v in _versions:
|
|||||||
kwargs_dict_box_pushing_prodmp['trajectory_generator_kwargs']['weights_scale'] = 0.3
|
kwargs_dict_box_pushing_prodmp['trajectory_generator_kwargs']['weights_scale'] = 0.3
|
||||||
kwargs_dict_box_pushing_prodmp['trajectory_generator_kwargs']['goal_scale'] = 0.3
|
kwargs_dict_box_pushing_prodmp['trajectory_generator_kwargs']['goal_scale'] = 0.3
|
||||||
kwargs_dict_box_pushing_prodmp['trajectory_generator_kwargs']['auto_scale_basis'] = True
|
kwargs_dict_box_pushing_prodmp['trajectory_generator_kwargs']['auto_scale_basis'] = True
|
||||||
kwargs_dict_box_pushing_prodmp['trajectory_generator_kwargs']['goal_offset'] = 1.0
|
kwargs_dict_box_pushing_prodmp['basis_generator_kwargs']['num_basis'] = 4
|
||||||
kwargs_dict_box_pushing_prodmp['trajectory_generator_kwargs']['disable_goal'] = True
|
|
||||||
kwargs_dict_box_pushing_prodmp['basis_generator_kwargs']['num_basis'] = 5
|
|
||||||
kwargs_dict_box_pushing_prodmp['basis_generator_kwargs']['basis_bandwidth_factor'] = 3
|
kwargs_dict_box_pushing_prodmp['basis_generator_kwargs']['basis_bandwidth_factor'] = 3
|
||||||
kwargs_dict_box_pushing_prodmp['phase_generator_kwargs']['alpha_phase'] = 3
|
kwargs_dict_box_pushing_prodmp['phase_generator_kwargs']['alpha_phase'] = 3
|
||||||
kwargs_dict_box_pushing_prodmp['black_box_kwargs']['max_planning_times'] = 4
|
kwargs_dict_box_pushing_prodmp['black_box_kwargs']['max_planning_times'] = 4
|
||||||
@ -557,8 +584,8 @@ for _v in _versions:
|
|||||||
kwargs_dict_tt_promp['name'] = _v
|
kwargs_dict_tt_promp['name'] = _v
|
||||||
kwargs_dict_tt_promp['controller_kwargs']['p_gains'] = 0.5 * np.array([1.0, 4.0, 2.0, 4.0, 1.0, 4.0, 1.0])
|
kwargs_dict_tt_promp['controller_kwargs']['p_gains'] = 0.5 * np.array([1.0, 4.0, 2.0, 4.0, 1.0, 4.0, 1.0])
|
||||||
kwargs_dict_tt_promp['controller_kwargs']['d_gains'] = 0.5 * np.array([0.1, 0.4, 0.2, 0.4, 0.1, 0.4, 0.1])
|
kwargs_dict_tt_promp['controller_kwargs']['d_gains'] = 0.5 * np.array([0.1, 0.4, 0.2, 0.4, 0.1, 0.4, 0.1])
|
||||||
kwargs_dict_tt_promp['phase_generator_kwargs']['learn_tau'] = False
|
kwargs_dict_tt_promp['phase_generator_kwargs']['learn_tau'] = True
|
||||||
kwargs_dict_tt_promp['phase_generator_kwargs']['learn_delay'] = False
|
kwargs_dict_tt_promp['phase_generator_kwargs']['learn_delay'] = True
|
||||||
kwargs_dict_tt_promp['phase_generator_kwargs']['tau_bound'] = [0.8, 1.5]
|
kwargs_dict_tt_promp['phase_generator_kwargs']['tau_bound'] = [0.8, 1.5]
|
||||||
kwargs_dict_tt_promp['phase_generator_kwargs']['delay_bound'] = [0.05, 0.15]
|
kwargs_dict_tt_promp['phase_generator_kwargs']['delay_bound'] = [0.05, 0.15]
|
||||||
kwargs_dict_tt_promp['basis_generator_kwargs']['num_basis'] = 3
|
kwargs_dict_tt_promp['basis_generator_kwargs']['num_basis'] = 3
|
||||||
|
@ -26,7 +26,7 @@ class BoxPushingEnvBase(MujocoEnv, utils.EzPickle):
|
|||||||
3. time-spatial-depend sparse reward
|
3. time-spatial-depend sparse reward
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, frame_skip: int = 10):
|
def __init__(self, frame_skip: int = 10, random_init: bool = False):
|
||||||
utils.EzPickle.__init__(**locals())
|
utils.EzPickle.__init__(**locals())
|
||||||
self._steps = 0
|
self._steps = 0
|
||||||
self.init_qpos_box_pushing = np.array([0., 0., 0., -1.5, 0., 1.5, 0., 0., 0., 0.6, 0.45, 0.0, 1., 0., 0., 0.])
|
self.init_qpos_box_pushing = np.array([0., 0., 0., -1.5, 0., 1.5, 0., 0., 0., 0.6, 0.45, 0.0, 1., 0., 0., 0.])
|
||||||
@ -39,6 +39,7 @@ class BoxPushingEnvBase(MujocoEnv, utils.EzPickle):
|
|||||||
self._desired_rod_quat = desired_rod_quat
|
self._desired_rod_quat = desired_rod_quat
|
||||||
|
|
||||||
self._episode_energy = 0.
|
self._episode_energy = 0.
|
||||||
|
self.random_init = random_init
|
||||||
MujocoEnv.__init__(self,
|
MujocoEnv.__init__(self,
|
||||||
model_path=os.path.join(os.path.dirname(__file__), "assets", "box_pushing.xml"),
|
model_path=os.path.join(os.path.dirname(__file__), "assets", "box_pushing.xml"),
|
||||||
frame_skip=self.frame_skip,
|
frame_skip=self.frame_skip,
|
||||||
@ -93,7 +94,7 @@ class BoxPushingEnvBase(MujocoEnv, utils.EzPickle):
|
|||||||
def reset_model(self):
|
def reset_model(self):
|
||||||
# rest box to initial position
|
# rest box to initial position
|
||||||
self.set_state(self.init_qpos_box_pushing, self.init_qvel_box_pushing)
|
self.set_state(self.init_qpos_box_pushing, self.init_qvel_box_pushing)
|
||||||
box_init_pos = np.array([0.4, 0.3, -0.01, 0.0, 0.0, 0.0, 1.0])
|
box_init_pos = self.sample_context() if self.random_init else np.array([0.4, 0.3, -0.01, 0.0, 0.0, 0.0, 1.0])
|
||||||
self.data.joint("box_joint").qpos = box_init_pos
|
self.data.joint("box_joint").qpos = box_init_pos
|
||||||
|
|
||||||
# set target position
|
# set target position
|
||||||
@ -160,6 +161,9 @@ class BoxPushingEnvBase(MujocoEnv, utils.EzPickle):
|
|||||||
penalty -= v_coeff * abs(np.sum(q_dot_error[q_dot_error > 0.]))
|
penalty -= v_coeff * abs(np.sum(q_dot_error[q_dot_error > 0.]))
|
||||||
return penalty
|
return penalty
|
||||||
|
|
||||||
|
def _get_box_vel(self):
|
||||||
|
return self.data.body("box_0").cvel.copy()
|
||||||
|
|
||||||
def get_body_jacp(self, name):
|
def get_body_jacp(self, name):
|
||||||
id = mujoco.mj_name2id(self.model, 1, name)
|
id = mujoco.mj_name2id(self.model, 1, name)
|
||||||
jacp = np.zeros((3, self.model.nv))
|
jacp = np.zeros((3, self.model.nv))
|
||||||
@ -281,8 +285,8 @@ class BoxPushingEnvBase(MujocoEnv, utils.EzPickle):
|
|||||||
return q
|
return q
|
||||||
|
|
||||||
class BoxPushingDense(BoxPushingEnvBase):
|
class BoxPushingDense(BoxPushingEnvBase):
|
||||||
def __init__(self, frame_skip: int = 10):
|
def __init__(self, frame_skip: int = 10, random_init: bool = False):
|
||||||
super(BoxPushingDense, self).__init__(frame_skip=frame_skip)
|
super(BoxPushingDense, self).__init__(frame_skip=frame_skip, random_init=random_init)
|
||||||
def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat,
|
def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat,
|
||||||
rod_tip_pos, rod_quat, qpos, qvel, action):
|
rod_tip_pos, rod_quat, qpos, qvel, action):
|
||||||
joint_penalty = self._joint_limit_violate_penalty(qpos,
|
joint_penalty = self._joint_limit_violate_penalty(qpos,
|
||||||
@ -304,14 +308,14 @@ class BoxPushingDense(BoxPushingEnvBase):
|
|||||||
return reward
|
return reward
|
||||||
|
|
||||||
class BoxPushingTemporalSparse(BoxPushingEnvBase):
|
class BoxPushingTemporalSparse(BoxPushingEnvBase):
|
||||||
def __init__(self, frame_skip: int = 10):
|
def __init__(self, frame_skip: int = 10, random_init: bool = False):
|
||||||
super(BoxPushingTemporalSparse, self).__init__(frame_skip=frame_skip)
|
super(BoxPushingTemporalSparse, self).__init__(frame_skip=frame_skip, random_init=random_init)
|
||||||
|
|
||||||
def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat,
|
def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat,
|
||||||
rod_tip_pos, rod_quat, qpos, qvel, action):
|
rod_tip_pos, rod_quat, qpos, qvel, action):
|
||||||
reward = 0.
|
reward = 0.
|
||||||
joint_penalty = self._joint_limit_violate_penalty(qpos, qvel, enable_pos_limit=True, enable_vel_limit=True)
|
joint_penalty = self._joint_limit_violate_penalty(qpos, qvel, enable_pos_limit=True, enable_vel_limit=True)
|
||||||
energy_cost = -0.0005 * np.sum(np.square(action))
|
energy_cost = -0.02 * np.sum(np.square(action))
|
||||||
tcp_box_dist_reward = -2 * np.clip(np.linalg.norm(box_pos - rod_tip_pos), 0.05, 100)
|
tcp_box_dist_reward = -2 * np.clip(np.linalg.norm(box_pos - rod_tip_pos), 0.05, 100)
|
||||||
reward += joint_penalty + tcp_box_dist_reward + energy_cost
|
reward += joint_penalty + tcp_box_dist_reward + energy_cost
|
||||||
rod_inclined_angle = rotation_distance(rod_quat, desired_rod_quat)
|
rod_inclined_angle = rotation_distance(rod_quat, desired_rod_quat)
|
||||||
@ -327,20 +331,23 @@ class BoxPushingTemporalSparse(BoxPushingEnvBase):
|
|||||||
box_goal_pos_dist_reward = -3.5 * box_goal_dist * 100
|
box_goal_pos_dist_reward = -3.5 * box_goal_dist * 100
|
||||||
box_goal_rot_dist_reward = -rotation_distance(box_quat, target_quat) / np.pi * 100
|
box_goal_rot_dist_reward = -rotation_distance(box_quat, target_quat) / np.pi * 100
|
||||||
|
|
||||||
reward += box_goal_pos_dist_reward + box_goal_rot_dist_reward
|
ep_end_joint_vel = -50. * np.linalg.norm(qvel)
|
||||||
|
|
||||||
|
reward += box_goal_pos_dist_reward + box_goal_rot_dist_reward + ep_end_joint_vel
|
||||||
|
|
||||||
return reward
|
return reward
|
||||||
|
|
||||||
|
|
||||||
class BoxPushingTemporalSpatialSparse(BoxPushingEnvBase):
|
class BoxPushingTemporalSpatialSparse(BoxPushingEnvBase):
|
||||||
|
|
||||||
def __init__(self, frame_skip: int = 10):
|
def __init__(self, frame_skip: int = 10, random_init: bool = False):
|
||||||
super(BoxPushingTemporalSpatialSparse, self).__init__(frame_skip=frame_skip)
|
super(BoxPushingTemporalSpatialSparse, self).__init__(frame_skip=frame_skip, random_init=random_init)
|
||||||
|
|
||||||
def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat,
|
def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat,
|
||||||
rod_tip_pos, rod_quat, qpos, qvel, action):
|
rod_tip_pos, rod_quat, qpos, qvel, action):
|
||||||
reward = 0.
|
reward = 0.
|
||||||
joint_penalty = self._joint_limit_violate_penalty(qpos, qvel, enable_pos_limit=True, enable_vel_limit=True)
|
joint_penalty = self._joint_limit_violate_penalty(qpos, qvel, enable_pos_limit=True, enable_vel_limit=True)
|
||||||
energy_cost = -0.0005 * np.sum(np.square(action))
|
energy_cost = -0.02 * np.sum(np.square(action))
|
||||||
tcp_box_dist_reward = -2 * np.clip(np.linalg.norm(box_pos - rod_tip_pos), 0.05, 100)
|
tcp_box_dist_reward = -2 * np.clip(np.linalg.norm(box_pos - rod_tip_pos), 0.05, 100)
|
||||||
reward += joint_penalty + tcp_box_dist_reward + energy_cost
|
reward += joint_penalty + tcp_box_dist_reward + energy_cost
|
||||||
rod_inclined_angle = rotation_distance(rod_quat, desired_rod_quat)
|
rod_inclined_angle = rotation_distance(rod_quat, desired_rod_quat)
|
||||||
@ -360,3 +367,71 @@ class BoxPushingTemporalSpatialSparse(BoxPushingEnvBase):
|
|||||||
reward += box_goal_pos_dist_reward + box_goal_rot_dist_reward
|
reward += box_goal_pos_dist_reward + box_goal_rot_dist_reward
|
||||||
|
|
||||||
return reward
|
return reward
|
||||||
|
|
||||||
|
class BoxPushingTemporalSpatialSparse2(BoxPushingEnvBase):
|
||||||
|
|
||||||
|
def __init__(self, frame_skip: int = 10, random_init: bool = False):
|
||||||
|
super(BoxPushingTemporalSpatialSparse2, self).__init__(frame_skip=frame_skip, random_init=random_init)
|
||||||
|
|
||||||
|
def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat,
|
||||||
|
rod_tip_pos, rod_quat, qpos, qvel, action):
|
||||||
|
reward = 0.
|
||||||
|
joint_penalty = self._joint_limit_violate_penalty(qpos, qvel, enable_pos_limit=True, enable_vel_limit=True)
|
||||||
|
energy_cost = -0.0005 * np.sum(np.square(action))
|
||||||
|
tcp_box_dist_reward = -2 * np.clip(np.linalg.norm(box_pos - rod_tip_pos), 0.05, 100)
|
||||||
|
|
||||||
|
reward += joint_penalty + energy_cost + tcp_box_dist_reward
|
||||||
|
|
||||||
|
rod_inclined_angle = rotation_distance(rod_quat, desired_rod_quat)
|
||||||
|
|
||||||
|
if rod_inclined_angle > np.pi / 4:
|
||||||
|
reward -= rod_inclined_angle / (np.pi)
|
||||||
|
|
||||||
|
if not episode_end:
|
||||||
|
return reward
|
||||||
|
|
||||||
|
# Force the robot to stop at the end
|
||||||
|
reward += -50. * np.linalg.norm(qvel)
|
||||||
|
|
||||||
|
box_goal_dist = np.linalg.norm(box_pos - target_pos)
|
||||||
|
|
||||||
|
if box_goal_dist < 0.1:
|
||||||
|
box_goal_pos_dist_reward = np.clip(- 350. * box_goal_dist, -200, 0)
|
||||||
|
box_goal_rot_dist_reward = np.clip(- rotation_distance(box_quat, target_quat)/np.pi * 100., -100, 0)
|
||||||
|
reward += box_goal_pos_dist_reward + box_goal_rot_dist_reward
|
||||||
|
else:
|
||||||
|
reward -= 300.
|
||||||
|
|
||||||
|
return reward
|
||||||
|
|
||||||
|
|
||||||
|
class BoxPushingNoConstraintSparse(BoxPushingEnvBase):
|
||||||
|
def __init__(self, frame_skip: int = 10, random_init: bool = False):
|
||||||
|
super(BoxPushingNoConstraintSparse, self).__init__(frame_skip=frame_skip, random_init=random_init)
|
||||||
|
|
||||||
|
def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat,
|
||||||
|
rod_tip_pos, rod_quat, qpos, qvel, action):
|
||||||
|
reward = 0.
|
||||||
|
joint_penalty = self._joint_limit_violate_penalty(qpos, qvel, enable_pos_limit=True, enable_vel_limit=True)
|
||||||
|
energy_cost = -0.0005 * np.sum(np.square(action))
|
||||||
|
reward += joint_penalty + energy_cost
|
||||||
|
|
||||||
|
if not episode_end:
|
||||||
|
return reward
|
||||||
|
|
||||||
|
box_goal_dist = np.linalg.norm(box_pos - target_pos)
|
||||||
|
|
||||||
|
box_goal_pos_dist_reward = -3.5 * box_goal_dist * 100
|
||||||
|
box_goal_rot_dist_reward = -rotation_distance(box_quat, target_quat) / np.pi * 100
|
||||||
|
|
||||||
|
reward += box_goal_pos_dist_reward + box_goal_rot_dist_reward + self._get_end_vel_penalty()
|
||||||
|
|
||||||
|
return reward
|
||||||
|
|
||||||
|
def _get_end_vel_penalty(self):
|
||||||
|
rot_coeff = 150.
|
||||||
|
pos_coeff = 150.
|
||||||
|
box_rot_pos_vel = self._get_box_vel()
|
||||||
|
box_rot_vel = box_rot_pos_vel[:3]
|
||||||
|
box_pos_vel = box_rot_pos_vel[3:]
|
||||||
|
return -rot_coeff * np.linalg.norm(box_rot_vel) - pos_coeff * np.linalg.norm(box_pos_vel)
|
||||||
|
@ -10,6 +10,17 @@ class MPWrapper(RawInterfaceWrapper):
|
|||||||
# Random x goal + random init pos
|
# Random x goal + random init pos
|
||||||
@property
|
@property
|
||||||
def context_mask(self):
|
def context_mask(self):
|
||||||
|
if self.random_init:
|
||||||
|
return np.hstack([
|
||||||
|
[True] * 7, # joints position
|
||||||
|
[False] * 7, # joints velocity
|
||||||
|
[True] * 3, # position of box
|
||||||
|
[True] * 4, # orientation of box
|
||||||
|
[True] * 3, # position of target
|
||||||
|
[True] * 4, # orientation of target
|
||||||
|
# [True] * 1, # time
|
||||||
|
])
|
||||||
|
|
||||||
return np.hstack([
|
return np.hstack([
|
||||||
[False] * 7, # joints position
|
[False] * 7, # joints position
|
||||||
[False] * 7, # joints velocity
|
[False] * 7, # joints velocity
|
||||||
|
@ -79,11 +79,11 @@ class ReacherEnv(MujocoEnv, utils.EzPickle):
|
|||||||
# I Quadrant
|
# I Quadrant
|
||||||
# self.goal = self.np_random.uniform(low=0, high=self.n_links / 10, size=2)
|
# self.goal = self.np_random.uniform(low=0, high=self.n_links / 10, size=2)
|
||||||
# II Quadrant
|
# II Quadrant
|
||||||
# self.goal = np.random.uniform(low=[-self.n_links / 10, 0], high=[0, self.n_links / 10], size=2)
|
# self.goal = self.np_random.uniform(low=[-self.n_links / 10, 0], high=[0, self.n_links / 10], size=2)
|
||||||
# II + III Quadrant
|
# II + III Quadrant
|
||||||
# self.goal = np.random.uniform(low=-self.n_links / 10, high=[0, self.n_links / 10], size=2)
|
# self.goal = self.np_random.uniform(low=-self.n_links / 10, high=[0, self.n_links / 10], size=2)
|
||||||
# I + II Quadrant
|
# I + II Quadrant
|
||||||
# self.goal = np.random.uniform(low=[-self.n_links / 10, 0], high=self.n_links, size=2)
|
# self.goal = self.np_random.uniform(low=[-self.n_links / 10, 0], high=self.n_links, size=2)
|
||||||
if np.linalg.norm(self.goal) < self.n_links / 10:
|
if np.linalg.norm(self.goal) < self.n_links / 10:
|
||||||
break
|
break
|
||||||
|
|
||||||
|
@ -29,15 +29,16 @@ class TT_MPWrapper(RawInterfaceWrapper):
|
|||||||
def current_vel(self) -> Union[float, int, np.ndarray, Tuple]:
|
def current_vel(self) -> Union[float, int, np.ndarray, Tuple]:
|
||||||
return self.data.qvel[:7].copy()
|
return self.data.qvel[:7].copy()
|
||||||
|
|
||||||
def preprocessing_and_validity_callback(self, action, pos_traj, vel_traj):
|
def preprocessing_and_validity_callback(self, action: np.ndarray, pos_traj: np.ndarray, vel_traj: np.ndarray,
|
||||||
return self.check_traj_validity(action, pos_traj, vel_traj)
|
tau_bound: list, delay_bound:list):
|
||||||
|
return self.check_traj_validity(action, pos_traj, vel_traj, tau_bound, delay_bound)
|
||||||
|
|
||||||
def set_episode_arguments(self, action, pos_traj, vel_traj):
|
def set_episode_arguments(self, action, pos_traj, vel_traj):
|
||||||
return pos_traj, vel_traj
|
return pos_traj, vel_traj
|
||||||
|
|
||||||
def invalid_traj_callback(self, action: np.ndarray, pos_traj: np.ndarray, vel_traj: np.ndarray,
|
def invalid_traj_callback(self, action: np.ndarray, pos_traj: np.ndarray, vel_traj: np.ndarray,
|
||||||
return_contextual_obs: bool) -> Tuple[np.ndarray, float, bool, dict]:
|
return_contextual_obs: bool, tau_bound:list, delay_bound:list) -> Tuple[np.ndarray, float, bool, dict]:
|
||||||
return self.get_invalid_traj_step_return(action, pos_traj, return_contextual_obs)
|
return self.get_invalid_traj_step_return(action, pos_traj, return_contextual_obs, tau_bound, delay_bound)
|
||||||
|
|
||||||
class TTVelObs_MPWrapper(TT_MPWrapper):
|
class TTVelObs_MPWrapper(TT_MPWrapper):
|
||||||
|
|
||||||
|
@ -5,7 +5,7 @@ from gym import utils, spaces
|
|||||||
from gym.envs.mujoco import MujocoEnv
|
from gym.envs.mujoco import MujocoEnv
|
||||||
|
|
||||||
from fancy_gym.envs.mujoco.table_tennis.table_tennis_utils import is_init_state_valid, magnus_force
|
from fancy_gym.envs.mujoco.table_tennis.table_tennis_utils import is_init_state_valid, magnus_force
|
||||||
from fancy_gym.envs.mujoco.table_tennis.table_tennis_utils import jnt_pos_low, jnt_pos_high, delay_bound, tau_bound
|
from fancy_gym.envs.mujoco.table_tennis.table_tennis_utils import jnt_pos_low, jnt_pos_high
|
||||||
|
|
||||||
import mujoco
|
import mujoco
|
||||||
|
|
||||||
@ -225,7 +225,7 @@ class TableTennisEnv(MujocoEnv, utils.EzPickle):
|
|||||||
init_ball_state = self._generate_random_ball(random_pos=random_pos, random_vel=random_vel)
|
init_ball_state = self._generate_random_ball(random_pos=random_pos, random_vel=random_vel)
|
||||||
return init_ball_state
|
return init_ball_state
|
||||||
|
|
||||||
def _get_traj_invalid_penalty(self, action, pos_traj):
|
def _get_traj_invalid_penalty(self, action, pos_traj, tau_bound, delay_bound):
|
||||||
tau_invalid_penalty = 3 * (np.max([0, action[0] - tau_bound[1]]) + np.max([0, tau_bound[0] - action[0]]))
|
tau_invalid_penalty = 3 * (np.max([0, action[0] - tau_bound[1]]) + np.max([0, tau_bound[0] - action[0]]))
|
||||||
delay_invalid_penalty = 3 * (np.max([0, action[1] - delay_bound[1]]) + np.max([0, delay_bound[0] - action[1]]))
|
delay_invalid_penalty = 3 * (np.max([0, action[1] - delay_bound[1]]) + np.max([0, delay_bound[0] - action[1]]))
|
||||||
violate_high_bound_error = np.mean(np.maximum(pos_traj - jnt_pos_high, 0))
|
violate_high_bound_error = np.mean(np.maximum(pos_traj - jnt_pos_high, 0))
|
||||||
@ -234,9 +234,9 @@ class TableTennisEnv(MujocoEnv, utils.EzPickle):
|
|||||||
violate_high_bound_error + violate_low_bound_error
|
violate_high_bound_error + violate_low_bound_error
|
||||||
return -invalid_penalty
|
return -invalid_penalty
|
||||||
|
|
||||||
def get_invalid_traj_step_return(self, action, pos_traj, contextual_obs):
|
def get_invalid_traj_step_return(self, action, pos_traj, contextual_obs, tau_bound, delay_bound):
|
||||||
obs = self._get_obs() if contextual_obs else np.concatenate([self._get_obs(), np.array([0])]) # 0 for invalid traj
|
obs = self._get_obs() if contextual_obs else np.concatenate([self._get_obs(), np.array([0])]) # 0 for invalid traj
|
||||||
penalty = self._get_traj_invalid_penalty(action, pos_traj)
|
penalty = self._get_traj_invalid_penalty(action, pos_traj, tau_bound, delay_bound)
|
||||||
return obs, penalty, True, {
|
return obs, penalty, True, {
|
||||||
"hit_ball": [False],
|
"hit_ball": [False],
|
||||||
"ball_returned_success": [False],
|
"ball_returned_success": [False],
|
||||||
@ -247,7 +247,7 @@ class TableTennisEnv(MujocoEnv, utils.EzPickle):
|
|||||||
}
|
}
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def check_traj_validity(action, pos_traj, vel_traj):
|
def check_traj_validity(action, pos_traj, vel_traj, tau_bound, delay_bound):
|
||||||
time_invalid = action[0] > tau_bound[1] or action[0] < tau_bound[0] \
|
time_invalid = action[0] > tau_bound[1] or action[0] < tau_bound[0] \
|
||||||
or action[1] > delay_bound[1] or action[1] < delay_bound[0]
|
or action[1] > delay_bound[1] or action[1] < delay_bound[0]
|
||||||
if time_invalid or np.any(pos_traj > jnt_pos_high) or np.any(pos_traj < jnt_pos_low):
|
if time_invalid or np.any(pos_traj > jnt_pos_high) or np.any(pos_traj < jnt_pos_low):
|
||||||
|
@ -31,8 +31,8 @@ ax = fig.add_subplot(1, 1, 1)
|
|||||||
img = ax.imshow(env.env.render(mode="rgb_array"))
|
img = ax.imshow(env.env.render(mode="rgb_array"))
|
||||||
fig.show()
|
fig.show()
|
||||||
|
|
||||||
for t, pos_vel in enumerate(zip(pos, vel)):
|
for t, (des_pos, des_vel) in enumerate(zip(pos, vel)):
|
||||||
actions = env.tracking_controller.get_action(pos_vel[0], pos_vel[1], env.current_vel, env.current_pos)
|
actions = env.tracking_controller.get_action(des_pos, des_vel, env.current_pos, env.current_vel)
|
||||||
actions = np.clip(actions, env.env.action_space.low, env.env.action_space.high)
|
actions = np.clip(actions, env.env.action_space.low, env.env.action_space.high)
|
||||||
_, _, _, _ = env.env.step(actions)
|
_, _, _, _ = env.env.step(actions)
|
||||||
if t % 15 == 0:
|
if t % 15 == 0:
|
||||||
|
Loading…
Reference in New Issue
Block a user