Merge pull request #72 from D-o-d-o-x/MP3_env_changes
Update custom envs to versions used in MP3 paper
This commit is contained in:
commit
ea3bc46f0a
@ -26,7 +26,7 @@ class BoxPushingEnvBase(MujocoEnv, utils.EzPickle):
|
|||||||
3. time-spatial-depend sparse reward
|
3. time-spatial-depend sparse reward
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, frame_skip: int = 10):
|
def __init__(self, frame_skip: int = 10, random_init: bool = False):
|
||||||
utils.EzPickle.__init__(**locals())
|
utils.EzPickle.__init__(**locals())
|
||||||
self._steps = 0
|
self._steps = 0
|
||||||
self.init_qpos_box_pushing = np.array([0., 0., 0., -1.5, 0., 1.5, 0., 0., 0., 0.6, 0.45, 0.0, 1., 0., 0., 0.])
|
self.init_qpos_box_pushing = np.array([0., 0., 0., -1.5, 0., 1.5, 0., 0., 0., 0.6, 0.45, 0.0, 1., 0., 0., 0.])
|
||||||
@ -39,6 +39,7 @@ class BoxPushingEnvBase(MujocoEnv, utils.EzPickle):
|
|||||||
self._desired_rod_quat = desired_rod_quat
|
self._desired_rod_quat = desired_rod_quat
|
||||||
|
|
||||||
self._episode_energy = 0.
|
self._episode_energy = 0.
|
||||||
|
self.random_init = random_init
|
||||||
MujocoEnv.__init__(self,
|
MujocoEnv.__init__(self,
|
||||||
model_path=os.path.join(os.path.dirname(__file__), "assets", "box_pushing.xml"),
|
model_path=os.path.join(os.path.dirname(__file__), "assets", "box_pushing.xml"),
|
||||||
frame_skip=self.frame_skip,
|
frame_skip=self.frame_skip,
|
||||||
@ -93,7 +94,7 @@ class BoxPushingEnvBase(MujocoEnv, utils.EzPickle):
|
|||||||
def reset_model(self):
|
def reset_model(self):
|
||||||
# rest box to initial position
|
# rest box to initial position
|
||||||
self.set_state(self.init_qpos_box_pushing, self.init_qvel_box_pushing)
|
self.set_state(self.init_qpos_box_pushing, self.init_qvel_box_pushing)
|
||||||
box_init_pos = np.array([0.4, 0.3, -0.01, 0.0, 0.0, 0.0, 1.0])
|
box_init_pos = self.sample_context() if self.random_init else np.array([0.4, 0.3, -0.01, 0.0, 0.0, 0.0, 1.0])
|
||||||
self.data.joint("box_joint").qpos = box_init_pos
|
self.data.joint("box_joint").qpos = box_init_pos
|
||||||
|
|
||||||
# set target position
|
# set target position
|
||||||
@ -160,6 +161,9 @@ class BoxPushingEnvBase(MujocoEnv, utils.EzPickle):
|
|||||||
penalty -= v_coeff * abs(np.sum(q_dot_error[q_dot_error > 0.]))
|
penalty -= v_coeff * abs(np.sum(q_dot_error[q_dot_error > 0.]))
|
||||||
return penalty
|
return penalty
|
||||||
|
|
||||||
|
def _get_box_vel(self):
|
||||||
|
return self.data.body("box_0").cvel.copy()
|
||||||
|
|
||||||
def get_body_jacp(self, name):
|
def get_body_jacp(self, name):
|
||||||
id = mujoco.mj_name2id(self.model, 1, name)
|
id = mujoco.mj_name2id(self.model, 1, name)
|
||||||
jacp = np.zeros((3, self.model.nv))
|
jacp = np.zeros((3, self.model.nv))
|
||||||
@ -281,8 +285,8 @@ class BoxPushingEnvBase(MujocoEnv, utils.EzPickle):
|
|||||||
return q
|
return q
|
||||||
|
|
||||||
class BoxPushingDense(BoxPushingEnvBase):
|
class BoxPushingDense(BoxPushingEnvBase):
|
||||||
def __init__(self, frame_skip: int = 10):
|
def __init__(self, frame_skip: int = 10, random_init: bool = False):
|
||||||
super(BoxPushingDense, self).__init__(frame_skip=frame_skip)
|
super(BoxPushingDense, self).__init__(frame_skip=frame_skip, random_init=random_init)
|
||||||
def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat,
|
def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat,
|
||||||
rod_tip_pos, rod_quat, qpos, qvel, action):
|
rod_tip_pos, rod_quat, qpos, qvel, action):
|
||||||
joint_penalty = self._joint_limit_violate_penalty(qpos,
|
joint_penalty = self._joint_limit_violate_penalty(qpos,
|
||||||
@ -304,14 +308,14 @@ class BoxPushingDense(BoxPushingEnvBase):
|
|||||||
return reward
|
return reward
|
||||||
|
|
||||||
class BoxPushingTemporalSparse(BoxPushingEnvBase):
|
class BoxPushingTemporalSparse(BoxPushingEnvBase):
|
||||||
def __init__(self, frame_skip: int = 10):
|
def __init__(self, frame_skip: int = 10, random_init: bool = False):
|
||||||
super(BoxPushingTemporalSparse, self).__init__(frame_skip=frame_skip)
|
super(BoxPushingTemporalSparse, self).__init__(frame_skip=frame_skip, random_init=random_init)
|
||||||
|
|
||||||
def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat,
|
def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat,
|
||||||
rod_tip_pos, rod_quat, qpos, qvel, action):
|
rod_tip_pos, rod_quat, qpos, qvel, action):
|
||||||
reward = 0.
|
reward = 0.
|
||||||
joint_penalty = self._joint_limit_violate_penalty(qpos, qvel, enable_pos_limit=True, enable_vel_limit=True)
|
joint_penalty = self._joint_limit_violate_penalty(qpos, qvel, enable_pos_limit=True, enable_vel_limit=True)
|
||||||
energy_cost = -0.0005 * np.sum(np.square(action))
|
energy_cost = -0.02 * np.sum(np.square(action))
|
||||||
tcp_box_dist_reward = -2 * np.clip(np.linalg.norm(box_pos - rod_tip_pos), 0.05, 100)
|
tcp_box_dist_reward = -2 * np.clip(np.linalg.norm(box_pos - rod_tip_pos), 0.05, 100)
|
||||||
reward += joint_penalty + tcp_box_dist_reward + energy_cost
|
reward += joint_penalty + tcp_box_dist_reward + energy_cost
|
||||||
rod_inclined_angle = rotation_distance(rod_quat, desired_rod_quat)
|
rod_inclined_angle = rotation_distance(rod_quat, desired_rod_quat)
|
||||||
@ -327,20 +331,23 @@ class BoxPushingTemporalSparse(BoxPushingEnvBase):
|
|||||||
box_goal_pos_dist_reward = -3.5 * box_goal_dist * 100
|
box_goal_pos_dist_reward = -3.5 * box_goal_dist * 100
|
||||||
box_goal_rot_dist_reward = -rotation_distance(box_quat, target_quat) / np.pi * 100
|
box_goal_rot_dist_reward = -rotation_distance(box_quat, target_quat) / np.pi * 100
|
||||||
|
|
||||||
reward += box_goal_pos_dist_reward + box_goal_rot_dist_reward
|
ep_end_joint_vel = -50. * np.linalg.norm(qvel)
|
||||||
|
|
||||||
|
reward += box_goal_pos_dist_reward + box_goal_rot_dist_reward + ep_end_joint_vel
|
||||||
|
|
||||||
return reward
|
return reward
|
||||||
|
|
||||||
|
|
||||||
class BoxPushingTemporalSpatialSparse(BoxPushingEnvBase):
|
class BoxPushingTemporalSpatialSparse(BoxPushingEnvBase):
|
||||||
|
|
||||||
def __init__(self, frame_skip: int = 10):
|
def __init__(self, frame_skip: int = 10, random_init: bool = False):
|
||||||
super(BoxPushingTemporalSpatialSparse, self).__init__(frame_skip=frame_skip)
|
super(BoxPushingTemporalSpatialSparse, self).__init__(frame_skip=frame_skip, random_init=random_init)
|
||||||
|
|
||||||
def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat,
|
def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat,
|
||||||
rod_tip_pos, rod_quat, qpos, qvel, action):
|
rod_tip_pos, rod_quat, qpos, qvel, action):
|
||||||
reward = 0.
|
reward = 0.
|
||||||
joint_penalty = self._joint_limit_violate_penalty(qpos, qvel, enable_pos_limit=True, enable_vel_limit=True)
|
joint_penalty = self._joint_limit_violate_penalty(qpos, qvel, enable_pos_limit=True, enable_vel_limit=True)
|
||||||
energy_cost = -0.0005 * np.sum(np.square(action))
|
energy_cost = -0.02 * np.sum(np.square(action))
|
||||||
tcp_box_dist_reward = -2 * np.clip(np.linalg.norm(box_pos - rod_tip_pos), 0.05, 100)
|
tcp_box_dist_reward = -2 * np.clip(np.linalg.norm(box_pos - rod_tip_pos), 0.05, 100)
|
||||||
reward += joint_penalty + tcp_box_dist_reward + energy_cost
|
reward += joint_penalty + tcp_box_dist_reward + energy_cost
|
||||||
rod_inclined_angle = rotation_distance(rod_quat, desired_rod_quat)
|
rod_inclined_angle = rotation_distance(rod_quat, desired_rod_quat)
|
||||||
@ -360,3 +367,84 @@ class BoxPushingTemporalSpatialSparse(BoxPushingEnvBase):
|
|||||||
reward += box_goal_pos_dist_reward + box_goal_rot_dist_reward
|
reward += box_goal_pos_dist_reward + box_goal_rot_dist_reward
|
||||||
|
|
||||||
return reward
|
return reward
|
||||||
|
|
||||||
|
class BoxPushingTemporalSpatialSparse2(BoxPushingEnvBase):
|
||||||
|
|
||||||
|
def __init__(self, frame_skip: int = 10):
|
||||||
|
super(BoxPushingTemporalSpatialSparse2, self).__init__(frame_skip=frame_skip)
|
||||||
|
|
||||||
|
def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat,
|
||||||
|
rod_tip_pos, rod_quat, qpos, qvel, action):
|
||||||
|
reward = 0.
|
||||||
|
joint_penalty = self._joint_limit_violate_penalty(qpos, qvel, enable_pos_limit=True, enable_vel_limit=True)
|
||||||
|
energy_cost = -0.0005 * np.sum(np.square(action))
|
||||||
|
tcp_box_dist_reward = -2 * np.clip(np.linalg.norm(box_pos - rod_tip_pos), 0.05, 100)
|
||||||
|
|
||||||
|
reward += joint_penalty + energy_cost + tcp_box_dist_reward
|
||||||
|
|
||||||
|
rod_inclined_angle = rotation_distance(rod_quat, desired_rod_quat)
|
||||||
|
|
||||||
|
if rod_inclined_angle > np.pi / 4:
|
||||||
|
reward -= rod_inclined_angle / (np.pi)
|
||||||
|
|
||||||
|
if not episode_end:
|
||||||
|
return reward
|
||||||
|
|
||||||
|
# Force the robot to stop at the end
|
||||||
|
reward += -50. * np.linalg.norm(qvel)
|
||||||
|
|
||||||
|
box_goal_dist = np.linalg.norm(box_pos - target_pos)
|
||||||
|
|
||||||
|
if box_goal_dist < 0.1:
|
||||||
|
box_goal_pos_dist_reward = np.clip(- 350. * box_goal_dist, -200, 0)
|
||||||
|
box_goal_rot_dist_reward = np.clip(- rotation_distance(box_quat, target_quat)/np.pi * 100., -100, 0)
|
||||||
|
reward += box_goal_pos_dist_reward + box_goal_rot_dist_reward
|
||||||
|
else:
|
||||||
|
reward -= 300.
|
||||||
|
|
||||||
|
return reward
|
||||||
|
|
||||||
|
|
||||||
|
class BoxPushingBruceSparse(BoxPushingEnvBase):
|
||||||
|
def __init__(self, frame_skip: int = 10):
|
||||||
|
super(BoxPushingBruceSparse, self).__init__(frame_skip=frame_skip)
|
||||||
|
|
||||||
|
def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat,
|
||||||
|
rod_tip_pos, rod_quat, qpos, qvel, action):
|
||||||
|
reward = 0.
|
||||||
|
joint_penalty = self._joint_limit_violate_penalty(qpos, qvel, enable_pos_limit=True, enable_vel_limit=True)
|
||||||
|
energy_cost = -0.0005 * np.sum(np.square(action))
|
||||||
|
reward += joint_penalty + energy_cost
|
||||||
|
|
||||||
|
if not episode_end:
|
||||||
|
return reward
|
||||||
|
|
||||||
|
box_goal_dist = np.linalg.norm(box_pos - target_pos)
|
||||||
|
|
||||||
|
box_goal_pos_dist_reward = -3.5 * box_goal_dist * 100
|
||||||
|
box_goal_rot_dist_reward = -rotation_distance(box_quat, target_quat) / np.pi * 100
|
||||||
|
|
||||||
|
reward += box_goal_pos_dist_reward + box_goal_rot_dist_reward + self._get_end_vel_penalty()
|
||||||
|
|
||||||
|
return reward
|
||||||
|
|
||||||
|
def _get_end_vel_penalty(self):
|
||||||
|
rot_coeff = 150.
|
||||||
|
pos_coeff = 150.
|
||||||
|
box_rot_pos_vel = self._get_box_vel()
|
||||||
|
box_rot_vel = box_rot_pos_vel[:3]
|
||||||
|
box_pos_vel = box_rot_pos_vel[3:]
|
||||||
|
return -rot_coeff * np.linalg.norm(box_rot_vel) - pos_coeff * np.linalg.norm(box_pos_vel)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__=="__main__":
|
||||||
|
import fancy_gym
|
||||||
|
env = fancy_gym.make("BoxPushingDenseProDMP-v0", seed=0)
|
||||||
|
env.reset()
|
||||||
|
for i in range(1000):
|
||||||
|
env.render(mode="human")
|
||||||
|
obs, rew, done, info = env.step(env.action_space.sample())
|
||||||
|
if done:
|
||||||
|
env.reset()
|
||||||
|
# print(f"box_end_velocity: {info['box_end_vel']}")
|
||||||
|
@ -10,6 +10,17 @@ class MPWrapper(RawInterfaceWrapper):
|
|||||||
# Random x goal + random init pos
|
# Random x goal + random init pos
|
||||||
@property
|
@property
|
||||||
def context_mask(self):
|
def context_mask(self):
|
||||||
|
if self.random_init:
|
||||||
|
return np.hstack([
|
||||||
|
[True] * 7, # joints position
|
||||||
|
[False] * 7, # joints velocity
|
||||||
|
[True] * 3, # position of box
|
||||||
|
[True] * 4, # orientation of box
|
||||||
|
[True] * 3, # position of target
|
||||||
|
[True] * 4, # orientation of target
|
||||||
|
# [True] * 1, # time
|
||||||
|
])
|
||||||
|
|
||||||
return np.hstack([
|
return np.hstack([
|
||||||
[False] * 7, # joints position
|
[False] * 7, # joints position
|
||||||
[False] * 7, # joints velocity
|
[False] * 7, # joints velocity
|
||||||
|
Loading…
Reference in New Issue
Block a user