modified according to comments

Hongyi Zhou 2022-10-14 16:50:57 +02:00
parent 1fd4a1e848
commit bdd51ba61f
5 changed files with 148 additions and 183 deletions

View File

@@ -16,7 +16,8 @@ from .mujoco.hopper_throw.hopper_throw import MAX_EPISODE_STEPS_HOPPERTHROW
 from .mujoco.hopper_throw.hopper_throw_in_basket import MAX_EPISODE_STEPS_HOPPERTHROWINBASKET
 from .mujoco.reacher.reacher import ReacherEnv, MAX_EPISODE_STEPS_REACHER
 from .mujoco.walker_2d_jump.walker_2d_jump import MAX_EPISODE_STEPS_WALKERJUMP
-from .mujoco.box_pushing.box_pushing_env import BoxPushingEnv, MAX_EPISODE_STEPS_BOX_PUSHING
+from .mujoco.box_pushing.box_pushing_env import BoxPushingDense, BoxPushingTemporalSparse, \
+    BoxPushingTemporalSpatialSparse, MAX_EPISODE_STEPS_BOX_PUSHING

 ALL_FANCY_MOVEMENT_PRIMITIVE_ENVIRONMENTS = {"DMP": [], "ProMP": [], "ProDMP": []}
@@ -232,10 +233,9 @@ register(
 for reward_type in ["Dense", "TemporalSparse", "TemporalSpatialSparse"]:
     register(
         id='BoxPushing{}-v0'.format(reward_type),
-        entry_point='fancy_gym.envs.mujoco:BoxPushingEnv',
+        entry_point='fancy_gym.envs.mujoco:BoxPushing{}'.format(reward_type),
         max_episode_steps=MAX_EPISODE_STEPS_BOX_PUSHING//10,  # divided by frames skip
         kwargs={
-            "reward_type": reward_type,
             "frame_skip": 10
         }
     )
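For reference, the registered step-based IDs are unchanged by this commit; only their entry points now resolve to the per-reward-type classes. A minimal usage sketch (assuming that importing fancy_gym runs the register() calls above and that the pre-gymnasium gym step API is in use):

import gym
import fancy_gym  # noqa: F401 -- importing fancy_gym registers the environments above

env = gym.make("BoxPushingTemporalSparse-v0")
obs = env.reset()
for _ in range(20):
    obs, reward, done, info = env.step(env.action_space.sample())
    if done:
        obs = env.reset()
env.close()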

View File

@@ -7,4 +7,4 @@ from .hopper_throw.hopper_throw import HopperThrowEnv
 from .hopper_throw.hopper_throw_in_basket import HopperThrowInBasketEnv
 from .reacher.reacher import ReacherEnv
 from .walker_2d_jump.walker_2d_jump import Walker2dJumpEnv
-from .box_pushing.box_pushing_env import BoxPushingEnv
+from .box_pushing.box_pushing_env import BoxPushingDense, BoxPushingTemporalSparse, BoxPushingTemporalSpatialSparse

View File

@@ -5,7 +5,7 @@ from gym import utils, spaces
 from gym.envs.mujoco import MujocoEnv
 from fancy_gym.envs.mujoco.box_pushing.box_pushing_utils import rot_to_quat, get_quaternion_error, rotation_distance
 from fancy_gym.envs.mujoco.box_pushing.box_pushing_utils import q_max, q_min, q_dot_max, q_torque_max
-from fancy_gym.envs.mujoco.box_pushing.box_pushing_utils import BoxPushingReward
+from fancy_gym.envs.mujoco.box_pushing.box_pushing_utils import desired_rod_quat
 import mujoco
@@ -13,7 +13,7 @@ MAX_EPISODE_STEPS_BOX_PUSHING = 1000
 BOX_POS_BOUND = np.array([[0.3, -0.45, -0.01], [0.6, 0.45, -0.01]])

-class BoxPushingEnv(MujocoEnv, utils.EzPickle):
+class BoxPushingEnvBase(MujocoEnv, utils.EzPickle):
     """
     franka box pushing environment
     action space:
@@ -26,14 +26,18 @@ class BoxPushingEnv(MujocoEnv, utils.EzPickle):
         3. time-spatial-depend sparse reward
     """

-    def __init__(self, reward_type: str = "Dense", frame_skip: int = 10):
+    def __init__(self, frame_skip: int = 10):
         utils.EzPickle.__init__(**locals())
         self._steps = 0
         self.init_qpos_box_pushing = np.array([0., 0., 0., -1.5, 0., 1.5, 0., 0., 0., 0.6, 0.45, 0.0, 1., 0., 0., 0.])
         self.init_qvel_box_pushing = np.zeros(15)
         self.frame_skip = frame_skip
-        assert reward_type in ["Dense", "TemporalSparse", "TemporalSpatialSparse"], "unrecognized reward type"
-        self.reward = BoxPushingReward(reward_type, q_max, q_min, q_dot_max)
+        self._q_max = q_max
+        self._q_min = q_min
+        self._q_dot_max = q_dot_max
+        self._desired_rod_quat = desired_rod_quat
         self._episode_energy = 0.
         MujocoEnv.__init__(self,
                            model_path=os.path.join(os.path.dirname(__file__), "assets", "box_pushing.xml"),
@@ -68,8 +72,8 @@ class BoxPushingEnv(MujocoEnv, utils.EzPickle):
         qvel = self.data.qvel[:7].copy()

         if not unstable_simulation:
-            reward = self.reward.get_reward(episode_end, box_pos, box_quat, target_pos, target_quat,
-                                            rod_tip_pos, rod_quat, qpos, qvel, action)
+            reward = self._get_reward(episode_end, box_pos, box_quat, target_pos, target_quat,
+                                      rod_tip_pos, rod_quat, qpos, qvel, action)
         else:
             reward = -50
@@ -96,9 +100,9 @@ class BoxPushingEnv(MujocoEnv, utils.EzPickle):
         box_target_pos = self.sample_context()
         while np.linalg.norm(box_target_pos[:2] - box_init_pos[:2]) < 0.3:
             box_target_pos = self.sample_context()
-        #box_target_pos[0] = 0.4
-        #box_target_pos[1] = -0.3
-        #box_target_pos[-4:] = np.array([0.0, 0.0, 0.0, 1.0])
+        # box_target_pos[0] = 0.4
+        # box_target_pos[1] = -0.3
+        # box_target_pos[-4:] = np.array([0.0, 0.0, 0.0, 1.0])
         self.model.body_pos[2] = box_target_pos[:3]
         self.model.body_quat[2] = box_target_pos[-4:]
         self.model.body_pos[3] = box_target_pos[:3]
@@ -122,6 +126,10 @@ class BoxPushingEnv(MujocoEnv, utils.EzPickle):
         quat = rot_to_quat(theta, np.array([0, 0, 1]))
         return np.concatenate([pos, quat])

+    def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat,
+                    rod_tip_pos, rod_quat, qpos, qvel, action):
+        raise NotImplementedError
+
     def _get_obs(self):
         obs = np.concatenate([
             self.data.qpos[:7].copy(),  # joint position
@@ -136,6 +144,22 @@ class BoxPushingEnv(MujocoEnv, utils.EzPickle):
         ])
         return obs

+    def _joint_limit_violate_penalty(self, qpos, qvel, enable_pos_limit=False, enable_vel_limit=False):
+        penalty = 0.
+        p_coeff = 1.
+        v_coeff = 1.
+        # q_limit
+        if enable_pos_limit:
+            higher_error = qpos - self._q_max
+            lower_error = self._q_min - qpos
+            penalty -= p_coeff * (abs(np.sum(higher_error[qpos > self._q_max])) +
+                                  abs(np.sum(lower_error[qpos < self._q_min])))
+        # q_dot_limit
+        if enable_vel_limit:
+            q_dot_error = abs(qvel) - abs(self._q_dot_max)
+            penalty -= v_coeff * abs(np.sum(q_dot_error[q_dot_error > 0.]))
+        return penalty
+
     def get_body_jacp(self, name):
         id = mujoco.mj_name2id(self.model, 1, name)
         jacp = np.zeros((3, self.model.nv))
@@ -252,8 +276,89 @@ class BoxPushingEnv(MujocoEnv, utils.EzPickle):
         return q

+
+class BoxPushingDense(BoxPushingEnvBase):
+    def __init__(self, frame_skip: int = 10):
+        super(BoxPushingDense, self).__init__(frame_skip=frame_skip)
+
+    def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat,
+                    rod_tip_pos, rod_quat, qpos, qvel, action):
+        joint_penalty = self._joint_limit_violate_penalty(qpos,
+                                                          qvel,
+                                                          enable_pos_limit=True,
+                                                          enable_vel_limit=True)
+        tcp_box_dist_reward = -2 * np.clip(np.linalg.norm(box_pos - rod_tip_pos), 0.05, 100)
+        box_goal_pos_dist_reward = -3.5 * np.linalg.norm(box_pos - target_pos)
+        box_goal_rot_dist_reward = -rotation_distance(box_quat, target_quat) / np.pi
+        energy_cost = -0.0005 * np.sum(np.square(action))
+        reward = joint_penalty + tcp_box_dist_reward + \
+            box_goal_pos_dist_reward + box_goal_rot_dist_reward + energy_cost
+
+        rod_inclined_angle = rotation_distance(rod_quat, self._desired_rod_quat)
+        if rod_inclined_angle > np.pi / 4:
+            reward -= rod_inclined_angle / (np.pi)
+
+        return reward
+
+
+class BoxPushingTemporalSparse(BoxPushingEnvBase):
+    def __init__(self, frame_skip: int = 10):
+        super(BoxPushingTemporalSparse, self).__init__(frame_skip=frame_skip)
+
+    def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat,
+                    rod_tip_pos, rod_quat, qpos, qvel, action):
+        reward = 0.
+        joint_penalty = self._joint_limit_violate_penalty(qpos, qvel, enable_pos_limit=True, enable_vel_limit=True)
+        energy_cost = -0.0005 * np.sum(np.square(action))
+        tcp_box_dist_reward = -2 * np.clip(np.linalg.norm(box_pos - rod_tip_pos), 0.05, 100)
+        reward += joint_penalty + tcp_box_dist_reward + energy_cost
+
+        rod_inclined_angle = rotation_distance(rod_quat, desired_rod_quat)
+        if rod_inclined_angle > np.pi / 4:
+            reward -= rod_inclined_angle / (np.pi)
+
+        if not episode_end:
+            return reward
+
+        box_goal_dist = np.linalg.norm(box_pos - target_pos)
+        box_goal_pos_dist_reward = -3.5 * box_goal_dist * 100
+        box_goal_rot_dist_reward = -rotation_distance(box_quat, target_quat) / np.pi * 100
+        reward += box_goal_pos_dist_reward + box_goal_rot_dist_reward
+
+        return reward
+
+
+class BoxPushingTemporalSpatialSparse(BoxPushingEnvBase):
+    def __init__(self, frame_skip: int = 10):
+        super(BoxPushingTemporalSpatialSparse, self).__init__(frame_skip=frame_skip)
+
+    def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat,
+                    rod_tip_pos, rod_quat, qpos, qvel, action):
+        reward = 0.
+        joint_penalty = self._joint_limit_violate_penalty(qpos, qvel, enable_pos_limit=True, enable_vel_limit=True)
+        energy_cost = -0.0005 * np.sum(np.square(action))
+        tcp_box_dist_reward = -2 * np.clip(np.linalg.norm(box_pos - rod_tip_pos), 0.05, 100)
+        reward += joint_penalty + tcp_box_dist_reward + energy_cost
+
+        rod_inclined_angle = rotation_distance(rod_quat, desired_rod_quat)
+        if rod_inclined_angle > np.pi / 4:
+            reward -= rod_inclined_angle / (np.pi)
+
+        if not episode_end:
+            return reward
+
+        box_goal_dist = np.linalg.norm(box_pos - target_pos)
+
+        if box_goal_dist < 0.1:
+            reward += 300
+
+        box_goal_pos_dist_reward = np.clip(- 3.5 * box_goal_dist * 100 * 3, -100, 0)
+        box_goal_rot_dist_reward = np.clip(- rotation_distance(box_quat, target_quat) / np.pi * 100 * 1.5, -100, 0)
+        reward += box_goal_pos_dist_reward + box_goal_rot_dist_reward
+
+        return reward
+
+
 if __name__=="__main__":
-    env = BoxPushingEnv(reward_type="dense", frame_skip=10)
+    env = BoxPushingTemporalSpatialSparse(frame_skip=10)
     env.reset()
     for i in range(100):
         env.reset()
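With the refactor above, adding another reward variant only requires subclassing BoxPushingEnvBase and overriding _get_reward. A hypothetical sketch (this class is not part of the commit; the name and reward terms are illustrative):

import numpy as np
from fancy_gym.envs.mujoco.box_pushing.box_pushing_env import BoxPushingEnvBase


class BoxPushingSpatialOnly(BoxPushingEnvBase):
    # Hypothetical variant: per-step regularizers from the base class plus a
    # single success bonus when the box ends up close to the target position.
    def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat,
                    rod_tip_pos, rod_quat, qpos, qvel, action):
        reward = self._joint_limit_violate_penalty(qpos, qvel,
                                                   enable_pos_limit=True,
                                                   enable_vel_limit=True)
        reward += -0.0005 * np.sum(np.square(action))
        if episode_end and np.linalg.norm(box_pos - target_pos) < 0.05:
            reward += 100.
        return reward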

View File

@@ -10,184 +10,44 @@ q_torque_max = np.array([90., 90., 90., 90., 12., 12., 12.])
 #
 desired_rod_quat = np.array([0.0, 1.0, 0.0, 0.0])

+
+def skew(x):
+    """
+    Returns the skew-symmetric matrix of x
+    param x: 3x1 vector
+    """
+    return np.array([[0, -x[2], x[1]], [x[2], 0, -x[0]], [-x[1], x[0], 0]])
+
+
 def get_quaternion_error(curr_quat, des_quat):
     """
     Calculates the difference between the current quaternion and the desired quaternion.
     See Siciliano textbook page 140 Eq 3.91
-    :param curr_quat: current quaternion
-    :param des_quat: desired quaternion
-    :return: difference between current quaternion and desired quaternion
+    param curr_quat: current quaternion
+    param des_quat: desired quaternion
+    return: difference between current quaternion and desired quaternion
     """
-    quatError = np.zeros((3, ))
-    quatError[0] = (curr_quat[0] * des_quat[1] - des_quat[0] * curr_quat[1] -
-                    curr_quat[3] * des_quat[2] + curr_quat[2] * des_quat[3])
-    quatError[1] = (curr_quat[0] * des_quat[2] - des_quat[0] * curr_quat[2] +
-                    curr_quat[3] * des_quat[1] - curr_quat[1] * des_quat[3])
-    quatError[2] = (curr_quat[0] * des_quat[3] - des_quat[0] * curr_quat[3] -
-                    curr_quat[2] * des_quat[1] + curr_quat[1] * des_quat[2])
-    return quatError
+    return curr_quat[0] * des_quat[1:] - des_quat[0] * curr_quat[1:] - skew(des_quat[1:]) @ curr_quat[1:]


 def rotation_distance(p: np.array, q: np.array):
     """
-    p: quaternion
-    q: quaternion
+    Calculates the rotation angular between two quaternions
+    param p: quaternion
+    param q: quaternion
     theta: rotation angle between p and q (rad)
     """
     assert p.shape == q.shape, "p and q should be quaternion"
-    product = p[0] * q[0] + p[1] * q[1] + p[2] * q[2] + p[3] * q[3]
-    theta = 2 * np.arccos(abs(product))
+    theta = 2 * np.arccos(abs(p @ q))
     return theta


 def rot_to_quat(theta, axis):
+    """
+    Converts rotation angle along an axis to quaternion
+    param theta: rotation angle (rad)
+    param axis: rotation axis
+    return: quaternion
+    """
     quant = np.zeros(4)
     quant[0] = np.sin(theta / 2.)
-    quant[1] = np.cos(theta / 2.) * axis[0]
-    quant[2] = np.cos(theta / 2.) * axis[1]
-    quant[3] = np.cos(theta / 2.) * axis[2]
+    quant[1:] = np.cos(theta / 2.) * axis
     return quant
-
-
-class RewardBase():
-    def __init__(self, q_max, q_min, q_dot_max):
-        self._reward = 0.
-        self._done = False
-        self._q_max = q_max
-        self._q_min = q_min
-        self._q_dot_max = q_dot_max
-
-    def get_reward(self, episodic_end, box_pos, box_quat, target_pos, target_quat,
-                   rod_tip_pos, rod_quat, qpos, qvel, action):
-        raise NotImplementedError
-
-    def _joint_limit_violate_penalty(self,
-                                     qpos,
-                                     qvel,
-                                     enable_pos_limit=False,
-                                     enable_vel_limit=False):
-        penalty = 0.
-        p_coeff = 1.
-        v_coeff = 1.
-        # q_limit
-        if enable_pos_limit:
-            higher_indice = np.where(qpos > self._q_max)
-            lower_indice = np.where(qpos < self._q_min)
-            higher_error = qpos - self._q_max
-            lower_error = self._q_min - qpos
-            penalty -= p_coeff * (abs(np.sum(higher_error[higher_indice])) +
-                                  abs(np.sum(lower_error[lower_indice])))
-        # q_dot_limit
-        if enable_vel_limit:
-            q_dot_error = abs(qvel) - abs(self._q_dot_max)
-            q_dot_violate_idx = np.where(q_dot_error > 0.)
-            penalty -= v_coeff * abs(np.sum(q_dot_error[q_dot_violate_idx]))
-        return penalty
-
-
-class DenseReward(RewardBase):
-    def __init__(self, q_max, q_min, q_dot_max):
-        super(DenseReward, self).__init__(q_max, q_min, q_dot_max)
-
-    def get_reward(self, episodic_end, box_pos, box_quat, target_pos, target_quat,
-                   rod_tip_pos, rod_quat, qpos, qvel, action):
-        joint_penalty = self._joint_limit_violate_penalty(qpos,
-                                                          qvel,
-                                                          enable_pos_limit=True,
-                                                          enable_vel_limit=True)
-        tcp_box_dist_reward = -2 * np.clip(np.linalg.norm(box_pos - rod_tip_pos), 0.05, 100)
-        box_goal_pos_dist_reward = -3.5 * np.linalg.norm(box_pos - target_pos)
-        box_goal_rot_dist_reward = -rotation_distance(box_quat, target_quat) / np.pi
-        energy_cost = -0.0005 * np.sum(np.square(action))
-        reward = joint_penalty + tcp_box_dist_reward + \
-            box_goal_pos_dist_reward + box_goal_rot_dist_reward + energy_cost
-
-        rod_inclined_angle = rotation_distance(rod_quat, desired_rod_quat)
-        if rod_inclined_angle > np.pi / 4:
-            reward -= rod_inclined_angle / (np.pi)
-
-        return reward
-
-
-class TemporalSparseReward(RewardBase):
-    def __init__(self, q_max, q_min, q_dot_max):
-        super(TemporalSparseReward, self).__init__(q_max, q_min, q_dot_max)
-
-    def get_reward(self, episodic_end, box_pos, box_quat, target_pos, target_quat,
-                   rod_tip_pos, rod_quat, qpos, qvel, action):
-        reward = 0.
-        joint_penalty = self._joint_limit_violate_penalty(qpos, qvel, enable_pos_limit=True, enable_vel_limit=True)
-        energy_cost = -0.0005 * np.sum(np.square(action))
-        tcp_box_dist_reward = -2 * np.clip(np.linalg.norm(box_pos - rod_tip_pos), 0.05, 100)
-        reward += joint_penalty + tcp_box_dist_reward + energy_cost
-
-        rod_inclined_angle = rotation_distance(rod_quat, desired_rod_quat)
-        if rod_inclined_angle > np.pi / 4:
-            reward -= rod_inclined_angle / (np.pi)
-
-        if not episodic_end:
-            return reward
-
-        box_goal_dist = np.linalg.norm(box_pos - target_pos)
-        box_goal_pos_dist_reward = -3.5 * box_goal_dist * 100
-        box_goal_rot_dist_reward = -rotation_distance(box_quat, target_quat) / np.pi * 100
-        reward += box_goal_pos_dist_reward + box_goal_rot_dist_reward
-
-        return reward
-
-
-class TemporalSpatialSparseReward(RewardBase):
-    def __init__(self, q_max, q_min, q_dot_max):
-        super(TemporalSpatialSparseReward, self).__init__(q_max, q_min, q_dot_max)
-
-    def get_reward(self, episodic_end, box_pos, box_quat, target_pos, target_quat,
-                   rod_tip_pos, rod_quat, qpos, qvel, action):
-        reward = 0.
-        joint_penalty = self._joint_limit_violate_penalty(qpos, qvel, enable_pos_limit=True, enable_vel_limit=True)
-        energy_cost = -0.0005 * np.sum(np.square(action))
-        tcp_box_dist_reward = -2 * np.clip(np.linalg.norm(box_pos - rod_tip_pos), 0.05, 100)
-        reward += joint_penalty + tcp_box_dist_reward + energy_cost
-
-        rod_inclined_angle = rotation_distance(rod_quat, desired_rod_quat)
-        if rod_inclined_angle > np.pi / 4:
-            reward -= rod_inclined_angle / (np.pi)
-
-        if not episodic_end:
-            return reward
-
-        box_goal_dist = np.linalg.norm(box_pos - target_pos)
-
-        if box_goal_dist < 0.1:
-            reward += 300
-
-        box_goal_pos_dist_reward = np.clip(- 3.5 * box_goal_dist * 100 * 3, -100, 0)
-        box_goal_rot_dist_reward = np.clip(- rotation_distance(box_quat, target_quat)/np.pi * 100 * 1.5, -100, 0)
-        reward += box_goal_pos_dist_reward + box_goal_rot_dist_reward
-
-        return reward
-
-
-def BoxPushingReward(reward_type, q_max, q_min, q_dot_max):
-    if reward_type == 'Dense':
-        return DenseReward(q_max, q_min, q_dot_max)
-    elif reward_type == 'TemporalSparse':
-        return TemporalSparseReward(q_max, q_min, q_dot_max)
-    elif reward_type == 'TemporalSpatialSparse':
-        return TemporalSpatialSparseReward(q_max, q_min, q_dot_max)
-    else:
-        raise NotImplementedError
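As a quick illustrative check (not part of the commit), the vectorized get_quaternion_error kept above reproduces the element-wise Siciliano formulation that this commit removes:

import numpy as np
from fancy_gym.envs.mujoco.box_pushing.box_pushing_utils import get_quaternion_error


def quaternion_error_elementwise(curr_quat, des_quat):
    # The pre-refactor element-wise version, kept here only for comparison.
    err = np.zeros(3)
    err[0] = (curr_quat[0] * des_quat[1] - des_quat[0] * curr_quat[1] -
              curr_quat[3] * des_quat[2] + curr_quat[2] * des_quat[3])
    err[1] = (curr_quat[0] * des_quat[2] - des_quat[0] * curr_quat[2] +
              curr_quat[3] * des_quat[1] - curr_quat[1] * des_quat[3])
    err[2] = (curr_quat[0] * des_quat[3] - des_quat[0] * curr_quat[3] -
              curr_quat[2] * des_quat[1] + curr_quat[1] * des_quat[2])
    return err


rng = np.random.default_rng(0)
q1 = rng.normal(size=4)
q1 /= np.linalg.norm(q1)
q2 = rng.normal(size=4)
q2 /= np.linalg.norm(q2)
assert np.allclose(get_quaternion_error(q1, q2), quaternion_error_elementwise(q1, q2))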

View File

@@ -157,17 +157,17 @@ def example_fully_custom_mp(seed=1, iterations=1, render=True):
 if __name__ == '__main__':
     render = True
     # DMP
-    # example_mp("HoleReacherDMP-v0", seed=10, iterations=5, render=render)
+    example_mp("HoleReacherDMP-v0", seed=10, iterations=5, render=render)

     # ProMP
-    # example_mp("HoleReacherProMP-v0", seed=10, iterations=5, render=render)
-    # example_mp("BoxPushingDenseProMP-v0", seed=10, iterations=50, render=render)
+    example_mp("HoleReacherProMP-v0", seed=10, iterations=5, render=render)
+    example_mp("BoxPushingTemporalSparseProMP-v0", seed=10, iterations=1, render=render)

     # ProDMP
-    example_mp("BoxPushingDenseProDMP-v0", seed=10, iterations=50, render=render)
+    example_mp("BoxPushingDenseProDMP-v0", seed=10, iterations=1, render=render)

     # Altered basis functions
-    # obs1 = example_custom_mp("Reacher5dProMP-v0", seed=10, iterations=1, render=render)
+    obs1 = example_custom_mp("Reacher5dProMP-v0", seed=10, iterations=1, render=render)

     # Custom MP
-    # example_fully_custom_mp(seed=10, iterations=1, render=render)
+    example_fully_custom_mp(seed=10, iterations=1, render=render)