modified according to comments
This commit is contained in:
parent
1fd4a1e848
commit
bdd51ba61f
@ -16,7 +16,8 @@ from .mujoco.hopper_throw.hopper_throw import MAX_EPISODE_STEPS_HOPPERTHROW
|
|||||||
from .mujoco.hopper_throw.hopper_throw_in_basket import MAX_EPISODE_STEPS_HOPPERTHROWINBASKET
|
from .mujoco.hopper_throw.hopper_throw_in_basket import MAX_EPISODE_STEPS_HOPPERTHROWINBASKET
|
||||||
from .mujoco.reacher.reacher import ReacherEnv, MAX_EPISODE_STEPS_REACHER
|
from .mujoco.reacher.reacher import ReacherEnv, MAX_EPISODE_STEPS_REACHER
|
||||||
from .mujoco.walker_2d_jump.walker_2d_jump import MAX_EPISODE_STEPS_WALKERJUMP
|
from .mujoco.walker_2d_jump.walker_2d_jump import MAX_EPISODE_STEPS_WALKERJUMP
|
||||||
from .mujoco.box_pushing.box_pushing_env import BoxPushingEnv, MAX_EPISODE_STEPS_BOX_PUSHING
|
from .mujoco.box_pushing.box_pushing_env import BoxPushingDense, BoxPushingTemporalSparse, \
|
||||||
|
BoxPushingTemporalSpatialSparse, MAX_EPISODE_STEPS_BOX_PUSHING
|
||||||
|
|
||||||
ALL_FANCY_MOVEMENT_PRIMITIVE_ENVIRONMENTS = {"DMP": [], "ProMP": [], "ProDMP": []}
|
ALL_FANCY_MOVEMENT_PRIMITIVE_ENVIRONMENTS = {"DMP": [], "ProMP": [], "ProDMP": []}
|
||||||
|
|
||||||
@ -232,10 +233,9 @@ register(
|
|||||||
for reward_type in ["Dense", "TemporalSparse", "TemporalSpatialSparse"]:
|
for reward_type in ["Dense", "TemporalSparse", "TemporalSpatialSparse"]:
|
||||||
register(
|
register(
|
||||||
id='BoxPushing{}-v0'.format(reward_type),
|
id='BoxPushing{}-v0'.format(reward_type),
|
||||||
entry_point='fancy_gym.envs.mujoco:BoxPushingEnv',
|
entry_point='fancy_gym.envs.mujoco:BoxPushing{}'.format(reward_type),
|
||||||
max_episode_steps=MAX_EPISODE_STEPS_BOX_PUSHING//10, # divided by frames skip
|
max_episode_steps=MAX_EPISODE_STEPS_BOX_PUSHING//10, # divided by frames skip
|
||||||
kwargs={
|
kwargs={
|
||||||
"reward_type": reward_type,
|
|
||||||
"frame_skip": 10
|
"frame_skip": 10
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
@ -7,4 +7,4 @@ from .hopper_throw.hopper_throw import HopperThrowEnv
|
|||||||
from .hopper_throw.hopper_throw_in_basket import HopperThrowInBasketEnv
|
from .hopper_throw.hopper_throw_in_basket import HopperThrowInBasketEnv
|
||||||
from .reacher.reacher import ReacherEnv
|
from .reacher.reacher import ReacherEnv
|
||||||
from .walker_2d_jump.walker_2d_jump import Walker2dJumpEnv
|
from .walker_2d_jump.walker_2d_jump import Walker2dJumpEnv
|
||||||
from .box_pushing.box_pushing_env import BoxPushingEnv
|
from .box_pushing.box_pushing_env import BoxPushingDense, BoxPushingTemporalSparse, BoxPushingTemporalSpatialSparse
|
||||||
|
@ -5,7 +5,7 @@ from gym import utils, spaces
|
|||||||
from gym.envs.mujoco import MujocoEnv
|
from gym.envs.mujoco import MujocoEnv
|
||||||
from fancy_gym.envs.mujoco.box_pushing.box_pushing_utils import rot_to_quat, get_quaternion_error, rotation_distance
|
from fancy_gym.envs.mujoco.box_pushing.box_pushing_utils import rot_to_quat, get_quaternion_error, rotation_distance
|
||||||
from fancy_gym.envs.mujoco.box_pushing.box_pushing_utils import q_max, q_min, q_dot_max, q_torque_max
|
from fancy_gym.envs.mujoco.box_pushing.box_pushing_utils import q_max, q_min, q_dot_max, q_torque_max
|
||||||
from fancy_gym.envs.mujoco.box_pushing.box_pushing_utils import BoxPushingReward
|
from fancy_gym.envs.mujoco.box_pushing.box_pushing_utils import desired_rod_quat
|
||||||
|
|
||||||
import mujoco
|
import mujoco
|
||||||
|
|
||||||
@ -13,7 +13,7 @@ MAX_EPISODE_STEPS_BOX_PUSHING = 1000
|
|||||||
|
|
||||||
BOX_POS_BOUND = np.array([[0.3, -0.45, -0.01], [0.6, 0.45, -0.01]])
|
BOX_POS_BOUND = np.array([[0.3, -0.45, -0.01], [0.6, 0.45, -0.01]])
|
||||||
|
|
||||||
class BoxPushingEnv(MujocoEnv, utils.EzPickle):
|
class BoxPushingEnvBase(MujocoEnv, utils.EzPickle):
|
||||||
"""
|
"""
|
||||||
franka box pushing environment
|
franka box pushing environment
|
||||||
action space:
|
action space:
|
||||||
@ -26,14 +26,18 @@ class BoxPushingEnv(MujocoEnv, utils.EzPickle):
|
|||||||
3. time-spatial-depend sparse reward
|
3. time-spatial-depend sparse reward
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, reward_type: str = "Dense", frame_skip: int = 10):
|
def __init__(self, frame_skip: int = 10):
|
||||||
utils.EzPickle.__init__(**locals())
|
utils.EzPickle.__init__(**locals())
|
||||||
self._steps = 0
|
self._steps = 0
|
||||||
self.init_qpos_box_pushing = np.array([0., 0., 0., -1.5, 0., 1.5, 0., 0., 0., 0.6, 0.45, 0.0, 1., 0., 0., 0.])
|
self.init_qpos_box_pushing = np.array([0., 0., 0., -1.5, 0., 1.5, 0., 0., 0., 0.6, 0.45, 0.0, 1., 0., 0., 0.])
|
||||||
self.init_qvel_box_pushing = np.zeros(15)
|
self.init_qvel_box_pushing = np.zeros(15)
|
||||||
self.frame_skip = frame_skip
|
self.frame_skip = frame_skip
|
||||||
assert reward_type in ["Dense", "TemporalSparse", "TemporalSpatialSparse"], "unrecognized reward type"
|
|
||||||
self.reward = BoxPushingReward(reward_type, q_max, q_min, q_dot_max)
|
self._q_max = q_max
|
||||||
|
self._q_min = q_min
|
||||||
|
self._q_dot_max = q_dot_max
|
||||||
|
self._desired_rod_quat = desired_rod_quat
|
||||||
|
|
||||||
self._episode_energy = 0.
|
self._episode_energy = 0.
|
||||||
MujocoEnv.__init__(self,
|
MujocoEnv.__init__(self,
|
||||||
model_path=os.path.join(os.path.dirname(__file__), "assets", "box_pushing.xml"),
|
model_path=os.path.join(os.path.dirname(__file__), "assets", "box_pushing.xml"),
|
||||||
@ -68,7 +72,7 @@ class BoxPushingEnv(MujocoEnv, utils.EzPickle):
|
|||||||
qvel = self.data.qvel[:7].copy()
|
qvel = self.data.qvel[:7].copy()
|
||||||
|
|
||||||
if not unstable_simulation:
|
if not unstable_simulation:
|
||||||
reward = self.reward.get_reward(episode_end, box_pos, box_quat, target_pos, target_quat,
|
reward = self._get_reward(episode_end, box_pos, box_quat, target_pos, target_quat,
|
||||||
rod_tip_pos, rod_quat, qpos, qvel, action)
|
rod_tip_pos, rod_quat, qpos, qvel, action)
|
||||||
else:
|
else:
|
||||||
reward = -50
|
reward = -50
|
||||||
@ -122,6 +126,10 @@ class BoxPushingEnv(MujocoEnv, utils.EzPickle):
|
|||||||
quat = rot_to_quat(theta, np.array([0, 0, 1]))
|
quat = rot_to_quat(theta, np.array([0, 0, 1]))
|
||||||
return np.concatenate([pos, quat])
|
return np.concatenate([pos, quat])
|
||||||
|
|
||||||
|
def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat,
|
||||||
|
rod_tip_pos, rod_quat, qpos, qvel, action):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
def _get_obs(self):
|
def _get_obs(self):
|
||||||
obs = np.concatenate([
|
obs = np.concatenate([
|
||||||
self.data.qpos[:7].copy(), # joint position
|
self.data.qpos[:7].copy(), # joint position
|
||||||
@ -136,6 +144,22 @@ class BoxPushingEnv(MujocoEnv, utils.EzPickle):
|
|||||||
])
|
])
|
||||||
return obs
|
return obs
|
||||||
|
|
||||||
|
def _joint_limit_violate_penalty(self, qpos, qvel, enable_pos_limit=False, enable_vel_limit=False):
|
||||||
|
penalty = 0.
|
||||||
|
p_coeff = 1.
|
||||||
|
v_coeff = 1.
|
||||||
|
# q_limit
|
||||||
|
if enable_pos_limit:
|
||||||
|
higher_error = qpos - self._q_max
|
||||||
|
lower_error = self._q_min - qpos
|
||||||
|
penalty -= p_coeff * (abs(np.sum(higher_error[qpos > self._q_max])) +
|
||||||
|
abs(np.sum(lower_error[qpos < self._q_min])))
|
||||||
|
# q_dot_limit
|
||||||
|
if enable_vel_limit:
|
||||||
|
q_dot_error = abs(qvel) - abs(self._q_dot_max)
|
||||||
|
penalty -= v_coeff * abs(np.sum(q_dot_error[q_dot_error > 0.]))
|
||||||
|
return penalty
|
||||||
|
|
||||||
def get_body_jacp(self, name):
|
def get_body_jacp(self, name):
|
||||||
id = mujoco.mj_name2id(self.model, 1, name)
|
id = mujoco.mj_name2id(self.model, 1, name)
|
||||||
jacp = np.zeros((3, self.model.nv))
|
jacp = np.zeros((3, self.model.nv))
|
||||||
@ -252,8 +276,89 @@ class BoxPushingEnv(MujocoEnv, utils.EzPickle):
|
|||||||
|
|
||||||
return q
|
return q
|
||||||
|
|
||||||
|
class BoxPushingDense(BoxPushingEnvBase):
|
||||||
|
def __init__(self, frame_skip: int = 10):
|
||||||
|
super(BoxPushingDense, self).__init__(frame_skip=frame_skip)
|
||||||
|
def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat,
|
||||||
|
rod_tip_pos, rod_quat, qpos, qvel, action):
|
||||||
|
joint_penalty = self._joint_limit_violate_penalty(qpos,
|
||||||
|
qvel,
|
||||||
|
enable_pos_limit=True,
|
||||||
|
enable_vel_limit=True)
|
||||||
|
tcp_box_dist_reward = -2 * np.clip(np.linalg.norm(box_pos - rod_tip_pos), 0.05, 100)
|
||||||
|
box_goal_pos_dist_reward = -3.5 * np.linalg.norm(box_pos - target_pos)
|
||||||
|
box_goal_rot_dist_reward = -rotation_distance(box_quat, target_quat) / np.pi
|
||||||
|
energy_cost = -0.0005 * np.sum(np.square(action))
|
||||||
|
|
||||||
|
reward = joint_penalty + tcp_box_dist_reward + \
|
||||||
|
box_goal_pos_dist_reward + box_goal_rot_dist_reward + energy_cost
|
||||||
|
|
||||||
|
rod_inclined_angle = rotation_distance(rod_quat, self._desired_rod_quat)
|
||||||
|
if rod_inclined_angle > np.pi / 4:
|
||||||
|
reward -= rod_inclined_angle / (np.pi)
|
||||||
|
|
||||||
|
return reward
|
||||||
|
|
||||||
|
class BoxPushingTemporalSparse(BoxPushingEnvBase):
|
||||||
|
def __init__(self, frame_skip: int = 10):
|
||||||
|
super(BoxPushingTemporalSparse, self).__init__(frame_skip=frame_skip)
|
||||||
|
|
||||||
|
def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat,
|
||||||
|
rod_tip_pos, rod_quat, qpos, qvel, action):
|
||||||
|
reward = 0.
|
||||||
|
joint_penalty = self._joint_limit_violate_penalty(qpos, qvel, enable_pos_limit=True, enable_vel_limit=True)
|
||||||
|
energy_cost = -0.0005 * np.sum(np.square(action))
|
||||||
|
tcp_box_dist_reward = -2 * np.clip(np.linalg.norm(box_pos - rod_tip_pos), 0.05, 100)
|
||||||
|
reward += joint_penalty + tcp_box_dist_reward + energy_cost
|
||||||
|
rod_inclined_angle = rotation_distance(rod_quat, desired_rod_quat)
|
||||||
|
|
||||||
|
if rod_inclined_angle > np.pi / 4:
|
||||||
|
reward -= rod_inclined_angle / (np.pi)
|
||||||
|
|
||||||
|
if not episode_end:
|
||||||
|
return reward
|
||||||
|
|
||||||
|
box_goal_dist = np.linalg.norm(box_pos - target_pos)
|
||||||
|
|
||||||
|
box_goal_pos_dist_reward = -3.5 * box_goal_dist * 100
|
||||||
|
box_goal_rot_dist_reward = -rotation_distance(box_quat, target_quat) / np.pi * 100
|
||||||
|
|
||||||
|
reward += box_goal_pos_dist_reward + box_goal_rot_dist_reward
|
||||||
|
|
||||||
|
return reward
|
||||||
|
|
||||||
|
class BoxPushingTemporalSpatialSparse(BoxPushingEnvBase):
|
||||||
|
|
||||||
|
def __init__(self, frame_skip: int = 10):
|
||||||
|
super(BoxPushingTemporalSpatialSparse, self).__init__(frame_skip=frame_skip)
|
||||||
|
|
||||||
|
def _get_reward(self, episode_end, box_pos, box_quat, target_pos, target_quat,
|
||||||
|
rod_tip_pos, rod_quat, qpos, qvel, action):
|
||||||
|
reward = 0.
|
||||||
|
joint_penalty = self._joint_limit_violate_penalty(qpos, qvel, enable_pos_limit=True, enable_vel_limit=True)
|
||||||
|
energy_cost = -0.0005 * np.sum(np.square(action))
|
||||||
|
tcp_box_dist_reward = -2 * np.clip(np.linalg.norm(box_pos - rod_tip_pos), 0.05, 100)
|
||||||
|
reward += joint_penalty + tcp_box_dist_reward + energy_cost
|
||||||
|
rod_inclined_angle = rotation_distance(rod_quat, desired_rod_quat)
|
||||||
|
|
||||||
|
if rod_inclined_angle > np.pi / 4:
|
||||||
|
reward -= rod_inclined_angle / (np.pi)
|
||||||
|
|
||||||
|
if not episode_end:
|
||||||
|
return reward
|
||||||
|
|
||||||
|
box_goal_dist = np.linalg.norm(box_pos - target_pos)
|
||||||
|
|
||||||
|
if box_goal_dist < 0.1:
|
||||||
|
reward += 300
|
||||||
|
box_goal_pos_dist_reward = np.clip(- 3.5 * box_goal_dist * 100 * 3, -100, 0)
|
||||||
|
box_goal_rot_dist_reward = np.clip(- rotation_distance(box_quat, target_quat)/np.pi * 100 * 1.5, -100, 0)
|
||||||
|
reward += box_goal_pos_dist_reward + box_goal_rot_dist_reward
|
||||||
|
|
||||||
|
return reward
|
||||||
|
|
||||||
if __name__=="__main__":
|
if __name__=="__main__":
|
||||||
env = BoxPushingEnv(reward_type="dense", frame_skip=10)
|
env = BoxPushingTemporalSpatialSparse(frame_skip=10)
|
||||||
env.reset()
|
env.reset()
|
||||||
for i in range(100):
|
for i in range(100):
|
||||||
env.reset()
|
env.reset()
|
||||||
|
@ -10,184 +10,44 @@ q_torque_max = np.array([90., 90., 90., 90., 12., 12., 12.])
|
|||||||
#
|
#
|
||||||
desired_rod_quat = np.array([0.0, 1.0, 0.0, 0.0])
|
desired_rod_quat = np.array([0.0, 1.0, 0.0, 0.0])
|
||||||
|
|
||||||
|
def skew(x):
|
||||||
|
"""
|
||||||
|
Returns the skew-symmetric matrix of x
|
||||||
|
param x: 3x1 vector
|
||||||
|
"""
|
||||||
|
return np.array([[0, -x[2], x[1]], [x[2], 0, -x[0]], [-x[1], x[0], 0]])
|
||||||
|
|
||||||
def get_quaternion_error(curr_quat, des_quat):
|
def get_quaternion_error(curr_quat, des_quat):
|
||||||
"""
|
"""
|
||||||
Calculates the difference between the current quaternion and the desired quaternion.
|
Calculates the difference between the current quaternion and the desired quaternion.
|
||||||
See Siciliano textbook page 140 Eq 3.91
|
See Siciliano textbook page 140 Eq 3.91
|
||||||
|
|
||||||
:param curr_quat: current quaternion
|
param curr_quat: current quaternion
|
||||||
:param des_quat: desired quaternion
|
param des_quat: desired quaternion
|
||||||
:return: difference between current quaternion and desired quaternion
|
return: difference between current quaternion and desired quaternion
|
||||||
"""
|
"""
|
||||||
quatError = np.zeros((3, ))
|
return curr_quat[0] * des_quat[1:] - des_quat[0] * curr_quat[1:] - skew(des_quat[1:]) @ curr_quat[1:]
|
||||||
|
|
||||||
quatError[0] = (curr_quat[0] * des_quat[1] - des_quat[0] * curr_quat[1] -
|
|
||||||
curr_quat[3] * des_quat[2] + curr_quat[2] * des_quat[3])
|
|
||||||
|
|
||||||
quatError[1] = (curr_quat[0] * des_quat[2] - des_quat[0] * curr_quat[2] +
|
|
||||||
curr_quat[3] * des_quat[1] - curr_quat[1] * des_quat[3])
|
|
||||||
|
|
||||||
quatError[2] = (curr_quat[0] * des_quat[3] - des_quat[0] * curr_quat[3] -
|
|
||||||
curr_quat[2] * des_quat[1] + curr_quat[1] * des_quat[2])
|
|
||||||
|
|
||||||
return quatError
|
|
||||||
|
|
||||||
|
|
||||||
def rotation_distance(p: np.array, q: np.array):
|
def rotation_distance(p: np.array, q: np.array):
|
||||||
"""
|
"""
|
||||||
p: quaternion
|
Calculates the rotation angular between two quaternions
|
||||||
q: quaternion
|
param p: quaternion
|
||||||
|
param q: quaternion
|
||||||
theta: rotation angle between p and q (rad)
|
theta: rotation angle between p and q (rad)
|
||||||
"""
|
"""
|
||||||
assert p.shape == q.shape, "p and q should be quaternion"
|
assert p.shape == q.shape, "p and q should be quaternion"
|
||||||
product = p[0] * q[0] + p[1] * q[1] + p[2] * q[2] + p[3] * q[3]
|
theta = 2 * np.arccos(abs(p @ q))
|
||||||
theta = 2 * np.arccos(abs(product))
|
|
||||||
return theta
|
return theta
|
||||||
|
|
||||||
|
|
||||||
def rot_to_quat(theta, axis):
|
def rot_to_quat(theta, axis):
|
||||||
|
"""
|
||||||
|
Converts rotation angle along an axis to quaternion
|
||||||
|
param theta: rotation angle (rad)
|
||||||
|
param axis: rotation axis
|
||||||
|
return: quaternion
|
||||||
|
"""
|
||||||
quant = np.zeros(4)
|
quant = np.zeros(4)
|
||||||
quant[0] = np.sin(theta / 2.)
|
quant[0] = np.sin(theta / 2.)
|
||||||
quant[1] = np.cos(theta / 2.) * axis[0]
|
quant[1:] = np.cos(theta / 2.) * axis
|
||||||
quant[2] = np.cos(theta / 2.) * axis[1]
|
|
||||||
quant[3] = np.cos(theta / 2.) * axis[2]
|
|
||||||
return quant
|
return quant
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class RewardBase():
|
|
||||||
def __init__(self, q_max, q_min, q_dot_max):
|
|
||||||
self._reward = 0.
|
|
||||||
self._done = False
|
|
||||||
self._q_max = q_max
|
|
||||||
self._q_min = q_min
|
|
||||||
self._q_dot_max = q_dot_max
|
|
||||||
|
|
||||||
def get_reward(self, episodic_end, box_pos, box_quat, target_pos, target_quat,
|
|
||||||
rod_tip_pos, rod_quat, qpos, qvel, action):
|
|
||||||
raise NotImplementedError
|
|
||||||
|
|
||||||
def _joint_limit_violate_penalty(self,
|
|
||||||
qpos,
|
|
||||||
qvel,
|
|
||||||
enable_pos_limit=False,
|
|
||||||
enable_vel_limit=False):
|
|
||||||
penalty = 0.
|
|
||||||
p_coeff = 1.
|
|
||||||
v_coeff = 1.
|
|
||||||
# q_limit
|
|
||||||
if enable_pos_limit:
|
|
||||||
higher_indice = np.where(qpos > self._q_max)
|
|
||||||
lower_indice = np.where(qpos < self._q_min)
|
|
||||||
higher_error = qpos - self._q_max
|
|
||||||
lower_error = self._q_min - qpos
|
|
||||||
penalty -= p_coeff * (abs(np.sum(higher_error[higher_indice])) +
|
|
||||||
abs(np.sum(lower_error[lower_indice])))
|
|
||||||
# q_dot_limit
|
|
||||||
if enable_vel_limit:
|
|
||||||
q_dot_error = abs(qvel) - abs(self._q_dot_max)
|
|
||||||
q_dot_violate_idx = np.where(q_dot_error > 0.)
|
|
||||||
penalty -= v_coeff * abs(np.sum(q_dot_error[q_dot_violate_idx]))
|
|
||||||
return penalty
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class DenseReward(RewardBase):
|
|
||||||
def __init__(self, q_max, q_min, q_dot_max):
|
|
||||||
super(DenseReward, self).__init__(q_max, q_min, q_dot_max)
|
|
||||||
|
|
||||||
def get_reward(self, episodic_end, box_pos, box_quat, target_pos, target_quat,
|
|
||||||
rod_tip_pos, rod_quat, qpos, qvel, action):
|
|
||||||
joint_penalty = self._joint_limit_violate_penalty(qpos,
|
|
||||||
qvel,
|
|
||||||
enable_pos_limit=True,
|
|
||||||
enable_vel_limit=True)
|
|
||||||
tcp_box_dist_reward = -2 * np.clip(np.linalg.norm(box_pos - rod_tip_pos), 0.05, 100)
|
|
||||||
box_goal_pos_dist_reward = -3.5 * np.linalg.norm(box_pos - target_pos)
|
|
||||||
box_goal_rot_dist_reward = -rotation_distance(box_quat, target_quat) / np.pi
|
|
||||||
energy_cost = -0.0005 * np.sum(np.square(action))
|
|
||||||
|
|
||||||
reward = joint_penalty + tcp_box_dist_reward + \
|
|
||||||
box_goal_pos_dist_reward + box_goal_rot_dist_reward + energy_cost
|
|
||||||
|
|
||||||
rod_inclined_angle = rotation_distance(rod_quat, desired_rod_quat)
|
|
||||||
if rod_inclined_angle > np.pi / 4:
|
|
||||||
reward -= rod_inclined_angle / (np.pi)
|
|
||||||
|
|
||||||
return reward
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class TemporalSparseReward(RewardBase):
|
|
||||||
def __init__(self, q_max, q_min, q_dot_max):
|
|
||||||
super(TemporalSparseReward, self).__init__(q_max, q_min, q_dot_max)
|
|
||||||
|
|
||||||
def get_reward(self, episodic_end, box_pos, box_quat, target_pos, target_quat,
|
|
||||||
rod_tip_pos, rod_quat, qpos, qvel, action):
|
|
||||||
reward = 0.
|
|
||||||
joint_penalty = self._joint_limit_violate_penalty(qpos, qvel, enable_pos_limit=True, enable_vel_limit=True)
|
|
||||||
energy_cost = -0.0005 * np.sum(np.square(action))
|
|
||||||
tcp_box_dist_reward = -2 * np.clip(np.linalg.norm(box_pos - rod_tip_pos), 0.05, 100)
|
|
||||||
reward += joint_penalty + tcp_box_dist_reward + energy_cost
|
|
||||||
rod_inclined_angle = rotation_distance(rod_quat, desired_rod_quat)
|
|
||||||
|
|
||||||
if rod_inclined_angle > np.pi / 4:
|
|
||||||
reward -= rod_inclined_angle / (np.pi)
|
|
||||||
|
|
||||||
if not episodic_end:
|
|
||||||
return reward
|
|
||||||
|
|
||||||
box_goal_dist = np.linalg.norm(box_pos - target_pos)
|
|
||||||
|
|
||||||
box_goal_pos_dist_reward = -3.5 * box_goal_dist * 100
|
|
||||||
box_goal_rot_dist_reward = -rotation_distance(box_quat, target_quat) / np.pi * 100
|
|
||||||
|
|
||||||
reward += box_goal_pos_dist_reward + box_goal_rot_dist_reward
|
|
||||||
|
|
||||||
return reward
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class TemporalSpatialSparseReward(RewardBase):
|
|
||||||
def __init__(self, q_max, q_min, q_dot_max):
|
|
||||||
super(TemporalSpatialSparseReward, self).__init__(q_max, q_min, q_dot_max)
|
|
||||||
|
|
||||||
def get_reward(self, episodic_end, box_pos, box_quat, target_pos, target_quat,
|
|
||||||
rod_tip_pos, rod_quat, qpos, qvel, action):
|
|
||||||
reward = 0.
|
|
||||||
joint_penalty = self._joint_limit_violate_penalty(qpos, qvel, enable_pos_limit=True, enable_vel_limit=True)
|
|
||||||
energy_cost = -0.0005 * np.sum(np.square(action))
|
|
||||||
tcp_box_dist_reward = -2 * np.clip(np.linalg.norm(box_pos - rod_tip_pos), 0.05, 100)
|
|
||||||
reward += joint_penalty + tcp_box_dist_reward + energy_cost
|
|
||||||
rod_inclined_angle = rotation_distance(rod_quat, desired_rod_quat)
|
|
||||||
|
|
||||||
if rod_inclined_angle > np.pi / 4:
|
|
||||||
reward -= rod_inclined_angle / (np.pi)
|
|
||||||
|
|
||||||
if not episodic_end:
|
|
||||||
return reward
|
|
||||||
|
|
||||||
box_goal_dist = np.linalg.norm(box_pos - target_pos)
|
|
||||||
|
|
||||||
if box_goal_dist < 0.1:
|
|
||||||
reward += 300
|
|
||||||
box_goal_pos_dist_reward = np.clip(- 3.5 * box_goal_dist * 100 * 3, -100, 0)
|
|
||||||
box_goal_rot_dist_reward = np.clip(- rotation_distance(box_quat, target_quat)/np.pi * 100 * 1.5, -100, 0)
|
|
||||||
reward += box_goal_pos_dist_reward + box_goal_rot_dist_reward
|
|
||||||
|
|
||||||
return reward
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def BoxPushingReward(reward_type, q_max, q_min, q_dot_max):
|
|
||||||
if reward_type == 'Dense':
|
|
||||||
return DenseReward(q_max, q_min, q_dot_max)
|
|
||||||
elif reward_type == 'TemporalSparse':
|
|
||||||
return TemporalSparseReward(q_max, q_min, q_dot_max)
|
|
||||||
elif reward_type == 'TemporalSpatialSparse':
|
|
||||||
return TemporalSpatialSparseReward(q_max, q_min, q_dot_max)
|
|
||||||
else:
|
|
||||||
raise NotImplementedError
|
|
@ -157,17 +157,17 @@ def example_fully_custom_mp(seed=1, iterations=1, render=True):
|
|||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
render = True
|
render = True
|
||||||
# DMP
|
# DMP
|
||||||
# example_mp("HoleReacherDMP-v0", seed=10, iterations=5, render=render)
|
example_mp("HoleReacherDMP-v0", seed=10, iterations=5, render=render)
|
||||||
|
|
||||||
# ProMP
|
# ProMP
|
||||||
# example_mp("HoleReacherProMP-v0", seed=10, iterations=5, render=render)
|
example_mp("HoleReacherProMP-v0", seed=10, iterations=5, render=render)
|
||||||
# example_mp("BoxPushingDenseProMP-v0", seed=10, iterations=50, render=render)
|
example_mp("BoxPushingTemporalSparseProMP-v0", seed=10, iterations=1, render=render)
|
||||||
|
|
||||||
# ProDMP
|
# ProDMP
|
||||||
example_mp("BoxPushingDenseProDMP-v0", seed=10, iterations=50, render=render)
|
example_mp("BoxPushingDenseProDMP-v0", seed=10, iterations=1, render=render)
|
||||||
|
|
||||||
# Altered basis functions
|
# Altered basis functions
|
||||||
# obs1 = example_custom_mp("Reacher5dProMP-v0", seed=10, iterations=1, render=render)
|
obs1 = example_custom_mp("Reacher5dProMP-v0", seed=10, iterations=1, render=render)
|
||||||
|
|
||||||
# Custom MP
|
# Custom MP
|
||||||
# example_fully_custom_mp(seed=10, iterations=1, render=render)
|
example_fully_custom_mp(seed=10, iterations=1, render=render)
|
||||||
|
Loading…
Reference in New Issue
Block a user