reacher adjustments

This commit is contained in:
Fabian 2022-05-05 16:48:59 +02:00
parent d313795cec
commit 1881c14a48
2 changed files with 9 additions and 4 deletions

View File

@@ -97,6 +97,7 @@ register(
"hole_depth": 1, "hole_depth": 1,
"hole_x": None, "hole_x": None,
"collision_penalty": 100, "collision_penalty": 100,
"rew_fct": "unbounded"
} }
) )
@@ -354,7 +355,7 @@ for _v in _versions:
"wrappers": [classic_control.hole_reacher.MPWrapper], "wrappers": [classic_control.hole_reacher.MPWrapper],
"mp_kwargs": { "mp_kwargs": {
"num_dof": 5, "num_dof": 5,
"num_basis": 5, "num_basis": 3,
"duration": 2, "duration": 2,
"policy_type": "velocity", "policy_type": "velocity",
"weights_scale": 5, "weights_scale": 5,
@@ -402,7 +403,7 @@ for _v in _versions:
"wrappers": [mujoco.reacher.MPWrapper], "wrappers": [mujoco.reacher.MPWrapper],
"mp_kwargs": { "mp_kwargs": {
"num_dof": 5 if "long" not in _v.lower() else 7, "num_dof": 5 if "long" not in _v.lower() else 7,
"num_basis": 1, "num_basis": 2,
"duration": 4, "duration": 4,
"policy_type": "motor", "policy_type": "motor",
"weights_scale": 5, "weights_scale": 5,

View File

@@ -39,14 +39,18 @@ class ALRReacherEnv(MujocoEnv, utils.EzPickle):
reward_dist = 0.0 reward_dist = 0.0
angular_vel = 0.0 angular_vel = 0.0
reward_balance = 0.0 reward_balance = 0.0
is_delayed = self.steps_before_reward > 0
reward_ctrl = - np.square(a).sum()
if self._steps >= self.steps_before_reward: if self._steps >= self.steps_before_reward:
vec = self.get_body_com("fingertip") - self.get_body_com("target") vec = self.get_body_com("fingertip") - self.get_body_com("target")
reward_dist -= self.reward_weight * np.linalg.norm(vec) reward_dist -= self.reward_weight * np.linalg.norm(vec)
if self.steps_before_reward > 0: if is_delayed:
# avoid giving this penalty for normal step based case # avoid giving this penalty for normal step based case
# angular_vel -= 10 * np.linalg.norm(self.sim.data.qvel.flat[:self.n_links]) # angular_vel -= 10 * np.linalg.norm(self.sim.data.qvel.flat[:self.n_links])
angular_vel -= 10 * np.square(self.sim.data.qvel.flat[:self.n_links]).sum() angular_vel -= 10 * np.square(self.sim.data.qvel.flat[:self.n_links]).sum()
reward_ctrl = - 10 * np.square(a).sum() if is_delayed:
# Higher control penalty for sparse reward per timestep
reward_ctrl *= 10
if self.balance: if self.balance:
reward_balance -= self.balance_weight * np.abs( reward_balance -= self.balance_weight * np.abs(