diff --git a/alr_envs/alr/__init__.py b/alr_envs/alr/__init__.py index 8a7140d..2e23d44 100644 --- a/alr_envs/alr/__init__.py +++ b/alr_envs/alr/__init__.py @@ -97,6 +97,7 @@ register( "hole_depth": 1, "hole_x": None, "collision_penalty": 100, + "rew_fct": "unbounded" } ) @@ -354,7 +355,7 @@ for _v in _versions: "wrappers": [classic_control.hole_reacher.MPWrapper], "mp_kwargs": { "num_dof": 5, - "num_basis": 5, + "num_basis": 3, "duration": 2, "policy_type": "velocity", "weights_scale": 5, @@ -402,7 +403,7 @@ for _v in _versions: "wrappers": [mujoco.reacher.MPWrapper], "mp_kwargs": { "num_dof": 5 if "long" not in _v.lower() else 7, - "num_basis": 1, + "num_basis": 2, "duration": 4, "policy_type": "motor", "weights_scale": 5, diff --git a/alr_envs/alr/mujoco/reacher/alr_reacher.py b/alr_envs/alr/mujoco/reacher/alr_reacher.py index c2b5f18..b436fdd 100644 --- a/alr_envs/alr/mujoco/reacher/alr_reacher.py +++ b/alr_envs/alr/mujoco/reacher/alr_reacher.py @@ -39,14 +39,18 @@ class ALRReacherEnv(MujocoEnv, utils.EzPickle): reward_dist = 0.0 angular_vel = 0.0 reward_balance = 0.0 + is_delayed = self.steps_before_reward > 0 + reward_ctrl = - np.square(a).sum() if self._steps >= self.steps_before_reward: vec = self.get_body_com("fingertip") - self.get_body_com("target") reward_dist -= self.reward_weight * np.linalg.norm(vec) - if self.steps_before_reward > 0: + if is_delayed: # avoid giving this penalty for normal step based case # angular_vel -= 10 * np.linalg.norm(self.sim.data.qvel.flat[:self.n_links]) angular_vel -= 10 * np.square(self.sim.data.qvel.flat[:self.n_links]).sum() - reward_ctrl = - 10 * np.square(a).sum() + if is_delayed: + # Higher control penalty for sparse reward per timestep + reward_ctrl *= 10 if self.balance: reward_balance -= self.balance_weight * np.abs(