diff --git a/alr_envs/alr/__init__.py b/alr_envs/alr/__init__.py
index 1b3f118..91881f0 100644
--- a/alr_envs/alr/__init__.py
+++ b/alr_envs/alr/__init__.py
@@ -253,6 +253,15 @@ register(
         "context": True
     }
 )
+
+register(
+    id='ALRHopperJump-v2',
+    entry_point='alr_envs.alr.mujoco:ALRHopperJumpRndmPosEnv',
+    max_episode_steps=MAX_EPISODE_STEPS_HOPPERJUMP,
+    kwargs={
+        "max_episode_steps": MAX_EPISODE_STEPS_HOPPERJUMP
+    }
+)
 # CtxtFree are v0, Contextual are v1
 register(
     id='ALRHopperJumpOnBox-v0',
@@ -667,6 +676,31 @@ for _v in _versions:
     )
     ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append(_env_id)
 
+## HopperJump
+register(
+    id="ALRHopperJumpProMP-v2",
+    entry_point='alr_envs.utils.make_env_helpers:make_promp_env_helper',
+    kwargs={
+        "name": "alr_envs:ALRHopperJump-v2",
+        "wrappers": [mujoco.hopper_jump.HighCtxtMPWrapper],
+        "mp_kwargs": {
+            "num_dof": 3,
+            "num_basis": 5,
+            "duration": 2,
+            "post_traj_time": 0,
+            "policy_type": "motor",
+            "weights_scale": 1.0,
+            "zero_start": True,
+            "zero_goal": False,
+            "policy_kwargs": {
+                "p_gains": np.ones(3),
+                "d_gains": 0.1 * np.ones(3)
+            }
+        }
+    }
+)
+ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append("ALRHopperJumpProMP-v2")
+
 ## HopperJumpOnBox
 _versions = ["v0", "v1"]
 for _v in _versions:
diff --git a/alr_envs/alr/mujoco/__init__.py b/alr_envs/alr/mujoco/__init__.py
index 8935db4..df71924 100644
--- a/alr_envs/alr/mujoco/__init__.py
+++ b/alr_envs/alr/mujoco/__init__.py
@@ -6,7 +6,7 @@ from .table_tennis.tt_gym import TTEnvGym
 from .beerpong.beerpong import ALRBeerBongEnv
 from .ant_jump.ant_jump import ALRAntJumpEnv
 from .half_cheetah_jump.half_cheetah_jump import ALRHalfCheetahJumpEnv
-from .hopper_jump.hopper_jump import ALRHopperJumpEnv
+from .hopper_jump.hopper_jump import ALRHopperJumpEnv, ALRHopperJumpRndmPosEnv
 from .hopper_jump.hopper_jump_on_box import ALRHopperJumpOnBoxEnv
 from .hopper_throw.hopper_throw import ALRHopperThrowEnv
 from .hopper_throw.hopper_throw_in_basket import ALRHopperThrowInBasketEnv
diff --git a/alr_envs/alr/mujoco/hopper_jump/__init__.py b/alr_envs/alr/mujoco/hopper_jump/__init__.py
index c5e6d2f..c6db9c2 100644
--- a/alr_envs/alr/mujoco/hopper_jump/__init__.py
+++ b/alr_envs/alr/mujoco/hopper_jump/__init__.py
@@ -1 +1 @@
-from .mp_wrapper import MPWrapper
+from .mp_wrapper import MPWrapper, HighCtxtMPWrapper
diff --git a/alr_envs/alr/mujoco/hopper_jump/hopper_jump.py b/alr_envs/alr/mujoco/hopper_jump/hopper_jump.py
index 7ce8196..4047d04 100644
--- a/alr_envs/alr/mujoco/hopper_jump/hopper_jump.py
+++ b/alr_envs/alr/mujoco/hopper_jump/hopper_jump.py
@@ -91,6 +91,54 @@ class ALRHopperJumpEnv(HopperEnv):
         observation = self._get_obs()
         return observation
+
+class ALRHopperJumpRndmPosEnv(ALRHopperJumpEnv):
+    def __init__(self, max_episode_steps=250):
+        super(ALRHopperJumpRndmPosEnv, self).__init__(exclude_current_positions_from_observation=False,
+                                                      reset_noise_scale=5e-1,
+                                                      max_episode_steps=max_episode_steps)
+    def reset_model(self):
+        noise_low = -self._reset_noise_scale
+        noise_high = self._reset_noise_scale
+
+        qpos = self.init_qpos + self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nq)
+        qvel = self.init_qvel  # + self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nv)
+
+        self.set_state(qpos, qvel)
+
+        observation = self._get_obs()
+        return observation
+
+    def step(self, action):
+
+        self.current_step += 1
+        self.do_simulation(action, self.frame_skip)
+        height_after = self.get_body_com("torso")[2]
+        self.max_height = max(height_after, self.max_height)
+
+        ctrl_cost = self.control_cost(action)
+        costs = ctrl_cost
+        done = False
+
+        if self.current_step >= self.max_episode_steps:
+            healthy_reward = 0
+            height_reward = self._forward_reward_weight * self.max_height  # TODO: maybe move the reward calculation into the branches and define separate _forward_reward_weight values for the contextual and episodic settings
+            rewards = height_reward + healthy_reward
+
+        else:
+            # Penalty for a wrong start direction of the first two joints; not needed, could be removed.
+            rewards = ((action[:2] > 0) * self.penalty).sum() if self.current_step < 10 else 0
+
+        observation = self._get_obs()
+        reward = rewards - costs
+        info = {
+            'height': height_after,
+            'max_height': self.max_height,
+            'goal': self.goal
+        }
+
+        return observation, reward, done, info
+
 
 if __name__ == '__main__':
     render_mode = "human"  # "human" or "partial" or "final"
     env = ALRHopperJumpEnv()
diff --git a/alr_envs/alr/mujoco/hopper_jump/mp_wrapper.py b/alr_envs/alr/mujoco/hopper_jump/mp_wrapper.py
index 0d6768c..36b7158 100644
--- a/alr_envs/alr/mujoco/hopper_jump/mp_wrapper.py
+++ b/alr_envs/alr/mujoco/hopper_jump/mp_wrapper.py
@@ -29,3 +29,29 @@ class MPWrapper(MPEnvWrapper):
     @property
     def dt(self) -> Union[float, int]:
         return self.env.dt
+
+
+class HighCtxtMPWrapper(MPWrapper):
+    @property
+    def active_obs(self):
+        return np.hstack([
+            [True] * (5 + int(not self.exclude_current_positions_from_observation)),  # position
+            [False] * 6,  # velocity
+            [False]
+        ])
+
+    @property
+    def current_pos(self) -> Union[float, int, np.ndarray]:
+        return self.env.sim.data.qpos[3:6].copy()
+
+    @property
+    def current_vel(self) -> Union[float, int, np.ndarray, Tuple]:
+        return self.env.sim.data.qvel[3:6].copy()
+
+    @property
+    def goal_pos(self) -> Union[float, int, np.ndarray, Tuple]:
+        raise ValueError("Goal position is not available and has to be learnt based on the environment.")
+
+    @property
+    def dt(self) -> Union[float, int]:
+        return self.env.dt
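
For reference, below is a minimal usage sketch of the two environment IDs this patch registers; it is not part of the diff. It assumes that the register calls above go into gym's standard registry when alr_envs is imported, and that alr_envs exposes a make(env_id, seed) helper as used in the project's examples; for the plain step-based ID, gym.make("ALRHopperJump-v2") should work as well once alr_envs has been imported.

# Minimal smoke test for the newly registered IDs (sketch, not part of the patch).
import alr_envs

# Step-based environment with the randomized start position.
env = alr_envs.make("ALRHopperJump-v2", seed=1)
obs = env.reset()
for _ in range(10):
    obs, reward, done, info = env.step(env.action_space.sample())

# ProMP variant: a single step executes one full motion-primitive trajectory
# parameterized by the sampled weight vector.
promp_env = alr_envs.make("ALRHopperJumpProMP-v2", seed=1)
obs = promp_env.reset()
obs, reward, done, info = promp_env.step(promp_env.action_space.sample())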