diff --git a/alr_envs/alr/__init__.py b/alr_envs/alr/__init__.py index 38abbf1..e435fa6 100644 --- a/alr_envs/alr/__init__.py +++ b/alr_envs/alr/__init__.py @@ -11,6 +11,13 @@ from .mujoco.reacher.alr_reacher import ALRReacherEnv from .mujoco.reacher.balancing import BalancingEnv from alr_envs.alr.mujoco.table_tennis.tt_gym import MAX_EPISODE_STEPS +from .mujoco.ant_jump.ant_jump import MAX_EPISODE_STEPS_ANTJUMP +from .mujoco.half_cheetah_jump.half_cheetah_jump import MAX_EPISODE_STEPS_HALFCHEETAHJUMP +from .mujoco.hopper_jump.hopper_jump import MAX_EPISODE_STEPS_HOPPERJUMP +from .mujoco.hopper_jump.hopper_jump_on_box import MAX_EPISODE_STEPS_HOPPERJUMPONBOX +from .mujoco.hopper_throw.hopper_throw import MAX_EPISODE_STEPS_HOPPERTHROW +from .mujoco.hopper_throw.hopper_throw_in_basket import MAX_EPISODE_STEPS_HOPPERTHROWINBASKET +from .mujoco.walker_2d_jump.walker_2d_jump import MAX_EPISODE_STEPS_WALKERJUMP ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS = {"DMP": [], "ProMP": []} @@ -185,6 +192,69 @@ register( } ) + +register( + id='ALRAntJump-v0', + entry_point='alr_envs.alr.mujoco:ALRAntJumpEnv', + max_episode_steps=MAX_EPISODE_STEPS_ANTJUMP, + kwargs={ + "max_episode_steps": MAX_EPISODE_STEPS_ANTJUMP + } +) + +register( + id='ALRHalfCheetahJump-v0', + entry_point='alr_envs.alr.mujoco:ALRHalfCheetahJumpEnv', + max_episode_steps=MAX_EPISODE_STEPS_HALFCHEETAHJUMP, + kwargs={ + "max_episode_steps": MAX_EPISODE_STEPS_HALFCHEETAHJUMP + } +) + +register( + id='ALRHopperJump-v0', + entry_point='alr_envs.alr.mujoco:ALRHopperJumpEnv', + max_episode_steps=MAX_EPISODE_STEPS_HOPPERJUMP, + kwargs={ + "max_episode_steps": MAX_EPISODE_STEPS_HOPPERJUMP + } +) + +register( + id='ALRHopperJumpOnBox-v0', + entry_point='alr_envs.alr.mujoco:ALRHopperJumpOnBoxEnv', + max_episode_steps=MAX_EPISODE_STEPS_HOPPERJUMPONBOX, + kwargs={ + "max_episode_steps": MAX_EPISODE_STEPS_HOPPERJUMPONBOX + } +) + +register( + id='ALRHopperThrow-v0', + entry_point='alr_envs.alr.mujoco:ALRHopperThrowEnv', + max_episode_steps=MAX_EPISODE_STEPS_HOPPERTHROW, + kwargs={ + "max_episode_steps": MAX_EPISODE_STEPS_HOPPERTHROW + } +) + +register( + id='ALRHopperThrowInBasket-v0', + entry_point='alr_envs.alr.mujoco:ALRHopperThrowInBasketEnv', + max_episode_steps=MAX_EPISODE_STEPS_HOPPERTHROWINBASKET, + kwargs={ + "max_episode_steps": MAX_EPISODE_STEPS_HOPPERTHROWINBASKET + } +) + +register( + id='ALRWalker2DJump-v0', + entry_point='alr_envs.alr.mujoco:ALRWalker2dJumpEnv', + max_episode_steps=MAX_EPISODE_STEPS_WALKERJUMP, + kwargs={ + "max_episode_steps": MAX_EPISODE_STEPS_WALKERJUMP + } +) ## Balancing Reacher register( @@ -430,4 +500,205 @@ for _v, cd in enumerate(ctxt_dim): } } ) - ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append(_env_id) \ No newline at end of file + ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append(_env_id) + + +## AntJump +for _v, cd in enumerate(ctxt_dim): + _env_id = f'TableTennisProMP-v{_v}' + register( + id=_env_id, + entry_point='alr_envs.utils.make_env_helpers:make_promp_env_helper', + kwargs={ + "name": "alr_envs:TableTennis{}DCtxt-v0".format(cd), + "wrappers": [mujoco.table_tennis.MPWrapper], + "mp_kwargs": { + "num_dof": 7, + "num_basis": 2, + "duration": 1.25, + "post_traj_time": 1.5, + "policy_type": "motor", + "weights_scale": 1.0, + "zero_start": True, + "zero_goal": False, + "policy_kwargs": { + "p_gains": 0.5*np.array([1.0, 4.0, 2.0, 4.0, 1.0, 4.0, 1.0]), + "d_gains": 0.5*np.array([0.1, 0.4, 0.2, 0.4, 0.1, 0.4, 0.1]) + } + } + } + ) + ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append(_env_id) + + +register( + id='ALRAntJumpProMP-v0', + entry_point='alr_envs.utils.make_env_helpers:make_promp_env_helper', + kwargs={ + "name": "alr_envs:ALRAntJump-v0", + "wrappers": [mujoco.ant_jump.MPWrapper], + "mp_kwargs": { + "num_dof": 8, + "num_basis": 5, + "duration": 10, + "post_traj_time": 0, + "policy_type": "motor", + "weights_scale": 1.0, + "zero_start": True, + "zero_goal": False, + "policy_kwargs": { + "p_gains": np.ones(8), + "d_gains": 0.1*np.ones(8) + } + } + } +) +ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append('ALRAntJumpProMP-v0') + +register( + id='ALRHalfCheetahJumpProMP-v0', + entry_point='alr_envs.utils.make_env_helpers:make_promp_env_helper', + kwargs={ + "name": "alr_envs:ALRHalfCheetahJump-v0", + "wrappers": [mujoco.half_cheetah_jump.MPWrapper], + "mp_kwargs": { + "num_dof": 6, + "num_basis": 5, + "duration": 5, + "post_traj_time": 0, + "policy_type": "motor", + "weights_scale": 1.0, + "zero_start": True, + "zero_goal": False, + "policy_kwargs": { + "p_gains": np.ones(6), + "d_gains": 0.1*np.ones(6) + } + } + } +) +ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append('ALRHalfCheetahJumpProMP-v0') + + +register( + id='ALRHopperJumpProMP-v0', + entry_point='alr_envs.utils.make_env_helpers:make_promp_env_helper', + kwargs={ + "name": "alr_envs:ALRHopperJump-v0", + "wrappers": [mujoco.hopper_jump.MPWrapper], + "mp_kwargs": { + "num_dof": 3, + "num_basis": 5, + "duration": 2, + "post_traj_time": 0, + "policy_type": "motor", + "weights_scale": 1.0, + "zero_start": True, + "zero_goal": False, + "policy_kwargs": { + "p_gains": np.ones(3), + "d_gains": 0.1*np.ones(3) + } + } + } +) +ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append('ALRHopperJumpProMP-v0') + +register( + id='ALRHopperJumpOnBoxProMP-v0', + entry_point='alr_envs.utils.make_env_helpers:make_promp_env_helper', + kwargs={ + "name": "alr_envs:ALRHopperJumpOnBox-v0", + "wrappers": [mujoco.hopper_jump.MPWrapper], + "mp_kwargs": { + "num_dof": 3, + "num_basis": 5, + "duration": 2, + "post_traj_time": 0, + "policy_type": "motor", + "weights_scale": 1.0, + "zero_start": True, + "zero_goal": False, + "policy_kwargs": { + "p_gains": np.ones(3), + "d_gains": 0.1*np.ones(3) + } + } + } +) +ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append('ALRHopperJumpOnBoxProMP-v0') + + +register( + id='ALRHopperThrowProMP-v0', + entry_point='alr_envs.utils.make_env_helpers:make_promp_env_helper', + kwargs={ + "name": "alr_envs:ALRHopperThrow-v0", + "wrappers": [mujoco.hopper_throw.MPWrapper], + "mp_kwargs": { + "num_dof": 3, + "num_basis": 5, + "duration": 2, + "post_traj_time": 0, + "policy_type": "motor", + "weights_scale": 1.0, + "zero_start": True, + "zero_goal": False, + "policy_kwargs": { + "p_gains": np.ones(3), + "d_gains": 0.1*np.ones(3) + } + } + } +) +ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append('ALRHopperThrowProMP-v0') + + +register( + id='ALRHopperThrowInBasketProMP-v0', + entry_point='alr_envs.utils.make_env_helpers:make_promp_env_helper', + kwargs={ + "name": "alr_envs:ALRHopperThrowInBasket-v0", + "wrappers": [mujoco.hopper_throw.MPWrapper], + "mp_kwargs": { + "num_dof": 3, + "num_basis": 5, + "duration": 2, + "post_traj_time": 0, + "policy_type": "motor", + "weights_scale": 1.0, + "zero_start": True, + "zero_goal": False, + "policy_kwargs": { + "p_gains": np.ones(3), + "d_gains": 0.1*np.ones(3) + } + } + } +) +ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append('ALRHopperThrowInBasketProMP-v0') + + +register( + id='ALRWalker2DJumpProMP-v0', + entry_point='alr_envs.utils.make_env_helpers:make_promp_env_helper', + kwargs={ + "name": "alr_envs:ALRWalker2DJump-v0", + "wrappers": [mujoco.walker_2d_jump.MPWrapper], + "mp_kwargs": { + "num_dof": 6, + "num_basis": 5, + "duration": 2.4, + "post_traj_time": 0, + "policy_type": "motor", + "weights_scale": 1.0, + "zero_start": True, + "zero_goal": False, + "policy_kwargs": { + "p_gains": np.ones(6), + "d_gains": 0.1*np.ones(6) + } + } + } +) +ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append('ALRWalker2DJumpProMP-v0') diff --git a/alr_envs/alr/mujoco/__init__.py b/alr_envs/alr/mujoco/__init__.py index cdb3cde..8935db4 100644 --- a/alr_envs/alr/mujoco/__init__.py +++ b/alr_envs/alr/mujoco/__init__.py @@ -3,4 +3,11 @@ from .reacher.balancing import BalancingEnv from .ball_in_a_cup.ball_in_a_cup import ALRBallInACupEnv from .ball_in_a_cup.biac_pd import ALRBallInACupPDEnv from .table_tennis.tt_gym import TTEnvGym -from .beerpong.beerpong import ALRBeerBongEnv \ No newline at end of file +from .beerpong.beerpong import ALRBeerBongEnv +from .ant_jump.ant_jump import ALRAntJumpEnv +from .half_cheetah_jump.half_cheetah_jump import ALRHalfCheetahJumpEnv +from .hopper_jump.hopper_jump import ALRHopperJumpEnv +from .hopper_jump.hopper_jump_on_box import ALRHopperJumpOnBoxEnv +from .hopper_throw.hopper_throw import ALRHopperThrowEnv +from .hopper_throw.hopper_throw_in_basket import ALRHopperThrowInBasketEnv +from .walker_2d_jump.walker_2d_jump import ALRWalker2dJumpEnv \ No newline at end of file diff --git a/alr_envs/alr/mujoco/ant_jump/__init__.py b/alr_envs/alr/mujoco/ant_jump/__init__.py new file mode 100644 index 0000000..c5e6d2f --- /dev/null +++ b/alr_envs/alr/mujoco/ant_jump/__init__.py @@ -0,0 +1 @@ +from .mp_wrapper import MPWrapper diff --git a/alr_envs/alr/mujoco/ant_jump/ant_jump.py b/alr_envs/alr/mujoco/ant_jump/ant_jump.py new file mode 100644 index 0000000..09c623d --- /dev/null +++ b/alr_envs/alr/mujoco/ant_jump/ant_jump.py @@ -0,0 +1,117 @@ +import numpy as np +from gym.envs.mujoco.ant_v3 import AntEnv + +MAX_EPISODE_STEPS_ANTJUMP = 200 + + +class ALRAntJumpEnv(AntEnv): + """ + Initialization changes to normal Ant: + - healthy_reward: 1.0 -> 0.01 -> 0.0 no healthy reward needed - Paul and Marc + - ctrl_cost_weight 0.5 -> 0.0 + - contact_cost_weight: 5e-4 -> 0.0 + - healthy_z_range: (0.2, 1.0) -> (0.3, float('inf')) !!!!! Does that make sense, limiting height? + """ + + def __init__(self, + xml_file='ant.xml', + ctrl_cost_weight=0.0, + contact_cost_weight=0.0, + healthy_reward=0.0, + terminate_when_unhealthy=True, + healthy_z_range=(0.3, float('inf')), + contact_force_range=(-1.0, 1.0), + reset_noise_scale=0.1, + context=True, # variable to decide if context is used or not + exclude_current_positions_from_observation=True, + max_episode_steps=200): + self.current_step = 0 + self.max_height = 0 + self.context = context + self.max_episode_steps = max_episode_steps + self.goal = 0 # goal when training with context + super().__init__(xml_file, ctrl_cost_weight, contact_cost_weight, healthy_reward, terminate_when_unhealthy, + healthy_z_range, contact_force_range, reset_noise_scale, + exclude_current_positions_from_observation) + + def step(self, action): + + self.current_step += 1 + self.do_simulation(action, self.frame_skip) + + height = self.get_body_com("torso")[2].copy() + + self.max_height = max(height, self.max_height) + + rewards = 0 + + ctrl_cost = self.control_cost(action) + contact_cost = self.contact_cost + + costs = ctrl_cost + contact_cost + + done = height < 0.3 # fall over -> is the 0.3 value from healthy_z_range? TODO change 0.3 to the value of healthy z angle + + if self.current_step == self.max_episode_steps or done: + if self.context: + # -10 for scaling the value of the distance between the max_height and the goal height; only used when context is enabled + # height_reward = -10 * (np.linalg.norm(self.max_height - self.goal)) + height_reward = -10*np.linalg.norm(self.max_height - self.goal) + # no healthy reward when using context, because we optimize a negative value + healthy_reward = 0 + else: + height_reward = self.max_height - 0.7 + healthy_reward = self.healthy_reward * self.current_step + + rewards = height_reward + healthy_reward + + obs = self._get_obs() + reward = rewards - costs + + info = { + 'height': height, + 'max_height': self.max_height, + 'goal': self.goal + } + + return obs, reward, done, info + + def _get_obs(self): + return np.append(super()._get_obs(), self.goal) + + def reset(self): + self.current_step = 0 + self.max_height = 0 + self.goal = np.random.uniform(1.0, 2.5, + 1) # goal heights from 1.0 to 2.5; can be increased, but didnt work well with CMORE + return super().reset() + + # reset_model had to be implemented in every env to make it deterministic + def reset_model(self): + noise_low = -self._reset_noise_scale + noise_high = self._reset_noise_scale + + qpos = self.init_qpos # + self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nq) + qvel = self.init_qvel # + self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nv) + + self.set_state(qpos, qvel) + + observation = self._get_obs() + return observation + +if __name__ == '__main__': + render_mode = "human" # "human" or "partial" or "final" + env = ALRAntJumpEnv() + obs = env.reset() + + for i in range(2000): + # objective.load_result("/tmp/cma") + # test with random actions + ac = env.action_space.sample() + obs, rew, d, info = env.step(ac) + if i % 10 == 0: + env.render(mode=render_mode) + if d: + env.reset() + + env.close() \ No newline at end of file diff --git a/alr_envs/alr/mujoco/ant_jump/assets/ant.xml b/alr_envs/alr/mujoco/ant_jump/assets/ant.xml new file mode 100644 index 0000000..ee4d679 --- /dev/null +++ b/alr_envs/alr/mujoco/ant_jump/assets/ant.xml @@ -0,0 +1,81 @@ + + + diff --git a/alr_envs/alr/mujoco/ant_jump/mp_wrapper.py b/alr_envs/alr/mujoco/ant_jump/mp_wrapper.py new file mode 100644 index 0000000..4967b64 --- /dev/null +++ b/alr_envs/alr/mujoco/ant_jump/mp_wrapper.py @@ -0,0 +1,31 @@ +from typing import Tuple, Union + +import numpy as np + +from mp_env_api import MPEnvWrapper + + +class MPWrapper(MPEnvWrapper): + + @property + def active_obs(self): + return np.hstack([ + [False] * 111, # ant has 111 dimensional observation space !! + [True] # goal height + ]) + + @property + def current_pos(self) -> Union[float, int, np.ndarray]: + return self.env.sim.data.qpos[7:15].copy() + + @property + def current_vel(self) -> Union[float, int, np.ndarray, Tuple]: + return self.env.sim.data.qvel[6:14].copy() + + @property + def goal_pos(self) -> Union[float, int, np.ndarray, Tuple]: + raise ValueError("Goal position is not available and has to be learnt based on the environment.") + + @property + def dt(self) -> Union[float, int]: + return self.env.dt diff --git a/alr_envs/alr/mujoco/half_cheetah_jump/__init__.py b/alr_envs/alr/mujoco/half_cheetah_jump/__init__.py new file mode 100644 index 0000000..c5e6d2f --- /dev/null +++ b/alr_envs/alr/mujoco/half_cheetah_jump/__init__.py @@ -0,0 +1 @@ +from .mp_wrapper import MPWrapper diff --git a/alr_envs/alr/mujoco/half_cheetah_jump/assets/cheetah.xml b/alr_envs/alr/mujoco/half_cheetah_jump/assets/cheetah.xml new file mode 100644 index 0000000..41daadf --- /dev/null +++ b/alr_envs/alr/mujoco/half_cheetah_jump/assets/cheetah.xml @@ -0,0 +1,62 @@ + + + + + + + + + diff --git a/alr_envs/alr/mujoco/half_cheetah_jump/half_cheetah_jump.py b/alr_envs/alr/mujoco/half_cheetah_jump/half_cheetah_jump.py new file mode 100644 index 0000000..4b53a5d --- /dev/null +++ b/alr_envs/alr/mujoco/half_cheetah_jump/half_cheetah_jump.py @@ -0,0 +1,100 @@ +import os +from gym.envs.mujoco.half_cheetah_v3 import HalfCheetahEnv +import numpy as np + +MAX_EPISODE_STEPS_HALFCHEETAHJUMP = 100 + + +class ALRHalfCheetahJumpEnv(HalfCheetahEnv): + """ + ctrl_cost_weight 0.1 -> 0.0 + """ + + def __init__(self, + xml_file='cheetah.xml', + forward_reward_weight=1.0, + ctrl_cost_weight=0.0, + reset_noise_scale=0.1, + context=True, + exclude_current_positions_from_observation=True, + max_episode_steps=100): + self.current_step = 0 + self.max_height = 0 + self.max_episode_steps = max_episode_steps + self.goal = 0 + self.context = context + xml_file = os.path.join(os.path.dirname(__file__), "assets", xml_file) + super().__init__(xml_file, forward_reward_weight, ctrl_cost_weight, reset_noise_scale, + exclude_current_positions_from_observation) + + def step(self, action): + + self.current_step += 1 + self.do_simulation(action, self.frame_skip) + + height_after = self.get_body_com("torso")[2] + self.max_height = max(height_after, self.max_height) + + ## Didnt use fell_over, because base env also has no done condition - Paul and Marc + # fell_over = abs(self.sim.data.qpos[2]) > 2.5 # how to figure out if the cheetah fell over? -> 2.5 oke? + # TODO: Should a fall over be checked herE? + done = False + + ctrl_cost = self.control_cost(action) + costs = ctrl_cost + + if self.current_step == self.max_episode_steps: + height_goal_distance = -10*np.linalg.norm(self.max_height - self.goal) + 1e-8 if self.context \ + else self.max_height + rewards = self._forward_reward_weight * height_goal_distance + else: + rewards = 0 + + observation = self._get_obs() + reward = rewards - costs + info = { + 'height': height_after, + 'max_height': self.max_height + } + + return observation, reward, done, info + + def _get_obs(self): + return np.append(super()._get_obs(), self.goal) + + def reset(self): + self.max_height = 0 + self.current_step = 0 + self.goal = np.random.uniform(1.1, 1.6, 1) # 1.1 1.6 + return super().reset() + + # overwrite reset_model to make it deterministic + def reset_model(self): + noise_low = -self._reset_noise_scale + noise_high = self._reset_noise_scale + + qpos = self.init_qpos # + self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nq) + qvel = self.init_qvel # + self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nv) + + self.set_state(qpos, qvel) + + observation = self._get_obs() + return observation + +if __name__ == '__main__': + render_mode = "human" # "human" or "partial" or "final" + env = ALRHalfCheetahJumpEnv() + obs = env.reset() + + for i in range(2000): + # objective.load_result("/tmp/cma") + # test with random actions + ac = env.action_space.sample() + obs, rew, d, info = env.step(ac) + if i % 10 == 0: + env.render(mode=render_mode) + if d: + print('After ', i, ' steps, done: ', d) + env.reset() + + env.close() \ No newline at end of file diff --git a/alr_envs/alr/mujoco/half_cheetah_jump/mp_wrapper.py b/alr_envs/alr/mujoco/half_cheetah_jump/mp_wrapper.py new file mode 100644 index 0000000..6179b07 --- /dev/null +++ b/alr_envs/alr/mujoco/half_cheetah_jump/mp_wrapper.py @@ -0,0 +1,30 @@ +from typing import Tuple, Union + +import numpy as np + +from mp_env_api import MPEnvWrapper + + +class MPWrapper(MPEnvWrapper): + @property + def active_obs(self): + return np.hstack([ + [False] * 17, + [True] # goal height + ]) + + @property + def current_pos(self) -> Union[float, int, np.ndarray]: + return self.env.sim.data.qpos[3:9].copy() + + @property + def current_vel(self) -> Union[float, int, np.ndarray, Tuple]: + return self.env.sim.data.qvel[3:9].copy() + + @property + def goal_pos(self) -> Union[float, int, np.ndarray, Tuple]: + raise ValueError("Goal position is not available and has to be learnt based on the environment.") + + @property + def dt(self) -> Union[float, int]: + return self.env.dt diff --git a/alr_envs/alr/mujoco/hopper_jump/__init__.py b/alr_envs/alr/mujoco/hopper_jump/__init__.py new file mode 100644 index 0000000..c5e6d2f --- /dev/null +++ b/alr_envs/alr/mujoco/hopper_jump/__init__.py @@ -0,0 +1 @@ +from .mp_wrapper import MPWrapper diff --git a/alr_envs/alr/mujoco/hopper_jump/assets/hopper.xml b/alr_envs/alr/mujoco/hopper_jump/assets/hopper.xml new file mode 100644 index 0000000..f18bc46 --- /dev/null +++ b/alr_envs/alr/mujoco/hopper_jump/assets/hopper.xml @@ -0,0 +1,48 @@ + + + + + + + + diff --git a/alr_envs/alr/mujoco/hopper_jump/assets/hopper_jump_on_box.xml b/alr_envs/alr/mujoco/hopper_jump/assets/hopper_jump_on_box.xml new file mode 100644 index 0000000..69d78ff --- /dev/null +++ b/alr_envs/alr/mujoco/hopper_jump/assets/hopper_jump_on_box.xml @@ -0,0 +1,51 @@ + + + + + + + + \ No newline at end of file diff --git a/alr_envs/alr/mujoco/hopper_jump/hopper_jump.py b/alr_envs/alr/mujoco/hopper_jump/hopper_jump.py new file mode 100644 index 0000000..7ce8196 --- /dev/null +++ b/alr_envs/alr/mujoco/hopper_jump/hopper_jump.py @@ -0,0 +1,110 @@ +from gym.envs.mujoco.hopper_v3 import HopperEnv +import numpy as np + +MAX_EPISODE_STEPS_HOPPERJUMP = 250 + + +class ALRHopperJumpEnv(HopperEnv): + """ + Initialization changes to normal Hopper: + - healthy_reward: 1.0 -> 0.1 -> 0 + - healthy_angle_range: (-0.2, 0.2) -> (-float('inf'), float('inf')) + - healthy_z_range: (0.7, float('inf')) -> (0.5, float('inf')) + + """ + + def __init__(self, + xml_file='hopper.xml', + forward_reward_weight=1.0, + ctrl_cost_weight=1e-3, + healthy_reward=0.0, + penalty=0.0, + context=True, + terminate_when_unhealthy=True, + healthy_state_range=(-100.0, 100.0), + healthy_z_range=(0.5, float('inf')), + healthy_angle_range=(-float('inf'), float('inf')), + reset_noise_scale=5e-3, + exclude_current_positions_from_observation=True, + max_episode_steps=250): + self.current_step = 0 + self.max_height = 0 + self.max_episode_steps = max_episode_steps + self.penalty = penalty + self.goal = 0 + self.context = context + self.exclude_current_positions_from_observation = exclude_current_positions_from_observation + super().__init__(xml_file, forward_reward_weight, ctrl_cost_weight, healthy_reward, terminate_when_unhealthy, + healthy_state_range, healthy_z_range, healthy_angle_range, reset_noise_scale, + exclude_current_positions_from_observation) + + def step(self, action): + + self.current_step += 1 + self.do_simulation(action, self.frame_skip) + height_after = self.get_body_com("torso")[2] + self.max_height = max(height_after, self.max_height) + + ctrl_cost = self.control_cost(action) + costs = ctrl_cost + done = False + + if self.current_step >= self.max_episode_steps: + hight_goal_distance = -10*np.linalg.norm(self.max_height - self.goal) if self.context else self.max_height + healthy_reward = 0 if self.context else self.healthy_reward * self.current_step + height_reward = self._forward_reward_weight * hight_goal_distance # maybe move reward calculation into if structure and define two different _forward_reward_weight variables for context and episodic seperatley + rewards = height_reward + healthy_reward + + else: + # penalty for wrong start direction of first two joints; not needed, could be removed + rewards = ((action[:2] > 0) * self.penalty).sum() if self.current_step < 10 else 0 + + observation = self._get_obs() + reward = rewards - costs + info = { + 'height' : height_after, + 'max_height': self.max_height, + 'goal' : self.goal + } + + return observation, reward, done, info + + def _get_obs(self): + return np.append(super()._get_obs(), self.goal) + + def reset(self): + self.goal = np.random.uniform(1.4, 2.3, 1) # 1.3 2.3 + self.max_height = 0 + self.current_step = 0 + return super().reset() + + # overwrite reset_model to make it deterministic + def reset_model(self): + noise_low = -self._reset_noise_scale + noise_high = self._reset_noise_scale + + qpos = self.init_qpos # + self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nq) + qvel = self.init_qvel # + self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nv) + + self.set_state(qpos, qvel) + + observation = self._get_obs() + return observation + +if __name__ == '__main__': + render_mode = "human" # "human" or "partial" or "final" + env = ALRHopperJumpEnv() + obs = env.reset() + + for i in range(2000): + # objective.load_result("/tmp/cma") + # test with random actions + ac = env.action_space.sample() + obs, rew, d, info = env.step(ac) + if i % 10 == 0: + env.render(mode=render_mode) + if d: + print('After ', i, ' steps, done: ', d) + env.reset() + + env.close() \ No newline at end of file diff --git a/alr_envs/alr/mujoco/hopper_jump/hopper_jump_on_box.py b/alr_envs/alr/mujoco/hopper_jump/hopper_jump_on_box.py new file mode 100644 index 0000000..e5e363e --- /dev/null +++ b/alr_envs/alr/mujoco/hopper_jump/hopper_jump_on_box.py @@ -0,0 +1,170 @@ +from gym.envs.mujoco.hopper_v3 import HopperEnv + +import numpy as np +import os + +MAX_EPISODE_STEPS_HOPPERJUMPONBOX = 250 + + +class ALRHopperJumpOnBoxEnv(HopperEnv): + """ + Initialization changes to normal Hopper: + - healthy_reward: 1.0 -> 0.01 -> 0.001 + - healthy_angle_range: (-0.2, 0.2) -> (-float('inf'), float('inf')) + """ + + def __init__(self, + xml_file='hopper_jump_on_box.xml', + forward_reward_weight=1.0, + ctrl_cost_weight=1e-3, + healthy_reward=0.001, + terminate_when_unhealthy=True, + healthy_state_range=(-100.0, 100.0), + healthy_z_range=(0.7, float('inf')), + healthy_angle_range=(-float('inf'), float('inf')), + reset_noise_scale=5e-3, + context=True, + exclude_current_positions_from_observation=True, + max_episode_steps=250): + self.current_step = 0 + self.max_height = 0 + self.max_episode_steps = max_episode_steps + self.min_distance = 5000 # what value? + self.hopper_on_box = False + self.context = context + self.box_x = 1 + xml_file = os.path.join(os.path.dirname(__file__), "assets", xml_file) + super().__init__(xml_file, forward_reward_weight, ctrl_cost_weight, healthy_reward, terminate_when_unhealthy, + healthy_state_range, healthy_z_range, healthy_angle_range, reset_noise_scale, + exclude_current_positions_from_observation) + + def step(self, action): + + self.current_step += 1 + self.do_simulation(action, self.frame_skip) + height_after = self.get_body_com("torso")[2] + foot_pos = self.get_body_com("foot") + self.max_height = max(height_after, self.max_height) + + vx, vz, vangle = self.sim.data.qvel[0:3] + + s = self.state_vector() + fell_over = not (np.isfinite(s).all() and (np.abs(s[2:]) < 100).all() and (height_after > .7)) + + box_pos = self.get_body_com("box") + box_size = 0.3 + box_height = 0.3 + box_center = (box_pos[0] + (box_size / 2), box_pos[1], box_height) + foot_length = 0.3 + foot_center = foot_pos[0] - (foot_length / 2) + + dist = np.linalg.norm(foot_pos - box_center) + + self.min_distance = min(dist, self.min_distance) + + # check if foot is on box + is_on_box_x = box_pos[0] <= foot_center <= box_pos[0] + box_size + is_on_box_y = True # is y always true because he can only move in x and z direction? + is_on_box_z = box_height - 0.02 <= foot_pos[2] <= box_height + 0.02 + is_on_box = is_on_box_x and is_on_box_y and is_on_box_z + if is_on_box: self.hopper_on_box = True + + ctrl_cost = self.control_cost(action) + + costs = ctrl_cost + + done = fell_over or self.hopper_on_box + + if self.current_step >= self.max_episode_steps or done: + done = False + + max_height = self.max_height.copy() + min_distance = self.min_distance.copy() + + alive_bonus = self._healthy_reward * self.current_step + box_bonus = 0 + rewards = 0 + + # TODO explain what we did here for the calculation of the reward + if is_on_box: + if self.context: + rewards -= 100 * vx ** 2 if 100 * vx ** 2 < 1 else 1 + else: + box_bonus = 10 + rewards += box_bonus + # rewards -= dist * dist ???? why when already on box? + # reward -= 90 - abs(angle) + rewards -= 100 * vx ** 2 if 100 * vx ** 2 < 1 else 1 + rewards += max_height * 3 + rewards += alive_bonus + + else: + if self.context: + rewards = -10 - min_distance + rewards += max_height * 3 + else: + # reward -= (dist*dist) + rewards -= min_distance * min_distance + # rewards -= dist / self.max_distance + rewards += max_height + rewards += alive_bonus + + else: + rewards = 0 + + observation = self._get_obs() + reward = rewards - costs + info = { + 'height': height_after, + 'max_height': self.max_height.copy(), + 'min_distance': self.min_distance, + 'goal': self.box_x, + } + + return observation, reward, done, info + + def _get_obs(self): + return np.append(super()._get_obs(), self.box_x) + + def reset(self): + + self.max_height = 0 + self.min_distance = 5000 + self.current_step = 0 + self.hopper_on_box = False + if self.context: + box_id = self.sim.model.body_name2id("box") + self.box_x = np.random.uniform(1, 3, 1) + self.sim.model.body_pos[box_id] = [self.box_x, 0, 0] + return super().reset() + + # overwrite reset_model to make it deterministic + def reset_model(self): + noise_low = -self._reset_noise_scale + noise_high = self._reset_noise_scale + + qpos = self.init_qpos # + self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nq) + qvel = self.init_qvel # + self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nv) + + self.set_state(qpos, qvel) + + observation = self._get_obs() + return observation + +if __name__ == '__main__': + render_mode = "human" # "human" or "partial" or "final" + env = ALRHopperJumpOnBoxEnv() + obs = env.reset() + + for i in range(2000): + # objective.load_result("/tmp/cma") + # test with random actions + ac = env.action_space.sample() + obs, rew, d, info = env.step(ac) + if i % 10 == 0: + env.render(mode=render_mode) + if d: + print('After ', i, ' steps, done: ', d) + env.reset() + + env.close() \ No newline at end of file diff --git a/alr_envs/alr/mujoco/hopper_jump/mp_wrapper.py b/alr_envs/alr/mujoco/hopper_jump/mp_wrapper.py new file mode 100644 index 0000000..0d6768c --- /dev/null +++ b/alr_envs/alr/mujoco/hopper_jump/mp_wrapper.py @@ -0,0 +1,31 @@ +from typing import Tuple, Union + +import numpy as np + +from mp_env_api import MPEnvWrapper + + +class MPWrapper(MPEnvWrapper): + @property + def active_obs(self): + return np.hstack([ + [False] * (5 + int(not self.exclude_current_positions_from_observation)), # position + [False] * 6, # velocity + [True] + ]) + + @property + def current_pos(self) -> Union[float, int, np.ndarray]: + return self.env.sim.data.qpos[3:6].copy() + + @property + def current_vel(self) -> Union[float, int, np.ndarray, Tuple]: + return self.env.sim.data.qvel[3:6].copy() + + @property + def goal_pos(self) -> Union[float, int, np.ndarray, Tuple]: + raise ValueError("Goal position is not available and has to be learnt based on the environment.") + + @property + def dt(self) -> Union[float, int]: + return self.env.dt diff --git a/alr_envs/alr/mujoco/hopper_throw/__init__.py b/alr_envs/alr/mujoco/hopper_throw/__init__.py new file mode 100644 index 0000000..989b5a9 --- /dev/null +++ b/alr_envs/alr/mujoco/hopper_throw/__init__.py @@ -0,0 +1 @@ +from .mp_wrapper import MPWrapper \ No newline at end of file diff --git a/alr_envs/alr/mujoco/hopper_throw/assets/hopper_throw.xml b/alr_envs/alr/mujoco/hopper_throw/assets/hopper_throw.xml new file mode 100644 index 0000000..1c39602 --- /dev/null +++ b/alr_envs/alr/mujoco/hopper_throw/assets/hopper_throw.xml @@ -0,0 +1,56 @@ + + + + + + + + diff --git a/alr_envs/alr/mujoco/hopper_throw/assets/hopper_throw_in_basket.xml b/alr_envs/alr/mujoco/hopper_throw/assets/hopper_throw_in_basket.xml new file mode 100644 index 0000000..b4f0342 --- /dev/null +++ b/alr_envs/alr/mujoco/hopper_throw/assets/hopper_throw_in_basket.xml @@ -0,0 +1,132 @@ + + + + + + + + \ No newline at end of file diff --git a/alr_envs/alr/mujoco/hopper_throw/hopper_throw.py b/alr_envs/alr/mujoco/hopper_throw/hopper_throw.py new file mode 100644 index 0000000..03553f2 --- /dev/null +++ b/alr_envs/alr/mujoco/hopper_throw/hopper_throw.py @@ -0,0 +1,113 @@ +import os + +from gym.envs.mujoco.hopper_v3 import HopperEnv +import numpy as np + +MAX_EPISODE_STEPS_HOPPERTHROW = 250 + + +class ALRHopperThrowEnv(HopperEnv): + """ + Initialization changes to normal Hopper: + - healthy_reward: 1.0 -> 0.0 -> 0.1 + - forward_reward_weight -> 5.0 + - healthy_angle_range: (-0.2, 0.2) -> (-float('inf'), float('inf')) + + Reward changes to normal Hopper: + - velocity: (x_position_after - x_position_before) -> self.get_body_com("ball")[0] + """ + + def __init__(self, + xml_file='hopper_throw.xml', + forward_reward_weight=5.0, + ctrl_cost_weight=1e-3, + healthy_reward=0.1, + terminate_when_unhealthy=True, + healthy_state_range=(-100.0, 100.0), + healthy_z_range=(0.7, float('inf')), + healthy_angle_range=(-float('inf'), float('inf')), + reset_noise_scale=5e-3, + context = True, + exclude_current_positions_from_observation=True, + max_episode_steps=250): + xml_file = os.path.join(os.path.dirname(__file__), "assets", xml_file) + self.current_step = 0 + self.max_episode_steps = max_episode_steps + self.context = context + self.goal = 0 + super().__init__(xml_file, forward_reward_weight, ctrl_cost_weight, healthy_reward, terminate_when_unhealthy, + healthy_state_range, healthy_z_range, healthy_angle_range, reset_noise_scale, + exclude_current_positions_from_observation) + + def step(self, action): + + self.current_step += 1 + self.do_simulation(action, self.frame_skip) + ball_pos_after = self.get_body_com("ball")[0] #abs(self.get_body_com("ball")[0]) # use x and y to get point and use euclid distance as reward? + ball_pos_after_y = self.get_body_com("ball")[2] + + # done = self.done TODO We should use this, not sure why there is no other termination; ball_landed should be enough, because we only look at the throw itself? - Paul and Marc + ball_landed = self.get_body_com("ball")[2] <= 0.05 + done = ball_landed + + ctrl_cost = self.control_cost(action) + costs = ctrl_cost + + rewards = 0 + + if self.current_step >= self.max_episode_steps or done: + distance_reward = -np.linalg.norm(ball_pos_after - self.goal) if self.context else \ + self._forward_reward_weight * ball_pos_after + healthy_reward = 0 if self.context else self.healthy_reward * self.current_step + + rewards = distance_reward + healthy_reward + + observation = self._get_obs() + reward = rewards - costs + info = { + 'ball_pos': ball_pos_after, + 'ball_pos_y': ball_pos_after_y, + 'current_step' : self.current_step, + 'goal' : self.goal, + } + + return observation, reward, done, info + + def _get_obs(self): + return np.append(super()._get_obs(), self.goal) + + def reset(self): + self.current_step = 0 + self.goal = self.goal = np.random.uniform(2.0, 6.0, 1) # 0.5 8.0 + return super().reset() + + # overwrite reset_model to make it deterministic + def reset_model(self): + noise_low = -self._reset_noise_scale + noise_high = self._reset_noise_scale + + qpos = self.init_qpos # + self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nq) + qvel = self.init_qvel # + self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nv) + + self.set_state(qpos, qvel) + + observation = self._get_obs() + return observation + +if __name__ == '__main__': + render_mode = "human" # "human" or "partial" or "final" + env = ALRHopperThrowEnv() + obs = env.reset() + + for i in range(2000): + # objective.load_result("/tmp/cma") + # test with random actions + ac = env.action_space.sample() + obs, rew, d, info = env.step(ac) + if i % 10 == 0: + env.render(mode=render_mode) + if d: + print('After ', i, ' steps, done: ', d) + env.reset() + + env.close() \ No newline at end of file diff --git a/alr_envs/alr/mujoco/hopper_throw/hopper_throw_in_basket.py b/alr_envs/alr/mujoco/hopper_throw/hopper_throw_in_basket.py new file mode 100644 index 0000000..74a5b21 --- /dev/null +++ b/alr_envs/alr/mujoco/hopper_throw/hopper_throw_in_basket.py @@ -0,0 +1,143 @@ +import os +from gym.envs.mujoco.hopper_v3 import HopperEnv + +import numpy as np + + +MAX_EPISODE_STEPS_HOPPERTHROWINBASKET = 250 + + +class ALRHopperThrowInBasketEnv(HopperEnv): + """ + Initialization changes to normal Hopper: + - healthy_reward: 1.0 -> 0.0 + - healthy_angle_range: (-0.2, 0.2) -> (-float('inf'), float('inf')) + - hit_basket_reward: - -> 10 + + Reward changes to normal Hopper: + - velocity: (x_position_after - x_position_before) -> (ball_position_after - ball_position_before) + """ + + def __init__(self, + xml_file='hopper_throw_in_basket.xml', + forward_reward_weight=1.0, + ctrl_cost_weight=1e-3, + healthy_reward=0.0, + hit_basket_reward=10, + basket_size=0.3, + terminate_when_unhealthy=True, + healthy_state_range=(-100.0, 100.0), + healthy_z_range=(0.7, float('inf')), + healthy_angle_range=(-float('inf'), float('inf')), + reset_noise_scale=5e-3, + context=True, + penalty=0.0, + exclude_current_positions_from_observation=True, + max_episode_steps = 250): + self.hit_basket_reward = hit_basket_reward + self.current_step = 0 + self.max_episode_steps = max_episode_steps + self.ball_in_basket = False + self.basket_size = basket_size + self.context = context + self.penalty = penalty + self.basket_x = 5 + xml_file = os.path.join(os.path.dirname(__file__), "assets", xml_file) + super().__init__(xml_file, forward_reward_weight, ctrl_cost_weight, healthy_reward, terminate_when_unhealthy, + healthy_state_range, healthy_z_range, healthy_angle_range, reset_noise_scale, + exclude_current_positions_from_observation) + + def step(self, action): + + self.current_step += 1 + self.do_simulation(action, self.frame_skip) + ball_pos = self.get_body_com("ball") + + basket_pos = self.get_body_com("basket_ground") + basket_center = (basket_pos[0] + 0.5, basket_pos[1], basket_pos[2]) + + is_in_basket_x = ball_pos[0] >= basket_pos[0] and ball_pos[0] <= basket_pos[0] + self.basket_size + is_in_basket_y = ball_pos[1] >= basket_pos[1] - (self.basket_size/2) and ball_pos[1] <= basket_pos[1] + (self.basket_size/2) + is_in_basket_z = ball_pos[2] < 0.1 + is_in_basket = is_in_basket_x and is_in_basket_y and is_in_basket_z + if is_in_basket: self.ball_in_basket = True + + ball_landed = self.get_body_com("ball")[2] <= 0.05 + done = ball_landed or is_in_basket + + rewards = 0 + + ctrl_cost = self.control_cost(action) + + costs = ctrl_cost + + if self.current_step >= self.max_episode_steps or done: + + if is_in_basket: + if not self.context: + rewards += self.hit_basket_reward + else: + dist = np.linalg.norm(ball_pos-basket_center) + if self.context: + rewards = -10 * dist + else: + rewards -= (dist*dist) + else: + # penalty not needed + rewards += ((action[:2] > 0) * self.penalty).sum() if self.current_step < 10 else 0 #too much of a penalty? + + observation = self._get_obs() + reward = rewards - costs + info = { + 'ball_pos': ball_pos[0], + } + + return observation, reward, done, info + + def _get_obs(self): + return np.append(super()._get_obs(), self.basket_x) + + def reset(self): + if self.max_episode_steps == 10: + # We have to initialize this here, because the spec is only added after creating the env. + self.max_episode_steps = self.spec.max_episode_steps + + self.current_step = 0 + self.ball_in_basket = False + if self.context: + basket_id = self.sim.model.body_name2id("basket_ground") + self.basket_x = np.random.uniform(3, 7, 1) + self.sim.model.body_pos[basket_id] = [self.basket_x, 0, 0] + return super().reset() + + # overwrite reset_model to make it deterministic + def reset_model(self): + noise_low = -self._reset_noise_scale + noise_high = self._reset_noise_scale + + qpos = self.init_qpos # + self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nq) + qvel = self.init_qvel # + self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nv) + + self.set_state(qpos, qvel) + + observation = self._get_obs() + return observation + + +if __name__ == '__main__': + render_mode = "human" # "human" or "partial" or "final" + env = ALRHopperThrowInBasketEnv() + obs = env.reset() + + for i in range(2000): + # objective.load_result("/tmp/cma") + # test with random actions + ac = env.action_space.sample() + obs, rew, d, info = env.step(ac) + if i % 10 == 0: + env.render(mode=render_mode) + if d: + print('After ', i, ' steps, done: ', d) + env.reset() + + env.close() \ No newline at end of file diff --git a/alr_envs/alr/mujoco/hopper_throw/mp_wrapper.py b/alr_envs/alr/mujoco/hopper_throw/mp_wrapper.py new file mode 100644 index 0000000..e5e9486 --- /dev/null +++ b/alr_envs/alr/mujoco/hopper_throw/mp_wrapper.py @@ -0,0 +1,30 @@ +from typing import Tuple, Union + +import numpy as np + +from mp_env_api import MPEnvWrapper + + +class MPWrapper(MPEnvWrapper): + @property + def active_obs(self): + return np.hstack([ + [False] * 17, + [True] # goal pos + ]) + + @property + def current_pos(self) -> Union[float, int, np.ndarray]: + return self.env.sim.data.qpos[3:6].copy() + + @property + def current_vel(self) -> Union[float, int, np.ndarray, Tuple]: + return self.env.sim.data.qvel[3:6].copy() + + @property + def goal_pos(self) -> Union[float, int, np.ndarray, Tuple]: + raise ValueError("Goal position is not available and has to be learnt based on the environment.") + + @property + def dt(self) -> Union[float, int]: + return self.env.dt diff --git a/alr_envs/alr/mujoco/walker_2d_jump/__init__.py b/alr_envs/alr/mujoco/walker_2d_jump/__init__.py new file mode 100644 index 0000000..989b5a9 --- /dev/null +++ b/alr_envs/alr/mujoco/walker_2d_jump/__init__.py @@ -0,0 +1 @@ +from .mp_wrapper import MPWrapper \ No newline at end of file diff --git a/alr_envs/alr/mujoco/walker_2d_jump/assets/walker2d.xml b/alr_envs/alr/mujoco/walker_2d_jump/assets/walker2d.xml new file mode 100644 index 0000000..1b48e36 --- /dev/null +++ b/alr_envs/alr/mujoco/walker_2d_jump/assets/walker2d.xml @@ -0,0 +1,62 @@ + + + + + + + \ No newline at end of file diff --git a/alr_envs/alr/mujoco/walker_2d_jump/mp_wrapper.py b/alr_envs/alr/mujoco/walker_2d_jump/mp_wrapper.py new file mode 100644 index 0000000..445fa40 --- /dev/null +++ b/alr_envs/alr/mujoco/walker_2d_jump/mp_wrapper.py @@ -0,0 +1,30 @@ +from typing import Tuple, Union + +import numpy as np + +from mp_env_api import MPEnvWrapper + + +class MPWrapper(MPEnvWrapper): + @property + def active_obs(self): + return np.hstack([ + [False] * 17, + [True] # goal pos + ]) + + @property + def current_pos(self) -> Union[float, int, np.ndarray]: + return self.env.sim.data.qpos[3:9].copy() + + @property + def current_vel(self) -> Union[float, int, np.ndarray, Tuple]: + return self.env.sim.data.qvel[3:9].copy() + + @property + def goal_pos(self) -> Union[float, int, np.ndarray, Tuple]: + raise ValueError("Goal position is not available and has to be learnt based on the environment.") + + @property + def dt(self) -> Union[float, int]: + return self.env.dt diff --git a/alr_envs/alr/mujoco/walker_2d_jump/walker_2d_jump.py b/alr_envs/alr/mujoco/walker_2d_jump/walker_2d_jump.py new file mode 100644 index 0000000..009dd9d --- /dev/null +++ b/alr_envs/alr/mujoco/walker_2d_jump/walker_2d_jump.py @@ -0,0 +1,113 @@ +import os +from gym.envs.mujoco.walker2d_v3 import Walker2dEnv +import numpy as np + +MAX_EPISODE_STEPS_WALKERJUMP = 300 + + +class ALRWalker2dJumpEnv(Walker2dEnv): + """ + healthy reward 1.0 -> 0.005 -> 0.0025 not from alex + penalty 10 -> 0 not from alex + """ + + def __init__(self, + xml_file='walker2d.xml', + forward_reward_weight=1.0, + ctrl_cost_weight=1e-3, + healthy_reward=0.0025, + terminate_when_unhealthy=True, + healthy_z_range=(0.8, 2.0), + healthy_angle_range=(-1.0, 1.0), + reset_noise_scale=5e-3, + penalty=0, + context=True, + exclude_current_positions_from_observation=True, + max_episode_steps=300): + self.current_step = 0 + self.max_episode_steps = max_episode_steps + self.max_height = 0 + self._penalty = penalty + self.context = context + self.goal = 0 + xml_file = os.path.join(os.path.dirname(__file__), "assets", xml_file) + super().__init__(xml_file, forward_reward_weight, ctrl_cost_weight, healthy_reward, terminate_when_unhealthy, + healthy_z_range, healthy_angle_range, reset_noise_scale, + exclude_current_positions_from_observation) + + def step(self, action): + self.current_step += 1 + self.do_simulation(action, self.frame_skip) + #pos_after = self.get_body_com("torso")[0] + height = self.get_body_com("torso")[2] + + self.max_height = max(height, self.max_height) + + fell_over = height < 0.2 + done = fell_over + + ctrl_cost = self.control_cost(action) + costs = ctrl_cost + + if self.current_step >= self.max_episode_steps or done: + done = True + height_goal_distance = -10 * (np.linalg.norm(self.max_height - self.goal)) if self.context else self.max_height + healthy_reward = self.healthy_reward * self.current_step + + rewards = height_goal_distance + healthy_reward + else: + # penalty not needed + rewards = 0 + rewards += ((action[:2] > 0) * self._penalty).sum() if self.current_step < 4 else 0 + rewards += ((action[3:5] > 0) * self._penalty).sum() if self.current_step < 4 else 0 + + + observation = self._get_obs() + reward = rewards - costs + info = { + 'height': height, + 'max_height': self.max_height, + 'goal' : self.goal, + } + + return observation, reward, done, info + + def _get_obs(self): + return np.append(super()._get_obs(), self.goal) + + def reset(self): + self.current_step = 0 + self.max_height = 0 + self.goal = np.random.uniform(1.5, 2.5, 1) # 1.5 3.0 + return super().reset() + + # overwrite reset_model to make it deterministic + def reset_model(self): + noise_low = -self._reset_noise_scale + noise_high = self._reset_noise_scale + + qpos = self.init_qpos # + self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nq) + qvel = self.init_qvel # + self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nv) + + self.set_state(qpos, qvel) + + observation = self._get_obs() + return observation + +if __name__ == '__main__': + render_mode = "human" # "human" or "partial" or "final" + env = ALRWalker2dJumpEnv() + obs = env.reset() + + for i in range(2000): + # objective.load_result("/tmp/cma") + # test with random actions + ac = env.action_space.sample() + obs, rew, d, info = env.step(ac) + if i % 10 == 0: + env.render(mode=render_mode) + if d: + print('After ', i, ' steps, done: ', d) + env.reset() + + env.close() \ No newline at end of file