diff --git a/alr_envs/alr/__init__.py b/alr_envs/alr/__init__.py index 1bcb02e..cdff3a1 100644 --- a/alr_envs/alr/__init__.py +++ b/alr_envs/alr/__init__.py @@ -118,33 +118,30 @@ for _dims in [5, 7]: } ) -## Hopper Jump random joints and desired position register( id='HopperJumpSparse-v0', - entry_point='alr_envs.alr.mujoco:ALRHopperXYJumpEnv', + entry_point='alr_envs.alr.mujoco:HopperJumpEnv', max_episode_steps=MAX_EPISODE_STEPS_HOPPERJUMP, kwargs={ - # "max_episode_steps": MAX_EPISODE_STEPS_HOPPERJUMP, "sparse": True, - # "healthy_reward": 1.0 } ) -## Hopper Jump random joints and desired position step based reward register( id='HopperJump-v0', - entry_point='alr_envs.alr.mujoco:ALRHopperXYJumpEnvStepBased', + entry_point='alr_envs.alr.mujoco:HopperJumpEnv', max_episode_steps=MAX_EPISODE_STEPS_HOPPERJUMP, kwargs={ - # "max_episode_steps": MAX_EPISODE_STEPS_HOPPERJUMP, "sparse": False, - # "healthy_reward": 1.0 + "healthy_reward": 1.0, + "contact_weight": 0.0, + "height_weight": 3.0, } ) register( id='ALRAntJump-v0', - entry_point='alr_envs.alr.mujoco:ALRAntJumpEnv', + entry_point='alr_envs.alr.mujoco:AntJumpEnv', max_episode_steps=MAX_EPISODE_STEPS_ANTJUMP, kwargs={ "max_episode_steps": MAX_EPISODE_STEPS_ANTJUMP, @@ -266,7 +263,7 @@ for _v in _versions: entry_point='alr_envs.utils.make_env_helpers:make_dmp_env_helper', # max_episode_steps=1, kwargs={ - "name": f"alr_envs:{_v}", + "name": f"{_v}", "wrappers": [classic_control.simple_reacher.MPWrapper], "traj_gen_kwargs": { "num_dof": 2 if "long" not in _v.lower() else 5, @@ -304,7 +301,7 @@ register( entry_point='alr_envs.utils.make_env_helpers:make_dmp_env_helper', # max_episode_steps=1, kwargs={ - "name": "alr_envs:ViaPointReacher-v0", + "name": "ViaPointReacher-v0", "wrappers": [classic_control.viapoint_reacher.MPWrapper], "traj_gen_kwargs": { "num_dof": 5, @@ -340,7 +337,7 @@ for _v in _versions: entry_point='alr_envs.utils.make_env_helpers:make_dmp_env_helper', # max_episode_steps=1, kwargs={ - "name": f"alr_envs:HoleReacher-{_v}", + "name": f"HoleReacher-{_v}", "wrappers": [classic_control.hole_reacher.MPWrapper], "traj_gen_kwargs": { "num_dof": 5, @@ -362,7 +359,7 @@ for _v in _versions: kwargs_dict_hole_reacher_promp['wrappers'].append(classic_control.hole_reacher.MPWrapper) kwargs_dict_hole_reacher_promp['trajectory_generator_kwargs']['weight_scale'] = 2 kwargs_dict_hole_reacher_promp['controller_kwargs']['controller_type'] = 'velocity' - kwargs_dict_hole_reacher_promp['name'] = f"alr_envs:{_v}" + kwargs_dict_hole_reacher_promp['name'] = f"{_v}" register( id=_env_id, entry_point='alr_envs.utils.make_env_helpers:make_promp_env_helper', @@ -370,8 +367,8 @@ for _v in _versions: ) ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append(_env_id) -## ALRReacher -_versions = ["ALRReacher-v0", "ALRLongReacher-v0", "ALRReacherSparse-v0", "ALRLongReacherSparse-v0"] +## ReacherNd +_versions = ["Reacher5d-v0", "Reacher7d-v0", "Reacher5dSparse-v0", "Reacher7dSparse-v0"] for _v in _versions: _name = _v.split("-") _env_id = f'{_name[0]}DMP-{_name[1]}' @@ -380,7 +377,7 @@ for _v in _versions: entry_point='alr_envs.utils.make_env_helpers:make_dmp_env_helper', # max_episode_steps=1, kwargs={ - "name": f"alr_envs:{_v}", + "name": f"{_v}", "wrappers": [mujoco.reacher.MPWrapper], "traj_gen_kwargs": { "num_dof": 5 if "long" not in _v.lower() else 7, @@ -404,10 +401,10 @@ for _v in _versions: kwargs_dict_alr_reacher_promp['wrappers'].append(mujoco.reacher.MPWrapper) kwargs_dict_alr_reacher_promp['controller_kwargs']['p_gains'] = 1 
kwargs_dict_alr_reacher_promp['controller_kwargs']['d_gains'] = 0.1 - kwargs_dict_alr_reacher_promp['name'] = f"alr_envs:{_v}" + kwargs_dict_alr_reacher_promp['name'] = f"{_v}" register( id=_env_id, - entry_point='alr_envs.utils.make_env_helpers:make_promp_env_helper', + entry_point='alr_envs.utils.make_env_helpers:make_bb_env_helper', kwargs=kwargs_dict_alr_reacher_promp ) ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append(_env_id) @@ -425,7 +422,7 @@ for _v in _versions: kwargs_dict_bp_promp['controller_kwargs']['d_gains'] = np.array([0.02333333, 0.1, 0.0625, 0.08, 0.03, 0.03, 0.0125]) kwargs_dict_bp_promp['basis_generator_kwargs']['num_basis'] = 2 kwargs_dict_bp_promp['basis_generator_kwargs']['num_basis_zero_start'] = 2 - kwargs_dict_bp_promp['name'] = f"alr_envs:{_v}" + kwargs_dict_bp_promp['name'] = f"{_v}" register( id=_env_id, entry_point='alr_envs.utils.make_env_helpers:make_bb_env_helper', @@ -445,7 +442,7 @@ for _v in _versions: kwargs_dict_bp_promp['controller_kwargs']['d_gains'] = np.array([0.02333333, 0.1, 0.0625, 0.08, 0.03, 0.03, 0.0125]) kwargs_dict_bp_promp['basis_generator_kwargs']['num_basis'] = 2 kwargs_dict_bp_promp['basis_generator_kwargs']['num_basis_zero_start'] = 2 - kwargs_dict_bp_promp['name'] = f"alr_envs:{_v}" + kwargs_dict_bp_promp['name'] = f"{_v}" register( id=_env_id, entry_point='alr_envs.utils.make_env_helpers:make_mp_env_helper', @@ -465,7 +462,7 @@ for _v in _versions: _env_id = f'{_name[0]}ProMP-{_name[1]}' kwargs_dict_ant_jump_promp = deepcopy(DEFAULT_BB_DICT) kwargs_dict_ant_jump_promp['wrappers'].append(mujoco.ant_jump.MPWrapper) - kwargs_dict_ant_jump_promp['name'] = f"alr_envs:{_v}" + kwargs_dict_ant_jump_promp['name'] = f"{_v}" register( id=_env_id, entry_point='alr_envs.utils.make_env_helpers:make_mp_env_helper', @@ -482,7 +479,7 @@ for _v in _versions: _env_id = f'{_name[0]}ProMP-{_name[1]}' kwargs_dict_halfcheetah_jump_promp = deepcopy(DEFAULT_BB_DICT) kwargs_dict_halfcheetah_jump_promp['wrappers'].append(mujoco.half_cheetah_jump.MPWrapper) - kwargs_dict_halfcheetah_jump_promp['name'] = f"alr_envs:{_v}" + kwargs_dict_halfcheetah_jump_promp['name'] = f"{_v}" register( id=_env_id, entry_point='alr_envs.utils.make_env_helpers:make_promp_env_helper', @@ -502,7 +499,7 @@ for _v in _versions: _env_id = f'{_name[0]}ProMP-{_name[1]}' kwargs_dict_hopper_jump_promp = deepcopy(DEFAULT_BB_DICT) kwargs_dict_hopper_jump_promp['wrappers'].append(mujoco.hopper_jump.MPWrapper) - kwargs_dict_hopper_jump_promp['name'] = f"alr_envs:{_v}" + kwargs_dict_hopper_jump_promp['name'] = f"{_v}" register( id=_env_id, entry_point='alr_envs.utils.make_env_helpers:make_mp_env_helper', @@ -520,7 +517,7 @@ for _v in _versions: _env_id = f'{_name[0]}ProMP-{_name[1]}' kwargs_dict_walker2d_jump_promp = deepcopy(DEFAULT_BB_DICT) kwargs_dict_walker2d_jump_promp['wrappers'].append(mujoco.walker_2d_jump.MPWrapper) - kwargs_dict_walker2d_jump_promp['name'] = f"alr_envs:{_v}" + kwargs_dict_walker2d_jump_promp['name'] = f"{_v}" register( id=_env_id, entry_point='alr_envs.utils.make_env_helpers:make_promp_env_helper', @@ -583,7 +580,7 @@ register( # CtxtFree are v0, Contextual are v1 register( id='ALRAntJump-v0', - entry_point='alr_envs.alr.mujoco:ALRAntJumpEnv', + entry_point='alr_envs.alr.mujoco:AntJumpEnv', max_episode_steps=MAX_EPISODE_STEPS_ANTJUMP, kwargs={ "max_episode_steps": MAX_EPISODE_STEPS_ANTJUMP, @@ -602,7 +599,7 @@ register( ) register( id='ALRHopperJump-v0', - entry_point='alr_envs.alr.mujoco:ALRHopperJumpEnv', + entry_point='alr_envs.alr.mujoco:HopperJumpEnv', 
max_episode_steps=MAX_EPISODE_STEPS_HOPPERJUMP, kwargs={ "max_episode_steps": MAX_EPISODE_STEPS_HOPPERJUMP, @@ -649,7 +646,7 @@ for i in _vs: id=_env_id, entry_point='alr_envs.utils.make_env_helpers:make_promp_env_helper', kwargs={ - "name": f"alr_envs:{_env_id.replace('ProMP', '')}", + "name": f"{_env_id.replace('ProMP', '')}", "wrappers": [mujoco.reacher.MPWrapper], "mp_kwargs": { "num_dof": 5, @@ -672,7 +669,7 @@ for i in _vs: id=_env_id, entry_point='alr_envs.utils.make_env_helpers:make_promp_env_helper', kwargs={ - "name": f"alr_envs:{_env_id.replace('ProMP', '')}", + "name": f"{_env_id.replace('ProMP', '')}", "wrappers": [mujoco.reacher.MPWrapper], "mp_kwargs": { "num_dof": 5, diff --git a/alr_envs/alr/classic_control/base_reacher/base_reacher.py b/alr_envs/alr/classic_control/base_reacher/base_reacher.py index e76ce85..1af8187 100644 --- a/alr_envs/alr/classic_control/base_reacher/base_reacher.py +++ b/alr_envs/alr/classic_control/base_reacher/base_reacher.py @@ -1,9 +1,10 @@ from abc import ABC, abstractmethod -from typing import Union +from typing import Union, Tuple, Optional import gym import numpy as np from gym import spaces +from gym.core import ObsType from gym.utils import seeding from alr_envs.alr.classic_control.utils import intersect @@ -14,8 +15,7 @@ class BaseReacherEnv(gym.Env, ABC): Base class for all reaching environments. """ - def __init__(self, n_links: int, random_start: bool = True, - allow_self_collision: bool = False): + def __init__(self, n_links: int, random_start: bool = True, allow_self_collision: bool = False): super().__init__() self.link_lengths = np.ones(n_links) self.n_links = n_links @@ -70,7 +70,8 @@ class BaseReacherEnv(gym.Env, ABC): def current_vel(self): return self._angle_velocity.copy() - def reset(self): + def reset(self, *, seed: Optional[int] = None, return_info: bool = False, + options: Optional[dict] = None, ) -> Union[ObsType, Tuple[ObsType, dict]]: # Sample only orientation of first link, i.e. the arm is always straight. 
if self.random_start: first_joint = self.np_random.uniform(np.pi / 4, 3 * np.pi / 4) diff --git a/alr_envs/alr/classic_control/hole_reacher/hole_reacher.py b/alr_envs/alr/classic_control/hole_reacher/hole_reacher.py index 883be8c..8f0122f 100644 --- a/alr_envs/alr/classic_control/hole_reacher/hole_reacher.py +++ b/alr_envs/alr/classic_control/hole_reacher/hole_reacher.py @@ -1,8 +1,9 @@ -from typing import Union +from typing import Union, Optional, Tuple import gym import matplotlib.pyplot as plt import numpy as np +from gym.core import ObsType from matplotlib import patches from alr_envs.alr.classic_control.base_reacher.base_reacher_direct import BaseReacherDirectEnv @@ -51,7 +52,8 @@ class HoleReacherEnv(BaseReacherDirectEnv): else: raise ValueError("Unknown reward function {}".format(rew_fct)) - def reset(self): + def reset(self, *, seed: Optional[int] = None, return_info: bool = False, + options: Optional[dict] = None, ) -> Union[ObsType, Tuple[ObsType, dict]]: self._generate_hole() self._set_patches() self.reward_function.reset() @@ -223,6 +225,7 @@ class HoleReacherEnv(BaseReacherDirectEnv): if __name__ == "__main__": import time + env = HoleReacherEnv(5) env.reset() diff --git a/alr_envs/alr/classic_control/hole_reacher/mp_wrapper.py b/alr_envs/alr/classic_control/hole_reacher/mp_wrapper.py index 0a5eaa6..00a5eb8 100644 --- a/alr_envs/alr/classic_control/hole_reacher/mp_wrapper.py +++ b/alr_envs/alr/classic_control/hole_reacher/mp_wrapper.py @@ -7,6 +7,7 @@ from alr_envs.black_box.raw_interface_wrapper import RawInterfaceWrapper class MPWrapper(RawInterfaceWrapper): + @property def context_mask(self): return np.hstack([ [self.env.random_start] * self.env.n_links, # cos @@ -25,7 +26,3 @@ class MPWrapper(RawInterfaceWrapper): @property def current_vel(self) -> Union[float, int, np.ndarray, Tuple]: return self.env.current_vel - - @property - def dt(self) -> Union[float, int]: - return self.env.dt diff --git a/alr_envs/alr/classic_control/simple_reacher/mp_wrapper.py b/alr_envs/alr/classic_control/simple_reacher/mp_wrapper.py index c5ef66f..3fa12d8 100644 --- a/alr_envs/alr/classic_control/simple_reacher/mp_wrapper.py +++ b/alr_envs/alr/classic_control/simple_reacher/mp_wrapper.py @@ -7,6 +7,7 @@ from alr_envs.black_box.raw_interface_wrapper import RawInterfaceWrapper class MPWrapper(RawInterfaceWrapper): + @property def context_mask(self): return np.hstack([ [self.env.random_start] * self.env.n_links, # cos @@ -23,7 +24,3 @@ class MPWrapper(RawInterfaceWrapper): @property def current_vel(self) -> Union[float, int, np.ndarray, Tuple]: return self.env.current_vel - - @property - def dt(self) -> Union[float, int]: - return self.env.dt diff --git a/alr_envs/alr/classic_control/simple_reacher/simple_reacher.py b/alr_envs/alr/classic_control/simple_reacher/simple_reacher.py index dac06a3..eb079d0 100644 --- a/alr_envs/alr/classic_control/simple_reacher/simple_reacher.py +++ b/alr_envs/alr/classic_control/simple_reacher/simple_reacher.py @@ -1,8 +1,9 @@ -from typing import Iterable, Union +from typing import Iterable, Union, Optional, Tuple import matplotlib.pyplot as plt import numpy as np from gym import spaces +from gym.core import ObsType from alr_envs.alr.classic_control.base_reacher.base_reacher_torque import BaseReacherTorqueEnv @@ -15,7 +16,7 @@ class SimpleReacherEnv(BaseReacherTorqueEnv): """ def __init__(self, n_links: int, target: Union[None, Iterable] = None, random_start: bool = True, - allow_self_collision: bool = False,): + allow_self_collision: bool = False, ): 
super().__init__(n_links, random_start, allow_self_collision) # provided initial parameters @@ -41,7 +42,8 @@ class SimpleReacherEnv(BaseReacherTorqueEnv): # def start_pos(self): # return self._start_pos - def reset(self): + def reset(self, *, seed: Optional[int] = None, return_info: bool = False, + options: Optional[dict] = None, ) -> Union[ObsType, Tuple[ObsType, dict]]: self._generate_goal() return super().reset() diff --git a/alr_envs/alr/classic_control/viapoint_reacher/mp_wrapper.py b/alr_envs/alr/classic_control/viapoint_reacher/mp_wrapper.py index 2b210de..d152f3b 100644 --- a/alr_envs/alr/classic_control/viapoint_reacher/mp_wrapper.py +++ b/alr_envs/alr/classic_control/viapoint_reacher/mp_wrapper.py @@ -7,6 +7,7 @@ from alr_envs.black_box.raw_interface_wrapper import RawInterfaceWrapper class MPWrapper(RawInterfaceWrapper): + @property def context_mask(self): return np.hstack([ [self.env.random_start] * self.env.n_links, # cos @@ -24,7 +25,3 @@ class MPWrapper(RawInterfaceWrapper): @property def current_vel(self) -> Union[float, int, np.ndarray, Tuple]: return self.env.current_vel - - @property - def dt(self) -> Union[float, int]: - return self.env.dt diff --git a/alr_envs/alr/classic_control/viapoint_reacher/viapoint_reacher.py b/alr_envs/alr/classic_control/viapoint_reacher/viapoint_reacher.py index 292e40a..569ca2c 100644 --- a/alr_envs/alr/classic_control/viapoint_reacher/viapoint_reacher.py +++ b/alr_envs/alr/classic_control/viapoint_reacher/viapoint_reacher.py @@ -1,8 +1,9 @@ -from typing import Iterable, Union +from typing import Iterable, Union, Tuple, Optional import gym import matplotlib.pyplot as plt import numpy as np +from gym.core import ObsType from gym.utils import seeding from alr_envs.alr.classic_control.base_reacher.base_reacher_direct import BaseReacherDirectEnv @@ -40,7 +41,8 @@ class ViaPointReacherEnv(BaseReacherDirectEnv): # def start_pos(self): # return self._start_pos - def reset(self): + def reset(self, *, seed: Optional[int] = None, return_info: bool = False, + options: Optional[dict] = None, ) -> Union[ObsType, Tuple[ObsType, dict]]: self._generate_goal() return super().reset() @@ -183,8 +185,10 @@ class ViaPointReacherEnv(BaseReacherDirectEnv): plt.pause(0.01) + if __name__ == "__main__": import time + env = ViaPointReacherEnv(5) env.reset() diff --git a/alr_envs/alr/mujoco/__init__.py b/alr_envs/alr/mujoco/__init__.py index f880578..906a9a5 100644 --- a/alr_envs/alr/mujoco/__init__.py +++ b/alr_envs/alr/mujoco/__init__.py @@ -1,4 +1,4 @@ -from .ant_jump.ant_jump import ALRAntJumpEnv +from .ant_jump.ant_jump import AntJumpEnv from .ball_in_a_cup.ball_in_a_cup import ALRBallInACupEnv from .ball_in_a_cup.biac_pd import ALRBallInACupPDEnv from alr_envs.alr.mujoco.beerpong.beerpong import BeerPongEnv diff --git a/alr_envs/alr/mujoco/ant_jump/__init__.py b/alr_envs/alr/mujoco/ant_jump/__init__.py index 8a04a02..c5e6d2f 100644 --- a/alr_envs/alr/mujoco/ant_jump/__init__.py +++ b/alr_envs/alr/mujoco/ant_jump/__init__.py @@ -1 +1 @@ -from .new_mp_wrapper import MPWrapper +from .mp_wrapper import MPWrapper diff --git a/alr_envs/alr/mujoco/ant_jump/ant_jump.py b/alr_envs/alr/mujoco/ant_jump/ant_jump.py index 27098ac..deffcfa 100644 --- a/alr_envs/alr/mujoco/ant_jump/ant_jump.py +++ b/alr_envs/alr/mujoco/ant_jump/ant_jump.py @@ -1,14 +1,19 @@ +from typing import Tuple, Union, Optional + import numpy as np +from gym.core import ObsType from gym.envs.mujoco.ant_v3 import AntEnv MAX_EPISODE_STEPS_ANTJUMP = 200 -# TODO: This environment was not testet yet. 
Do the following todos and test it. + + +# TODO: This environment was not tested yet. Do the following todos and test it. # TODO: Right now this environment only considers jumping to a specific height, which is not nice. It should be extended # to the same structure as the Hopper, where the angles are randomized (->contexts) and the agent should jump as heigh # as possible, while landing at a specific target position -class ALRAntJumpEnv(AntEnv): +class AntJumpEnv(AntEnv): """ Initialization changes to normal Ant: - healthy_reward: 1.0 -> 0.01 -> 0.0 no healthy reward needed - Paul and Marc @@ -27,17 +32,15 @@ class ALRAntJumpEnv(AntEnv): contact_force_range=(-1.0, 1.0), reset_noise_scale=0.1, exclude_current_positions_from_observation=True, - max_episode_steps=200): + ): self.current_step = 0 self.max_height = 0 - self.max_episode_steps = max_episode_steps self.goal = 0 super().__init__(xml_file, ctrl_cost_weight, contact_cost_weight, healthy_reward, terminate_when_unhealthy, healthy_z_range, contact_force_range, reset_noise_scale, exclude_current_positions_from_observation) def step(self, action): - self.current_step += 1 self.do_simulation(action, self.frame_skip) @@ -52,12 +55,12 @@ class ALRAntJumpEnv(AntEnv): costs = ctrl_cost + contact_cost - done = height < 0.3 # fall over -> is the 0.3 value from healthy_z_range? TODO change 0.3 to the value of healthy z angle + done = height < 0.3 # fall over -> is the 0.3 value from healthy_z_range? TODO change 0.3 to the value of healthy z angle - if self.current_step == self.max_episode_steps or done: + if self.current_step == MAX_EPISODE_STEPS_ANTJUMP or done: # -10 for scaling the value of the distance between the max_height and the goal height; only used when context is enabled # height_reward = -10 * (np.linalg.norm(self.max_height - self.goal)) - height_reward = -10*np.linalg.norm(self.max_height - self.goal) + height_reward = -10 * np.linalg.norm(self.max_height - self.goal) # no healthy reward when using context, because we optimize a negative value healthy_reward = 0 @@ -77,7 +80,8 @@ class ALRAntJumpEnv(AntEnv): def _get_obs(self): return np.append(super()._get_obs(), self.goal) - def reset(self): + def reset(self, *, seed: Optional[int] = None, return_info: bool = False, + options: Optional[dict] = None, ) -> Union[ObsType, Tuple[ObsType, dict]]: self.current_step = 0 self.max_height = 0 self.goal = np.random.uniform(1.0, 2.5, @@ -96,19 +100,3 @@ class ALRAntJumpEnv(AntEnv): observation = self._get_obs() return observation - -if __name__ == '__main__': - render_mode = "human" # "human" or "partial" or "final" - env = ALRAntJumpEnv() - obs = env.reset() - - for i in range(2000): - # test with random actions - ac = env.action_space.sample() - obs, rew, d, info = env.step(ac) - if i % 10 == 0: - env.render(mode=render_mode) - if d: - env.reset() - - env.close() diff --git a/alr_envs/alr/mujoco/ant_jump/mp_wrapper.py b/alr_envs/alr/mujoco/ant_jump/mp_wrapper.py index f6e99a3..a481d6f 100644 --- a/alr_envs/alr/mujoco/ant_jump/mp_wrapper.py +++ b/alr_envs/alr/mujoco/ant_jump/mp_wrapper.py @@ -1,4 +1,4 @@ -from typing import Tuple, Union +from typing import Union, Tuple import numpy as np @@ -8,10 +8,10 @@ from alr_envs.black_box.raw_interface_wrapper import RawInterfaceWrapper class MPWrapper(RawInterfaceWrapper): @property - def context_mask(self) -> np.ndarray: + def context_mask(self): return np.hstack([ - [False] * 111, # ant has 111 dimensional observation space !! 
- [True] # goal height + [False] * 111, # ant has 111 dimensional observation space !! + [True] # goal height ]) @property @@ -21,11 +21,3 @@ class MPWrapper(RawInterfaceWrapper): @property def current_vel(self) -> Union[float, int, np.ndarray, Tuple]: return self.env.sim.data.qvel[6:14].copy() - - @property - def goal_pos(self) -> Union[float, int, np.ndarray, Tuple]: - raise ValueError("Goal position is not available and has to be learnt based on the environment.") - - @property - def dt(self) -> Union[float, int]: - return self.env.dt diff --git a/alr_envs/alr/mujoco/ant_jump/new_mp_wrapper.py b/alr_envs/alr/mujoco/ant_jump/new_mp_wrapper.py deleted file mode 100644 index f6f026b..0000000 --- a/alr_envs/alr/mujoco/ant_jump/new_mp_wrapper.py +++ /dev/null @@ -1,22 +0,0 @@ -from alr_envs.black_box.black_box_wrapper import BlackBoxWrapper -from typing import Union, Tuple -import numpy as np - -from alr_envs.black_box.raw_interface_wrapper import RawInterfaceWrapper - - -class MPWrapper(RawInterfaceWrapper): - - def get_context_mask(self): - return np.hstack([ - [False] * 111, # ant has 111 dimensional observation space !! - [True] # goal height - ]) - - @property - def current_pos(self) -> Union[float, int, np.ndarray]: - return self.env.sim.data.qpos[7:15].copy() - - @property - def current_vel(self) -> Union[float, int, np.ndarray, Tuple]: - return self.env.sim.data.qvel[6:14].copy() diff --git a/alr_envs/alr/mujoco/alr_reward_fct.py b/alr_envs/alr/mujoco/ball_in_a_cup/alr_reward_fct.py similarity index 100% rename from alr_envs/alr/mujoco/alr_reward_fct.py rename to alr_envs/alr/mujoco/ball_in_a_cup/alr_reward_fct.py diff --git a/alr_envs/alr/mujoco/ball_in_a_cup/ball_in_a_cup_reward.py b/alr_envs/alr/mujoco/ball_in_a_cup/ball_in_a_cup_reward.py index 4ea4381..b4d0e6e 100644 --- a/alr_envs/alr/mujoco/ball_in_a_cup/ball_in_a_cup_reward.py +++ b/alr_envs/alr/mujoco/ball_in_a_cup/ball_in_a_cup_reward.py @@ -1,5 +1,5 @@ import numpy as np -from alr_envs.alr.mujoco import alr_reward_fct +from alr_envs.alr.mujoco.ball_in_a_cup import alr_reward_fct class BallInACupReward(alr_reward_fct.AlrReward): diff --git a/alr_envs/alr/mujoco/ball_in_a_cup/ball_in_a_cup_reward_simple.py b/alr_envs/alr/mujoco/ball_in_a_cup/ball_in_a_cup_reward_simple.py index 0762e22..8044eb8 100644 --- a/alr_envs/alr/mujoco/ball_in_a_cup/ball_in_a_cup_reward_simple.py +++ b/alr_envs/alr/mujoco/ball_in_a_cup/ball_in_a_cup_reward_simple.py @@ -1,5 +1,5 @@ import numpy as np -from alr_envs.alr.mujoco import alr_reward_fct +from alr_envs.alr.mujoco.ball_in_a_cup import alr_reward_fct class BallInACupReward(alr_reward_fct.AlrReward): diff --git a/alr_envs/alr/mujoco/ball_in_a_cup/biac_pd.py b/alr_envs/alr/mujoco/ball_in_a_cup/biac_pd.py index b047c54..4abfb6c 100644 --- a/alr_envs/alr/mujoco/ball_in_a_cup/biac_pd.py +++ b/alr_envs/alr/mujoco/ball_in_a_cup/biac_pd.py @@ -6,17 +6,6 @@ import mujoco_py.builder import numpy as np from gym import utils -from mp_env_api.mp_wrappers.detpmp_wrapper import DetPMPWrapper -from mp_env_api.utils.policies import PDControllerExtend - - -def make_detpmp_env(**kwargs): - name = kwargs.pop("name") - _env = gym.make(name) - policy = PDControllerExtend(_env, p_gains=kwargs.pop('p_gains'), d_gains=kwargs.pop('d_gains')) - kwargs['policy_type'] = policy - return DetPMPWrapper(_env, **kwargs) - class ALRBallInACupPDEnv(mujoco_env.MujocoEnv, utils.EzPickle): def __init__(self, frame_skip=4, apply_gravity_comp=True, simplified: bool = False, diff --git a/alr_envs/alr/mujoco/beerpong/__init__.py 
b/alr_envs/alr/mujoco/beerpong/__init__.py index 8a04a02..c5e6d2f 100644 --- a/alr_envs/alr/mujoco/beerpong/__init__.py +++ b/alr_envs/alr/mujoco/beerpong/__init__.py @@ -1 +1 @@ -from .new_mp_wrapper import MPWrapper +from .mp_wrapper import MPWrapper diff --git a/alr_envs/alr/mujoco/beerpong/beerpong.py b/alr_envs/alr/mujoco/beerpong/beerpong.py index 9adab60..fd820eb 100644 --- a/alr_envs/alr/mujoco/beerpong/beerpong.py +++ b/alr_envs/alr/mujoco/beerpong/beerpong.py @@ -1,12 +1,11 @@ import os +from typing import Optional import mujoco_py.builder import numpy as np from gym import utils from gym.envs.mujoco import MujocoEnv -from alr_envs.alr.mujoco.beerpong.deprecated.beerpong_reward_staged import BeerPongReward - # XML Variables ROBOT_COLLISION_OBJ = ["wrist_palm_link_convex_geom", "wrist_pitch_link_convex_decomposition_p1_geom", @@ -76,7 +75,7 @@ class BeerPongEnv(MujocoEnv, utils.EzPickle): def start_vel(self): return self._start_vel - def reset(self): + def reset(self, *, seed: Optional[int] = None, return_info: bool = False, options: Optional[dict] = None): self.dists = [] self.dists_final = [] self.action_costs = [] diff --git a/alr_envs/alr/mujoco/beerpong/mp_wrapper.py b/alr_envs/alr/mujoco/beerpong/mp_wrapper.py index 20e7532..e69d4f9 100644 --- a/alr_envs/alr/mujoco/beerpong/mp_wrapper.py +++ b/alr_envs/alr/mujoco/beerpong/mp_wrapper.py @@ -1,4 +1,4 @@ -from typing import Tuple, Union +from typing import Union, Tuple import numpy as np @@ -7,34 +7,36 @@ from alr_envs.black_box.raw_interface_wrapper import RawInterfaceWrapper class MPWrapper(RawInterfaceWrapper): - @property - def context_mask(self) -> np.ndarray: + def get_context_mask(self): return np.hstack([ [False] * 7, # cos [False] * 7, # sin - [False] * 7, # joint velocities - [False] * 3, # cup_goal_diff_final - [False] * 3, # cup_goal_diff_top + [False] * 7, # joint velocities + [False] * 3, # cup_goal_diff_final + [False] * 3, # cup_goal_diff_top [True] * 2, # xy position of cup [False] # env steps ]) - @property - def start_pos(self): - return self._start_pos - @property def current_pos(self) -> Union[float, int, np.ndarray, Tuple]: - return self.sim.data.qpos[0:7].copy() + return self.env.sim.data.qpos[0:7].copy() @property def current_vel(self) -> Union[float, int, np.ndarray, Tuple]: - return self.sim.data.qvel[0:7].copy() + return self.env.sim.data.qvel[0:7].copy() - @property - def goal_pos(self): - raise ValueError("Goal position is not available and has to be learnt based on the environment.") + # TODO: Fix this + def _episode_callback(self, action: np.ndarray, mp) -> Tuple[np.ndarray, Union[np.ndarray, None]]: + if mp.learn_tau: + self.env.env.release_step = action[0] / self.env.dt # Tau value + return action, None + else: + return action, None - @property - def dt(self) -> Union[float, int]: - return self.env.dt + def set_context(self, context): + xyz = np.zeros(3) + xyz[:2] = context + xyz[-1] = 0.840 + self.env.env.model.body_pos[self.env.env.cup_table_id] = xyz + return self.get_observation_from_step(self.env.env._get_obs()) diff --git a/alr_envs/alr/mujoco/beerpong/new_mp_wrapper.py b/alr_envs/alr/mujoco/beerpong/new_mp_wrapper.py deleted file mode 100644 index e447a49..0000000 --- a/alr_envs/alr/mujoco/beerpong/new_mp_wrapper.py +++ /dev/null @@ -1,41 +0,0 @@ -from typing import Union, Tuple - -import numpy as np - -from alr_envs.black_box.raw_interface_wrapper import RawInterfaceWrapper - - -class MPWrapper(RawInterfaceWrapper): - @property - def current_pos(self) -> Union[float, int, np.ndarray, 
Tuple]: - return self.env.sim.data.qpos[0:7].copy() - - @property - def current_vel(self) -> Union[float, int, np.ndarray, Tuple]: - return self.env.sim.data.qvel[0:7].copy() - - def get_context_mask(self): - return np.hstack([ - [False] * 7, # cos - [False] * 7, # sin - [False] * 7, # joint velocities - [False] * 3, # cup_goal_diff_final - [False] * 3, # cup_goal_diff_top - [True] * 2, # xy position of cup - [False] # env steps - ]) - - # TODO: Fix this - def _episode_callback(self, action: np.ndarray, mp) -> Tuple[np.ndarray, Union[np.ndarray, None]]: - if mp.learn_tau: - self.env.env.release_step = action[0] / self.env.dt # Tau value - return action, None - else: - return action, None - - def set_context(self, context): - xyz = np.zeros(3) - xyz[:2] = context - xyz[-1] = 0.840 - self.env.env.model.body_pos[self.env.env.cup_table_id] = xyz - return self.get_observation_from_step(self.env.env._get_obs()) diff --git a/alr_envs/alr/mujoco/half_cheetah_jump/__init__.py b/alr_envs/alr/mujoco/half_cheetah_jump/__init__.py index 8a04a02..c5e6d2f 100644 --- a/alr_envs/alr/mujoco/half_cheetah_jump/__init__.py +++ b/alr_envs/alr/mujoco/half_cheetah_jump/__init__.py @@ -1 +1 @@ -from .new_mp_wrapper import MPWrapper +from .mp_wrapper import MPWrapper diff --git a/alr_envs/alr/mujoco/half_cheetah_jump/half_cheetah_jump.py b/alr_envs/alr/mujoco/half_cheetah_jump/half_cheetah_jump.py index 4b53a5d..a90edf6 100644 --- a/alr_envs/alr/mujoco/half_cheetah_jump/half_cheetah_jump.py +++ b/alr_envs/alr/mujoco/half_cheetah_jump/half_cheetah_jump.py @@ -1,4 +1,7 @@ import os +from typing import Tuple, Union, Optional + +from gym.core import ObsType from gym.envs.mujoco.half_cheetah_v3 import HalfCheetahEnv import numpy as np @@ -20,7 +23,7 @@ class ALRHalfCheetahJumpEnv(HalfCheetahEnv): max_episode_steps=100): self.current_step = 0 self.max_height = 0 - self.max_episode_steps = max_episode_steps + # self.max_episode_steps = max_episode_steps self.goal = 0 self.context = context xml_file = os.path.join(os.path.dirname(__file__), "assets", xml_file) @@ -37,15 +40,15 @@ class ALRHalfCheetahJumpEnv(HalfCheetahEnv): ## Didnt use fell_over, because base env also has no done condition - Paul and Marc # fell_over = abs(self.sim.data.qpos[2]) > 2.5 # how to figure out if the cheetah fell over? -> 2.5 oke? - # TODO: Should a fall over be checked herE? + # TODO: Should a fall over be checked here? 
done = False ctrl_cost = self.control_cost(action) costs = ctrl_cost - if self.current_step == self.max_episode_steps: - height_goal_distance = -10*np.linalg.norm(self.max_height - self.goal) + 1e-8 if self.context \ - else self.max_height + if self.current_step == MAX_EPISODE_STEPS_HALFCHEETAHJUMP: + height_goal_distance = -10 * np.linalg.norm(self.max_height - self.goal) + 1e-8 if self.context \ + else self.max_height rewards = self._forward_reward_weight * height_goal_distance else: rewards = 0 @@ -62,7 +65,8 @@ class ALRHalfCheetahJumpEnv(HalfCheetahEnv): def _get_obs(self): return np.append(super()._get_obs(), self.goal) - def reset(self): + def reset(self, *, seed: Optional[int] = None, return_info: bool = False, + options: Optional[dict] = None, ) -> Union[ObsType, Tuple[ObsType, dict]]: self.max_height = 0 self.current_step = 0 self.goal = np.random.uniform(1.1, 1.6, 1) # 1.1 1.6 @@ -80,21 +84,3 @@ class ALRHalfCheetahJumpEnv(HalfCheetahEnv): observation = self._get_obs() return observation - -if __name__ == '__main__': - render_mode = "human" # "human" or "partial" or "final" - env = ALRHalfCheetahJumpEnv() - obs = env.reset() - - for i in range(2000): - # objective.load_result("/tmp/cma") - # test with random actions - ac = env.action_space.sample() - obs, rew, d, info = env.step(ac) - if i % 10 == 0: - env.render(mode=render_mode) - if d: - print('After ', i, ' steps, done: ', d) - env.reset() - - env.close() \ No newline at end of file diff --git a/alr_envs/alr/mujoco/half_cheetah_jump/mp_wrapper.py b/alr_envs/alr/mujoco/half_cheetah_jump/mp_wrapper.py index 930da6d..298bf53 100644 --- a/alr_envs/alr/mujoco/half_cheetah_jump/mp_wrapper.py +++ b/alr_envs/alr/mujoco/half_cheetah_jump/mp_wrapper.py @@ -10,7 +10,7 @@ class MPWrapper(RawInterfaceWrapper): def context_mask(self) -> np.ndarray: return np.hstack([ [False] * 17, - [True] # goal height + [True] # goal height ]) @property @@ -20,11 +20,3 @@ class MPWrapper(RawInterfaceWrapper): @property def current_vel(self) -> Union[float, int, np.ndarray, Tuple]: return self.env.sim.data.qvel[3:9].copy() - - @property - def goal_pos(self) -> Union[float, int, np.ndarray, Tuple]: - raise ValueError("Goal position is not available and has to be learnt based on the environment.") - - @property - def dt(self) -> Union[float, int]: - return self.env.dt diff --git a/alr_envs/alr/mujoco/half_cheetah_jump/new_mp_wrapper.py b/alr_envs/alr/mujoco/half_cheetah_jump/new_mp_wrapper.py deleted file mode 100644 index 9a65952..0000000 --- a/alr_envs/alr/mujoco/half_cheetah_jump/new_mp_wrapper.py +++ /dev/null @@ -1,22 +0,0 @@ -from typing import Tuple, Union - -import numpy as np - -from alr_envs.black_box.raw_interface_wrapper import RawInterfaceWrapper - - -class MPWrapper(RawInterfaceWrapper): - def context_mask(self): - return np.hstack([ - [False] * 17, - [True] # goal height - ]) - - @property - def current_pos(self) -> Union[float, int, np.ndarray]: - return self.env.sim.data.qpos[3:9].copy() - - @property - def current_vel(self) -> Union[float, int, np.ndarray, Tuple]: - return self.env.sim.data.qvel[3:9].copy() - diff --git a/alr_envs/alr/mujoco/hopper_jump/__init__.py b/alr_envs/alr/mujoco/hopper_jump/__init__.py index c5e6d2f..e901144 100644 --- a/alr_envs/alr/mujoco/hopper_jump/__init__.py +++ b/alr_envs/alr/mujoco/hopper_jump/__init__.py @@ -1 +1,2 @@ from .mp_wrapper import MPWrapper + diff --git a/alr_envs/alr/mujoco/hopper_jump/hopper_jump.py b/alr_envs/alr/mujoco/hopper_jump/hopper_jump.py index 78b06d3..7409300 100644 --- 
a/alr_envs/alr/mujoco/hopper_jump/hopper_jump.py +++ b/alr_envs/alr/mujoco/hopper_jump/hopper_jump.py @@ -1,3 +1,4 @@ +import copy from typing import Optional from gym.envs.mujoco.hopper_v3 import HopperEnv @@ -7,10 +8,11 @@ import os MAX_EPISODE_STEPS_HOPPERJUMP = 250 -class ALRHopperJumpEnv(HopperEnv): +class HopperJumpEnv(HopperEnv): """ Initialization changes to normal Hopper: - terminate_when_unhealthy: True -> False + - healthy_reward: 1.0 -> 2.0 - healthy_z_range: (0.7, float('inf')) -> (0.5, float('inf')) - healthy_angle_range: (-0.2, 0.2) -> (-float('inf'), float('inf')) - exclude_current_positions_from_observation: True -> False @@ -21,24 +23,28 @@ class ALRHopperJumpEnv(HopperEnv): xml_file='hopper_jump.xml', forward_reward_weight=1.0, ctrl_cost_weight=1e-3, - healthy_reward=1.0, - penalty=0.0, + healthy_reward=2.0, # 1 step + contact_weight=2.0, # 0 step + height_weight=10.0, # 3 step + dist_weight=3.0, # 3 step terminate_when_unhealthy=False, healthy_state_range=(-100.0, 100.0), healthy_z_range=(0.5, float('inf')), healthy_angle_range=(-float('inf'), float('inf')), reset_noise_scale=5e-3, exclude_current_positions_from_observation=False, + sparse=False, ): - self._steps = 0 + self.sparse = sparse + self._height_weight = height_weight + self._dist_weight = dist_weight + self._contact_weight = contact_weight + self.max_height = 0 - # self.penalty = penalty self.goal = 0 - self._floor_geom_id = None - self._foot_geom_id = None - + self._steps = 0 self.contact_with_floor = False self.init_floor_contact = False self.has_left_floor = False @@ -49,12 +55,12 @@ class ALRHopperJumpEnv(HopperEnv): healthy_state_range, healthy_z_range, healthy_angle_range, reset_noise_scale, exclude_current_positions_from_observation) + # increase initial height + self.init_qpos[1] = 1.5 + def step(self, action): self._steps += 1 - self._floor_geom_id = self.model.geom_name2id('floor') - self._foot_geom_id = self.model.geom_name2id('foot_geom') - self.do_simulation(action, self.frame_skip) height_after = self.get_body_com("torso")[2] @@ -73,18 +79,19 @@ class ALRHopperJumpEnv(HopperEnv): ctrl_cost = self.control_cost(action) costs = ctrl_cost done = False - goal_dist = np.linalg.norm(site_pos_after - np.array([self.goal, 0, 0])) + goal_dist = np.linalg.norm(site_pos_after - np.array([self.goal, 0, 0])) if self.contact_dist is None and self.contact_with_floor: self.contact_dist = goal_dist rewards = 0 - if self._steps >= MAX_EPISODE_STEPS_HOPPERJUMP: - # healthy_reward = 0 if self.context else self.healthy_reward * self._steps - healthy_reward = self.healthy_reward * 2 # * self._steps - contact_dist = self.contact_dist if self.contact_dist is not None else 5 - dist_reward = self._forward_reward_weight * (-3 * goal_dist + 10 * self.max_height - 2 * contact_dist) - rewards = dist_reward + healthy_reward + if not self.sparse or (self.sparse and self._steps >= MAX_EPISODE_STEPS_HOPPERJUMP): + healthy_reward = self.healthy_reward + distance_reward = goal_dist * self._dist_weight + height_reward = (self.max_height if self.sparse else self.get_body_com("torso")[2]) * self._height_weight + contact_reward = (self.contact_dist or 5) * self._contact_weight + # dist_reward = self._forward_reward_weight * (-3 * goal_dist + 10 * self.max_height - 2 * contact_dist) + rewards = self._forward_reward_weight * (distance_reward + height_reward + contact_reward + healthy_reward) observation = self._get_obs() reward = rewards - costs @@ -97,24 +104,40 @@ class ALRHopperJumpEnv(HopperEnv): height_rew=self.max_height, 
healthy_reward=self.healthy_reward * 2, healthy=self.is_healthy, - contact_dist=self.contact_dist if self.contact_dist is not None else 0 + contact_dist=self.contact_dist or 0 ) return observation, reward, done, info def _get_obs(self): - return np.append(super()._get_obs(), self.goal) + goal_dist = self.data.get_site_xpos('foot_site') - np.array([self.goal, 0, 0]) + return np.concatenate((super(HopperJumpEnv, self)._get_obs(), goal_dist.copy(), self.goal.copy())) - def reset(self, *, seed: Optional[int] = None, return_info: bool = False, options: Optional[dict] = None, ): - self.goal = self.np_random.uniform(1.4, 2.16, 1)[0] # 1.3 2.3 + def reset_model(self): + super(HopperJumpEnv, self).reset_model() + + self.goal = self.np_random.uniform(0.3, 1.35, 1)[0] + self.sim.model.body_pos[self.sim.model.body_name2id('goal_site_body')] = np.array([self.goal, 0, 0]) self.max_height = 0 self._steps = 0 - return super().reset() - # overwrite reset_model to make it deterministic - def reset_model(self): + noise_low = -np.zeros(self.model.nq) + noise_low[3] = -0.5 + noise_low[4] = -0.2 + noise_low[5] = 0 - qpos = self.init_qpos # + self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nq) - qvel = self.init_qvel # + self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nv) + noise_high = np.zeros(self.model.nq) + noise_high[3] = 0 + noise_high[4] = 0 + noise_high[5] = 0.785 + + qpos = ( + self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nq) + + self.init_qpos + ) + qvel = ( + # self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nv) + + self.init_qvel + ) self.set_state(qpos, qvel) @@ -123,6 +146,7 @@ class ALRHopperJumpEnv(HopperEnv): self.contact_with_floor = False self.init_floor_contact = False self.contact_dist = None + return observation def _is_floor_foot_contact(self): @@ -137,184 +161,75 @@ class ALRHopperJumpEnv(HopperEnv): return False -class ALRHopperXYJumpEnv(ALRHopperJumpEnv): +class HopperJumpStepEnv(HopperJumpEnv): + + def __init__(self, + xml_file='hopper_jump.xml', + forward_reward_weight=1.0, + ctrl_cost_weight=1e-3, + healthy_reward=1.0, + height_weight=3, + dist_weight=3, + terminate_when_unhealthy=False, + healthy_state_range=(-100.0, 100.0), + healthy_z_range=(0.5, float('inf')), + healthy_angle_range=(-float('inf'), float('inf')), + reset_noise_scale=5e-3, + exclude_current_positions_from_observation=False + ): + + self._height_weight = height_weight + self._dist_weight = dist_weight + super().__init__(xml_file, forward_reward_weight, ctrl_cost_weight, healthy_reward, terminate_when_unhealthy, + healthy_state_range, healthy_z_range, healthy_angle_range, reset_noise_scale, + exclude_current_positions_from_observation) def step(self, action): - self._floor_geom_id = self.model.geom_name2id('floor') - self._foot_geom_id = self.model.geom_name2id('foot_geom') - self._steps += 1 + self.do_simulation(action, self.frame_skip) + height_after = self.get_body_com("torso")[2] - site_pos_after = self.sim.data.site_xpos[self.model.site_name2id('foot_site')].copy() + site_pos_after = self.data.get_site_xpos('foot_site') self.max_height = max(height_after, self.max_height) - # floor_contact = self._contact_checker(self._floor_geom_id, self._foot_geom_id) if not self.contact_with_floor else False - # self.init_floor_contact = floor_contact if not self.init_floor_contact else self.init_floor_contact - # self.has_left_floor = not floor_contact if self.init_floor_contact and not self.has_left_floor else self.has_left_floor 
- # self.contact_with_floor = floor_contact if not self.contact_with_floor and self.has_left_floor else self.contact_with_floor - - floor_contact = self._is_floor_foot_contact(self._floor_geom_id, - self._foot_geom_id) if not self.contact_with_floor else False - if not self.init_floor_contact: - self.init_floor_contact = floor_contact - if self.init_floor_contact and not self.has_left_floor: - self.has_left_floor = not floor_contact - if not self.contact_with_floor and self.has_left_floor: - self.contact_with_floor = floor_contact - - if self.contact_dist is None and self.contact_with_floor: - self.contact_dist = np.linalg.norm(self.sim.data.site_xpos[self.model.site_name2id('foot_site')] - - np.array([self.goal, 0, 0])) - ctrl_cost = self.control_cost(action) + healthy_reward = self.healthy_reward + height_reward = self._height_weight * height_after + goal_dist = np.linalg.norm(site_pos_after - np.array([self.goal, 0, 0])) + goal_dist_reward = -self._dist_weight * goal_dist + dist_reward = self._forward_reward_weight * (goal_dist_reward + height_reward) + + rewards = dist_reward + healthy_reward costs = ctrl_cost done = False - goal_dist = np.linalg.norm(site_pos_after - np.array([self.goal, 0, 0])) - rewards = 0 - if self._steps >= self.max_episode_steps: - # healthy_reward = 0 if self.context else self.healthy_reward * self._steps - healthy_reward = self.healthy_reward * 2 # * self._steps - contact_dist = self.contact_dist if self.contact_dist is not None else 5 - dist_reward = self._forward_reward_weight * (-3 * goal_dist + 10 * self.max_height - 2 * contact_dist) - rewards = dist_reward + healthy_reward + + # This is only for logging the distance to goal when first having the contact + has_floor_contact = self._is_floor_foot_contact() if not self.contact_with_floor else False + + if not self.init_floor_contact: + self.init_floor_contact = has_floor_contact + if self.init_floor_contact and not self.has_left_floor: + self.has_left_floor = not has_floor_contact + if not self.contact_with_floor and self.has_left_floor: + self.contact_with_floor = has_floor_contact + + if self.contact_dist is None and self.contact_with_floor: + self.contact_dist = goal_dist + + ############################################################## observation = self._get_obs() reward = rewards - costs info = { 'height': height_after, 'x_pos': site_pos_after, - 'max_height': self.max_height, - 'goal': self.goal, + 'max_height': copy.copy(self.max_height), + 'goal': copy.copy(self.goal), 'goal_dist': goal_dist, - 'height_rew': self.max_height, - 'healthy_reward': self.healthy_reward * 2, - 'healthy': self.is_healthy, - 'contact_dist': self.contact_dist if self.contact_dist is not None else 0 - } - return observation, reward, done, info - - def reset_model(self): - self.init_qpos[1] = 1.5 - self._floor_geom_id = self.model.geom_name2id('floor') - self._foot_geom_id = self.model.geom_name2id('foot_geom') - noise_low = -np.zeros(self.model.nq) - noise_low[3] = -0.5 - noise_low[4] = -0.2 - noise_low[5] = 0 - - noise_high = np.zeros(self.model.nq) - noise_high[3] = 0 - noise_high[4] = 0 - noise_high[5] = 0.785 - - rnd_vec = self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nq) - qpos = self.init_qpos + rnd_vec - qvel = self.init_qvel - self.set_state(qpos, qvel) - - observation = self._get_obs() - self.has_left_floor = False - self.contact_with_floor = False - self.init_floor_contact = False - self.contact_dist = None - - return observation - - def reset(self): - super().reset() - self.goal = 
self.np_random.uniform(0.3, 1.35, 1)[0] - self.sim.model.body_pos[self.sim.model.body_name2id('goal_site_body')] = np.array([self.goal, 0, 0]) - return self.reset_model() - - def _get_obs(self): - goal_diff = self.sim.data.site_xpos[self.model.site_name2id('foot_site')].copy() \ - - np.array([self.goal, 0, 0]) - return np.concatenate((super(ALRHopperXYJumpEnv, self)._get_obs(), goal_diff)) - - def set_context(self, context): - # context is 4 dimensional - qpos = self.init_qpos - qvel = self.init_qvel - qpos[-3:] = context[:3] - self.goal = context[-1] - self.set_state(qpos, qvel) - self.sim.model.body_pos[self.sim.model.body_name2id('goal_site_body')] = np.array([self.goal, 0, 0]) - return self._get_obs() - - -class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv): - - def __init__( - self, - xml_file='hopper_jump.xml', - forward_reward_weight=1.0, - ctrl_cost_weight=1e-3, - healthy_reward=0.0, - penalty=0.0, - context=True, - terminate_when_unhealthy=False, - healthy_state_range=(-100.0, 100.0), - healthy_z_range=(0.5, float('inf')), - healthy_angle_range=(-float('inf'), float('inf')), - reset_noise_scale=5e-3, - exclude_current_positions_from_observation=False, - max_episode_steps=250, - height_scale=10, - dist_scale=3, - healthy_scale=2 - ): - self.height_scale = height_scale - self.dist_scale = dist_scale - self.healthy_scale = healthy_scale - super().__init__(xml_file, forward_reward_weight, ctrl_cost_weight, healthy_reward, penalty, context, - terminate_when_unhealthy, healthy_state_range, healthy_z_range, healthy_angle_range, - reset_noise_scale, exclude_current_positions_from_observation, max_episode_steps) - - def step(self, action): - self._floor_geom_id = self.model.geom_name2id('floor') - self._foot_geom_id = self.model.geom_name2id('foot_geom') - - self._steps += 1 - self.do_simulation(action, self.frame_skip) - height_after = self.get_body_com("torso")[2] - site_pos_after = self.sim.data.site_xpos[self.model.site_name2id('foot_site')].copy() - self.max_height = max(height_after, self.max_height) - ctrl_cost = self.control_cost(action) - - healthy_reward = self.healthy_reward * self.healthy_scale - height_reward = self.height_scale * height_after - goal_dist = np.atleast_1d(np.linalg.norm(site_pos_after - np.array([self.goal, 0, 0], dtype=object)))[0] - goal_dist_reward = -self.dist_scale * goal_dist - dist_reward = self._forward_reward_weight * (goal_dist_reward + height_reward) - reward = -ctrl_cost + healthy_reward + dist_reward - done = False - observation = self._get_obs() - - ########################################################### - # This is only for logging the distance to goal when first having the contact - ########################################################## - floor_contact = self._is_floor_foot_contact(self._floor_geom_id, - self._foot_geom_id) if not self.contact_with_floor else False - if not self.init_floor_contact: - self.init_floor_contact = floor_contact - if self.init_floor_contact and not self.has_left_floor: - self.has_left_floor = not floor_contact - if not self.contact_with_floor and self.has_left_floor: - self.contact_with_floor = floor_contact - - if self.contact_dist is None and self.contact_with_floor: - self.contact_dist = np.linalg.norm(self.sim.data.site_xpos[self.model.site_name2id('foot_site')] - - np.array([self.goal, 0, 0])) - info = { - 'height': height_after, - 'x_pos': site_pos_after, - 'max_height': self.max_height, - 'goal': self.goal, - 'goal_dist': goal_dist, - 'height_rew': self.max_height, - 'healthy_reward': 
self.healthy_reward * self.healthy_reward, - 'healthy': self.is_healthy, - 'contact_dist': self.contact_dist if self.contact_dist is not None else 0 + 'height_rew': height_reward, + 'healthy_reward': healthy_reward, + 'healthy': copy.copy(self.is_healthy), + 'contact_dist': copy.copy(self.contact_dist) or 0 } return observation, reward, done, info diff --git a/alr_envs/alr/mujoco/hopper_jump/mp_wrapper.py b/alr_envs/alr/mujoco/hopper_jump/mp_wrapper.py index c7b16db..394893e 100644 --- a/alr_envs/alr/mujoco/hopper_jump/mp_wrapper.py +++ b/alr_envs/alr/mujoco/hopper_jump/mp_wrapper.py @@ -8,6 +8,7 @@ from alr_envs.black_box.raw_interface_wrapper import RawInterfaceWrapper class MPWrapper(RawInterfaceWrapper): # Random x goal + random init pos + @property def context_mask(self): return np.hstack([ [False] * (2 + int(not self.exclude_current_positions_from_observation)), # position diff --git a/alr_envs/alr/mujoco/hopper_throw/mp_wrapper.py b/alr_envs/alr/mujoco/hopper_throw/mp_wrapper.py index 7778e8c..78c2f51 100644 --- a/alr_envs/alr/mujoco/hopper_throw/mp_wrapper.py +++ b/alr_envs/alr/mujoco/hopper_throw/mp_wrapper.py @@ -6,8 +6,9 @@ from alr_envs.black_box.raw_interface_wrapper import RawInterfaceWrapper class MPWrapper(RawInterfaceWrapper): + @property - def context_mask(self) -> np.ndarray: + def context_mask(self): return np.hstack([ [False] * 17, [True] # goal pos @@ -15,16 +16,8 @@ class MPWrapper(RawInterfaceWrapper): @property def current_pos(self) -> Union[float, int, np.ndarray]: - return self.env.sim.data.qpos[3:6].copy() + return self.env.data.qpos[3:6].copy() @property def current_vel(self) -> Union[float, int, np.ndarray, Tuple]: - return self.env.sim.data.qvel[3:6].copy() - - @property - def goal_pos(self) -> Union[float, int, np.ndarray, Tuple]: - raise ValueError("Goal position is not available and has to be learnt based on the environment.") - - @property - def dt(self) -> Union[float, int]: - return self.env.dt + return self.env.data.qvel[3:6].copy() diff --git a/alr_envs/alr/mujoco/hopper_throw/new_mp_wrapper.py b/alr_envs/alr/mujoco/hopper_throw/new_mp_wrapper.py deleted file mode 100644 index 01d87a4..0000000 --- a/alr_envs/alr/mujoco/hopper_throw/new_mp_wrapper.py +++ /dev/null @@ -1,25 +0,0 @@ -from typing import Tuple, Union - -import numpy as np - -from alr_envs.black_box.raw_interface_wrapper import RawInterfaceWrapper - - -class MPWrapper(RawInterfaceWrapper): - def context_mask(self): - return np.hstack([ - [False] * 17, - [True] # goal pos - ]) - - @property - def current_pos(self) -> Union[float, int, np.ndarray]: - return self.env.sim.data.qpos[3:6].copy() - - @property - def current_vel(self) -> Union[float, int, np.ndarray, Tuple]: - return self.env.sim.data.qvel[3:6].copy() - - @property - def dt(self) -> Union[float, int]: - return self.env.dt diff --git a/alr_envs/alr/mujoco/reacher/assets/reacher_5links.xml b/alr_envs/alr/mujoco/reacher/assets/reacher_5links.xml index 25a3208..742f45c 100644 --- a/alr_envs/alr/mujoco/reacher/assets/reacher_5links.xml +++ b/alr_envs/alr/mujoco/reacher/assets/reacher_5links.xml @@ -41,7 +41,7 @@
-
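
For reference, a minimal usage sketch of the renamed environment IDs registered above — this is an illustrative assumption, not part of the diff: it presumes that importing `alr_envs` executes the register() calls shown in `alr_envs/alr/__init__.py` and that the gym<0.26 step API (obs, reward, done, info) used throughout the diff applies.

import gym
import alr_envs  # noqa: F401  -- assumed to trigger the register() calls above on import

# 'HopperJumpSparse-v0' is one of the IDs registered in alr_envs/alr/__init__.py
env = gym.make('HopperJumpSparse-v0')
obs = env.reset()
for _ in range(250):  # MAX_EPISODE_STEPS_HOPPERJUMP in the diff
    obs, reward, done, info = env.step(env.action_space.sample())
    if done:
        obs = env.reset()
env.close()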