From 6704c9d63a5bce2a517af4450a004dfcacd2e6d9 Mon Sep 17 00:00:00 2001 From: Fabian Date: Wed, 6 Jul 2022 09:05:35 +0200 Subject: [PATCH] mp wrapper fixes --- alr_envs/alr/__init__.py | 53 ++-- .../base_reacher/base_reacher.py | 9 +- .../hole_reacher/hole_reacher.py | 7 +- .../hole_reacher/mp_wrapper.py | 5 +- .../simple_reacher/mp_wrapper.py | 5 +- .../simple_reacher/simple_reacher.py | 8 +- .../viapoint_reacher/mp_wrapper.py | 5 +- .../viapoint_reacher/viapoint_reacher.py | 8 +- alr_envs/alr/mujoco/__init__.py | 2 +- alr_envs/alr/mujoco/ant_jump/__init__.py | 2 +- alr_envs/alr/mujoco/ant_jump/ant_jump.py | 38 +-- alr_envs/alr/mujoco/ant_jump/mp_wrapper.py | 16 +- .../alr/mujoco/ant_jump/new_mp_wrapper.py | 22 -- .../{ => ball_in_a_cup}/alr_reward_fct.py | 0 .../ball_in_a_cup/ball_in_a_cup_reward.py | 2 +- .../ball_in_a_cup_reward_simple.py | 2 +- alr_envs/alr/mujoco/ball_in_a_cup/biac_pd.py | 11 - alr_envs/alr/mujoco/beerpong/__init__.py | 2 +- alr_envs/alr/mujoco/beerpong/beerpong.py | 5 +- alr_envs/alr/mujoco/beerpong/mp_wrapper.py | 38 +-- .../alr/mujoco/beerpong/new_mp_wrapper.py | 41 --- .../alr/mujoco/half_cheetah_jump/__init__.py | 2 +- .../half_cheetah_jump/half_cheetah_jump.py | 34 +- .../mujoco/half_cheetah_jump/mp_wrapper.py | 10 +- .../half_cheetah_jump/new_mp_wrapper.py | 22 -- alr_envs/alr/mujoco/hopper_jump/__init__.py | 1 + .../alr/mujoco/hopper_jump/hopper_jump.py | 293 +++++++----------- alr_envs/alr/mujoco/hopper_jump/mp_wrapper.py | 1 + .../alr/mujoco/hopper_throw/mp_wrapper.py | 15 +- .../alr/mujoco/hopper_throw/new_mp_wrapper.py | 25 -- .../mujoco/reacher/assets/reacher_5links.xml | 2 +- alr_envs/alr/mujoco/reacher/mp_wrapper.py | 7 +- alr_envs/alr/mujoco/reacher/reacher.py | 4 +- .../alr/mujoco/walker_2d_jump/__init__.py | 2 +- .../alr/mujoco/walker_2d_jump/mp_wrapper.py | 17 +- .../mujoco/walker_2d_jump/new_mp_wrapper.py | 26 -- alr_envs/black_box/black_box_wrapper.py | 71 +++-- alr_envs/black_box/raw_interface_wrapper.py | 20 +- alr_envs/examples/examples_general.py | 4 +- .../examples/examples_movement_primitives.py | 33 +- alr_envs/utils/make_env_helpers.py | 8 +- alr_envs/utils/utils.py | 3 +- setup.py | 29 +- 43 files changed, 302 insertions(+), 608 deletions(-) delete mode 100644 alr_envs/alr/mujoco/ant_jump/new_mp_wrapper.py rename alr_envs/alr/mujoco/{ => ball_in_a_cup}/alr_reward_fct.py (100%) delete mode 100644 alr_envs/alr/mujoco/beerpong/new_mp_wrapper.py delete mode 100644 alr_envs/alr/mujoco/half_cheetah_jump/new_mp_wrapper.py delete mode 100644 alr_envs/alr/mujoco/hopper_throw/new_mp_wrapper.py delete mode 100644 alr_envs/alr/mujoco/walker_2d_jump/new_mp_wrapper.py diff --git a/alr_envs/alr/__init__.py b/alr_envs/alr/__init__.py index 1bcb02e..cdff3a1 100644 --- a/alr_envs/alr/__init__.py +++ b/alr_envs/alr/__init__.py @@ -118,33 +118,30 @@ for _dims in [5, 7]: } ) -## Hopper Jump random joints and desired position register( id='HopperJumpSparse-v0', - entry_point='alr_envs.alr.mujoco:ALRHopperXYJumpEnv', + entry_point='alr_envs.alr.mujoco:HopperJumpEnv', max_episode_steps=MAX_EPISODE_STEPS_HOPPERJUMP, kwargs={ - # "max_episode_steps": MAX_EPISODE_STEPS_HOPPERJUMP, "sparse": True, - # "healthy_reward": 1.0 } ) -## Hopper Jump random joints and desired position step based reward register( id='HopperJump-v0', - entry_point='alr_envs.alr.mujoco:ALRHopperXYJumpEnvStepBased', + entry_point='alr_envs.alr.mujoco:HopperJumpEnv', max_episode_steps=MAX_EPISODE_STEPS_HOPPERJUMP, kwargs={ - # "max_episode_steps": MAX_EPISODE_STEPS_HOPPERJUMP, "sparse": False, - 
# "healthy_reward": 1.0 + "healthy_reward": 1.0, + "contact_weight": 0.0, + "height_weight": 3.0, } ) register( id='ALRAntJump-v0', - entry_point='alr_envs.alr.mujoco:ALRAntJumpEnv', + entry_point='alr_envs.alr.mujoco:AntJumpEnv', max_episode_steps=MAX_EPISODE_STEPS_ANTJUMP, kwargs={ "max_episode_steps": MAX_EPISODE_STEPS_ANTJUMP, @@ -266,7 +263,7 @@ for _v in _versions: entry_point='alr_envs.utils.make_env_helpers:make_dmp_env_helper', # max_episode_steps=1, kwargs={ - "name": f"alr_envs:{_v}", + "name": f"{_v}", "wrappers": [classic_control.simple_reacher.MPWrapper], "traj_gen_kwargs": { "num_dof": 2 if "long" not in _v.lower() else 5, @@ -304,7 +301,7 @@ register( entry_point='alr_envs.utils.make_env_helpers:make_dmp_env_helper', # max_episode_steps=1, kwargs={ - "name": "alr_envs:ViaPointReacher-v0", + "name": "ViaPointReacher-v0", "wrappers": [classic_control.viapoint_reacher.MPWrapper], "traj_gen_kwargs": { "num_dof": 5, @@ -340,7 +337,7 @@ for _v in _versions: entry_point='alr_envs.utils.make_env_helpers:make_dmp_env_helper', # max_episode_steps=1, kwargs={ - "name": f"alr_envs:HoleReacher-{_v}", + "name": f"HoleReacher-{_v}", "wrappers": [classic_control.hole_reacher.MPWrapper], "traj_gen_kwargs": { "num_dof": 5, @@ -362,7 +359,7 @@ for _v in _versions: kwargs_dict_hole_reacher_promp['wrappers'].append(classic_control.hole_reacher.MPWrapper) kwargs_dict_hole_reacher_promp['trajectory_generator_kwargs']['weight_scale'] = 2 kwargs_dict_hole_reacher_promp['controller_kwargs']['controller_type'] = 'velocity' - kwargs_dict_hole_reacher_promp['name'] = f"alr_envs:{_v}" + kwargs_dict_hole_reacher_promp['name'] = f"{_v}" register( id=_env_id, entry_point='alr_envs.utils.make_env_helpers:make_promp_env_helper', @@ -370,8 +367,8 @@ for _v in _versions: ) ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append(_env_id) -## ALRReacher -_versions = ["ALRReacher-v0", "ALRLongReacher-v0", "ALRReacherSparse-v0", "ALRLongReacherSparse-v0"] +## ReacherNd +_versions = ["Reacher5d-v0", "Reacher7d-v0", "Reacher5dSparse-v0", "Reacher7dSparse-v0"] for _v in _versions: _name = _v.split("-") _env_id = f'{_name[0]}DMP-{_name[1]}' @@ -380,7 +377,7 @@ for _v in _versions: entry_point='alr_envs.utils.make_env_helpers:make_dmp_env_helper', # max_episode_steps=1, kwargs={ - "name": f"alr_envs:{_v}", + "name": f"{_v}", "wrappers": [mujoco.reacher.MPWrapper], "traj_gen_kwargs": { "num_dof": 5 if "long" not in _v.lower() else 7, @@ -404,10 +401,10 @@ for _v in _versions: kwargs_dict_alr_reacher_promp['wrappers'].append(mujoco.reacher.MPWrapper) kwargs_dict_alr_reacher_promp['controller_kwargs']['p_gains'] = 1 kwargs_dict_alr_reacher_promp['controller_kwargs']['d_gains'] = 0.1 - kwargs_dict_alr_reacher_promp['name'] = f"alr_envs:{_v}" + kwargs_dict_alr_reacher_promp['name'] = f"{_v}" register( id=_env_id, - entry_point='alr_envs.utils.make_env_helpers:make_promp_env_helper', + entry_point='alr_envs.utils.make_env_helpers:make_bb_env_helper', kwargs=kwargs_dict_alr_reacher_promp ) ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append(_env_id) @@ -425,7 +422,7 @@ for _v in _versions: kwargs_dict_bp_promp['controller_kwargs']['d_gains'] = np.array([0.02333333, 0.1, 0.0625, 0.08, 0.03, 0.03, 0.0125]) kwargs_dict_bp_promp['basis_generator_kwargs']['num_basis'] = 2 kwargs_dict_bp_promp['basis_generator_kwargs']['num_basis_zero_start'] = 2 - kwargs_dict_bp_promp['name'] = f"alr_envs:{_v}" + kwargs_dict_bp_promp['name'] = f"{_v}" register( id=_env_id, entry_point='alr_envs.utils.make_env_helpers:make_bb_env_helper', @@ -445,7 
+442,7 @@ for _v in _versions: kwargs_dict_bp_promp['controller_kwargs']['d_gains'] = np.array([0.02333333, 0.1, 0.0625, 0.08, 0.03, 0.03, 0.0125]) kwargs_dict_bp_promp['basis_generator_kwargs']['num_basis'] = 2 kwargs_dict_bp_promp['basis_generator_kwargs']['num_basis_zero_start'] = 2 - kwargs_dict_bp_promp['name'] = f"alr_envs:{_v}" + kwargs_dict_bp_promp['name'] = f"{_v}" register( id=_env_id, entry_point='alr_envs.utils.make_env_helpers:make_mp_env_helper', @@ -465,7 +462,7 @@ for _v in _versions: _env_id = f'{_name[0]}ProMP-{_name[1]}' kwargs_dict_ant_jump_promp = deepcopy(DEFAULT_BB_DICT) kwargs_dict_ant_jump_promp['wrappers'].append(mujoco.ant_jump.MPWrapper) - kwargs_dict_ant_jump_promp['name'] = f"alr_envs:{_v}" + kwargs_dict_ant_jump_promp['name'] = f"{_v}" register( id=_env_id, entry_point='alr_envs.utils.make_env_helpers:make_mp_env_helper', @@ -482,7 +479,7 @@ for _v in _versions: _env_id = f'{_name[0]}ProMP-{_name[1]}' kwargs_dict_halfcheetah_jump_promp = deepcopy(DEFAULT_BB_DICT) kwargs_dict_halfcheetah_jump_promp['wrappers'].append(mujoco.half_cheetah_jump.MPWrapper) - kwargs_dict_halfcheetah_jump_promp['name'] = f"alr_envs:{_v}" + kwargs_dict_halfcheetah_jump_promp['name'] = f"{_v}" register( id=_env_id, entry_point='alr_envs.utils.make_env_helpers:make_promp_env_helper', @@ -502,7 +499,7 @@ for _v in _versions: _env_id = f'{_name[0]}ProMP-{_name[1]}' kwargs_dict_hopper_jump_promp = deepcopy(DEFAULT_BB_DICT) kwargs_dict_hopper_jump_promp['wrappers'].append(mujoco.hopper_jump.MPWrapper) - kwargs_dict_hopper_jump_promp['name'] = f"alr_envs:{_v}" + kwargs_dict_hopper_jump_promp['name'] = f"{_v}" register( id=_env_id, entry_point='alr_envs.utils.make_env_helpers:make_mp_env_helper', @@ -520,7 +517,7 @@ for _v in _versions: _env_id = f'{_name[0]}ProMP-{_name[1]}' kwargs_dict_walker2d_jump_promp = deepcopy(DEFAULT_BB_DICT) kwargs_dict_walker2d_jump_promp['wrappers'].append(mujoco.walker_2d_jump.MPWrapper) - kwargs_dict_walker2d_jump_promp['name'] = f"alr_envs:{_v}" + kwargs_dict_walker2d_jump_promp['name'] = f"{_v}" register( id=_env_id, entry_point='alr_envs.utils.make_env_helpers:make_promp_env_helper', @@ -583,7 +580,7 @@ register( # CtxtFree are v0, Contextual are v1 register( id='ALRAntJump-v0', - entry_point='alr_envs.alr.mujoco:ALRAntJumpEnv', + entry_point='alr_envs.alr.mujoco:AntJumpEnv', max_episode_steps=MAX_EPISODE_STEPS_ANTJUMP, kwargs={ "max_episode_steps": MAX_EPISODE_STEPS_ANTJUMP, @@ -602,7 +599,7 @@ register( ) register( id='ALRHopperJump-v0', - entry_point='alr_envs.alr.mujoco:ALRHopperJumpEnv', + entry_point='alr_envs.alr.mujoco:HopperJumpEnv', max_episode_steps=MAX_EPISODE_STEPS_HOPPERJUMP, kwargs={ "max_episode_steps": MAX_EPISODE_STEPS_HOPPERJUMP, @@ -649,7 +646,7 @@ for i in _vs: id=_env_id, entry_point='alr_envs.utils.make_env_helpers:make_promp_env_helper', kwargs={ - "name": f"alr_envs:{_env_id.replace('ProMP', '')}", + "name": f"{_env_id.replace('ProMP', '')}", "wrappers": [mujoco.reacher.MPWrapper], "mp_kwargs": { "num_dof": 5, @@ -672,7 +669,7 @@ for i in _vs: id=_env_id, entry_point='alr_envs.utils.make_env_helpers:make_promp_env_helper', kwargs={ - "name": f"alr_envs:{_env_id.replace('ProMP', '')}", + "name": f"{_env_id.replace('ProMP', '')}", "wrappers": [mujoco.reacher.MPWrapper], "mp_kwargs": { "num_dof": 5, diff --git a/alr_envs/alr/classic_control/base_reacher/base_reacher.py b/alr_envs/alr/classic_control/base_reacher/base_reacher.py index e76ce85..1af8187 100644 --- a/alr_envs/alr/classic_control/base_reacher/base_reacher.py +++ 
b/alr_envs/alr/classic_control/base_reacher/base_reacher.py @@ -1,9 +1,10 @@ from abc import ABC, abstractmethod -from typing import Union +from typing import Union, Tuple, Optional import gym import numpy as np from gym import spaces +from gym.core import ObsType from gym.utils import seeding from alr_envs.alr.classic_control.utils import intersect @@ -14,8 +15,7 @@ class BaseReacherEnv(gym.Env, ABC): Base class for all reaching environments. """ - def __init__(self, n_links: int, random_start: bool = True, - allow_self_collision: bool = False): + def __init__(self, n_links: int, random_start: bool = True, allow_self_collision: bool = False): super().__init__() self.link_lengths = np.ones(n_links) self.n_links = n_links @@ -70,7 +70,8 @@ class BaseReacherEnv(gym.Env, ABC): def current_vel(self): return self._angle_velocity.copy() - def reset(self): + def reset(self, *, seed: Optional[int] = None, return_info: bool = False, + options: Optional[dict] = None, ) -> Union[ObsType, Tuple[ObsType, dict]]: # Sample only orientation of first link, i.e. the arm is always straight. if self.random_start: first_joint = self.np_random.uniform(np.pi / 4, 3 * np.pi / 4) diff --git a/alr_envs/alr/classic_control/hole_reacher/hole_reacher.py b/alr_envs/alr/classic_control/hole_reacher/hole_reacher.py index 883be8c..8f0122f 100644 --- a/alr_envs/alr/classic_control/hole_reacher/hole_reacher.py +++ b/alr_envs/alr/classic_control/hole_reacher/hole_reacher.py @@ -1,8 +1,9 @@ -from typing import Union +from typing import Union, Optional, Tuple import gym import matplotlib.pyplot as plt import numpy as np +from gym.core import ObsType from matplotlib import patches from alr_envs.alr.classic_control.base_reacher.base_reacher_direct import BaseReacherDirectEnv @@ -51,7 +52,8 @@ class HoleReacherEnv(BaseReacherDirectEnv): else: raise ValueError("Unknown reward function {}".format(rew_fct)) - def reset(self): + def reset(self, *, seed: Optional[int] = None, return_info: bool = False, + options: Optional[dict] = None, ) -> Union[ObsType, Tuple[ObsType, dict]]: self._generate_hole() self._set_patches() self.reward_function.reset() @@ -223,6 +225,7 @@ class HoleReacherEnv(BaseReacherDirectEnv): if __name__ == "__main__": import time + env = HoleReacherEnv(5) env.reset() diff --git a/alr_envs/alr/classic_control/hole_reacher/mp_wrapper.py b/alr_envs/alr/classic_control/hole_reacher/mp_wrapper.py index 0a5eaa6..00a5eb8 100644 --- a/alr_envs/alr/classic_control/hole_reacher/mp_wrapper.py +++ b/alr_envs/alr/classic_control/hole_reacher/mp_wrapper.py @@ -7,6 +7,7 @@ from alr_envs.black_box.raw_interface_wrapper import RawInterfaceWrapper class MPWrapper(RawInterfaceWrapper): + @property def context_mask(self): return np.hstack([ [self.env.random_start] * self.env.n_links, # cos @@ -25,7 +26,3 @@ class MPWrapper(RawInterfaceWrapper): @property def current_vel(self) -> Union[float, int, np.ndarray, Tuple]: return self.env.current_vel - - @property - def dt(self) -> Union[float, int]: - return self.env.dt diff --git a/alr_envs/alr/classic_control/simple_reacher/mp_wrapper.py b/alr_envs/alr/classic_control/simple_reacher/mp_wrapper.py index c5ef66f..3fa12d8 100644 --- a/alr_envs/alr/classic_control/simple_reacher/mp_wrapper.py +++ b/alr_envs/alr/classic_control/simple_reacher/mp_wrapper.py @@ -7,6 +7,7 @@ from alr_envs.black_box.raw_interface_wrapper import RawInterfaceWrapper class MPWrapper(RawInterfaceWrapper): + @property def context_mask(self): return np.hstack([ [self.env.random_start] * self.env.n_links, # cos @@ -23,7 
+24,3 @@ class MPWrapper(RawInterfaceWrapper): @property def current_vel(self) -> Union[float, int, np.ndarray, Tuple]: return self.env.current_vel - - @property - def dt(self) -> Union[float, int]: - return self.env.dt diff --git a/alr_envs/alr/classic_control/simple_reacher/simple_reacher.py b/alr_envs/alr/classic_control/simple_reacher/simple_reacher.py index dac06a3..eb079d0 100644 --- a/alr_envs/alr/classic_control/simple_reacher/simple_reacher.py +++ b/alr_envs/alr/classic_control/simple_reacher/simple_reacher.py @@ -1,8 +1,9 @@ -from typing import Iterable, Union +from typing import Iterable, Union, Optional, Tuple import matplotlib.pyplot as plt import numpy as np from gym import spaces +from gym.core import ObsType from alr_envs.alr.classic_control.base_reacher.base_reacher_torque import BaseReacherTorqueEnv @@ -15,7 +16,7 @@ class SimpleReacherEnv(BaseReacherTorqueEnv): """ def __init__(self, n_links: int, target: Union[None, Iterable] = None, random_start: bool = True, - allow_self_collision: bool = False,): + allow_self_collision: bool = False, ): super().__init__(n_links, random_start, allow_self_collision) # provided initial parameters @@ -41,7 +42,8 @@ class SimpleReacherEnv(BaseReacherTorqueEnv): # def start_pos(self): # return self._start_pos - def reset(self): + def reset(self, *, seed: Optional[int] = None, return_info: bool = False, + options: Optional[dict] = None, ) -> Union[ObsType, Tuple[ObsType, dict]]: self._generate_goal() return super().reset() diff --git a/alr_envs/alr/classic_control/viapoint_reacher/mp_wrapper.py b/alr_envs/alr/classic_control/viapoint_reacher/mp_wrapper.py index 2b210de..d152f3b 100644 --- a/alr_envs/alr/classic_control/viapoint_reacher/mp_wrapper.py +++ b/alr_envs/alr/classic_control/viapoint_reacher/mp_wrapper.py @@ -7,6 +7,7 @@ from alr_envs.black_box.raw_interface_wrapper import RawInterfaceWrapper class MPWrapper(RawInterfaceWrapper): + @property def context_mask(self): return np.hstack([ [self.env.random_start] * self.env.n_links, # cos @@ -24,7 +25,3 @@ class MPWrapper(RawInterfaceWrapper): @property def current_vel(self) -> Union[float, int, np.ndarray, Tuple]: return self.env.current_vel - - @property - def dt(self) -> Union[float, int]: - return self.env.dt diff --git a/alr_envs/alr/classic_control/viapoint_reacher/viapoint_reacher.py b/alr_envs/alr/classic_control/viapoint_reacher/viapoint_reacher.py index 292e40a..569ca2c 100644 --- a/alr_envs/alr/classic_control/viapoint_reacher/viapoint_reacher.py +++ b/alr_envs/alr/classic_control/viapoint_reacher/viapoint_reacher.py @@ -1,8 +1,9 @@ -from typing import Iterable, Union +from typing import Iterable, Union, Tuple, Optional import gym import matplotlib.pyplot as plt import numpy as np +from gym.core import ObsType from gym.utils import seeding from alr_envs.alr.classic_control.base_reacher.base_reacher_direct import BaseReacherDirectEnv @@ -40,7 +41,8 @@ class ViaPointReacherEnv(BaseReacherDirectEnv): # def start_pos(self): # return self._start_pos - def reset(self): + def reset(self, *, seed: Optional[int] = None, return_info: bool = False, + options: Optional[dict] = None, ) -> Union[ObsType, Tuple[ObsType, dict]]: self._generate_goal() return super().reset() @@ -183,8 +185,10 @@ class ViaPointReacherEnv(BaseReacherDirectEnv): plt.pause(0.01) + if __name__ == "__main__": import time + env = ViaPointReacherEnv(5) env.reset() diff --git a/alr_envs/alr/mujoco/__init__.py b/alr_envs/alr/mujoco/__init__.py index f880578..906a9a5 100644 --- a/alr_envs/alr/mujoco/__init__.py +++ 
b/alr_envs/alr/mujoco/__init__.py @@ -1,4 +1,4 @@ -from .ant_jump.ant_jump import ALRAntJumpEnv +from .ant_jump.ant_jump import AntJumpEnv from .ball_in_a_cup.ball_in_a_cup import ALRBallInACupEnv from .ball_in_a_cup.biac_pd import ALRBallInACupPDEnv from alr_envs.alr.mujoco.beerpong.beerpong import BeerPongEnv diff --git a/alr_envs/alr/mujoco/ant_jump/__init__.py b/alr_envs/alr/mujoco/ant_jump/__init__.py index 8a04a02..c5e6d2f 100644 --- a/alr_envs/alr/mujoco/ant_jump/__init__.py +++ b/alr_envs/alr/mujoco/ant_jump/__init__.py @@ -1 +1 @@ -from .new_mp_wrapper import MPWrapper +from .mp_wrapper import MPWrapper diff --git a/alr_envs/alr/mujoco/ant_jump/ant_jump.py b/alr_envs/alr/mujoco/ant_jump/ant_jump.py index 27098ac..deffcfa 100644 --- a/alr_envs/alr/mujoco/ant_jump/ant_jump.py +++ b/alr_envs/alr/mujoco/ant_jump/ant_jump.py @@ -1,14 +1,19 @@ +from typing import Tuple, Union, Optional + import numpy as np +from gym.core import ObsType from gym.envs.mujoco.ant_v3 import AntEnv MAX_EPISODE_STEPS_ANTJUMP = 200 -# TODO: This environment was not testet yet. Do the following todos and test it. + + +# TODO: This environment was not tested yet. Do the following todos and test it. # TODO: Right now this environment only considers jumping to a specific height, which is not nice. It should be extended # to the same structure as the Hopper, where the angles are randomized (->contexts) and the agent should jump as heigh # as possible, while landing at a specific target position -class ALRAntJumpEnv(AntEnv): +class AntJumpEnv(AntEnv): """ Initialization changes to normal Ant: - healthy_reward: 1.0 -> 0.01 -> 0.0 no healthy reward needed - Paul and Marc @@ -27,17 +32,15 @@ class ALRAntJumpEnv(AntEnv): contact_force_range=(-1.0, 1.0), reset_noise_scale=0.1, exclude_current_positions_from_observation=True, - max_episode_steps=200): + ): self.current_step = 0 self.max_height = 0 - self.max_episode_steps = max_episode_steps self.goal = 0 super().__init__(xml_file, ctrl_cost_weight, contact_cost_weight, healthy_reward, terminate_when_unhealthy, healthy_z_range, contact_force_range, reset_noise_scale, exclude_current_positions_from_observation) def step(self, action): - self.current_step += 1 self.do_simulation(action, self.frame_skip) @@ -52,12 +55,12 @@ class ALRAntJumpEnv(AntEnv): costs = ctrl_cost + contact_cost - done = height < 0.3 # fall over -> is the 0.3 value from healthy_z_range? TODO change 0.3 to the value of healthy z angle + done = height < 0.3 # fall over -> is the 0.3 value from healthy_z_range? 
TODO change 0.3 to the value of healthy z angle - if self.current_step == self.max_episode_steps or done: + if self.current_step == MAX_EPISODE_STEPS_ANTJUMP or done: # -10 for scaling the value of the distance between the max_height and the goal height; only used when context is enabled # height_reward = -10 * (np.linalg.norm(self.max_height - self.goal)) - height_reward = -10*np.linalg.norm(self.max_height - self.goal) + height_reward = -10 * np.linalg.norm(self.max_height - self.goal) # no healthy reward when using context, because we optimize a negative value healthy_reward = 0 @@ -77,7 +80,8 @@ class ALRAntJumpEnv(AntEnv): def _get_obs(self): return np.append(super()._get_obs(), self.goal) - def reset(self): + def reset(self, *, seed: Optional[int] = None, return_info: bool = False, + options: Optional[dict] = None, ) -> Union[ObsType, Tuple[ObsType, dict]]: self.current_step = 0 self.max_height = 0 self.goal = np.random.uniform(1.0, 2.5, @@ -96,19 +100,3 @@ class ALRAntJumpEnv(AntEnv): observation = self._get_obs() return observation - -if __name__ == '__main__': - render_mode = "human" # "human" or "partial" or "final" - env = ALRAntJumpEnv() - obs = env.reset() - - for i in range(2000): - # test with random actions - ac = env.action_space.sample() - obs, rew, d, info = env.step(ac) - if i % 10 == 0: - env.render(mode=render_mode) - if d: - env.reset() - - env.close() diff --git a/alr_envs/alr/mujoco/ant_jump/mp_wrapper.py b/alr_envs/alr/mujoco/ant_jump/mp_wrapper.py index f6e99a3..a481d6f 100644 --- a/alr_envs/alr/mujoco/ant_jump/mp_wrapper.py +++ b/alr_envs/alr/mujoco/ant_jump/mp_wrapper.py @@ -1,4 +1,4 @@ -from typing import Tuple, Union +from typing import Union, Tuple import numpy as np @@ -8,10 +8,10 @@ from alr_envs.black_box.raw_interface_wrapper import RawInterfaceWrapper class MPWrapper(RawInterfaceWrapper): @property - def context_mask(self) -> np.ndarray: + def context_mask(self): return np.hstack([ - [False] * 111, # ant has 111 dimensional observation space !! - [True] # goal height + [False] * 111, # ant has 111 dimensional observation space !! + [True] # goal height ]) @property @@ -21,11 +21,3 @@ class MPWrapper(RawInterfaceWrapper): @property def current_vel(self) -> Union[float, int, np.ndarray, Tuple]: return self.env.sim.data.qvel[6:14].copy() - - @property - def goal_pos(self) -> Union[float, int, np.ndarray, Tuple]: - raise ValueError("Goal position is not available and has to be learnt based on the environment.") - - @property - def dt(self) -> Union[float, int]: - return self.env.dt diff --git a/alr_envs/alr/mujoco/ant_jump/new_mp_wrapper.py b/alr_envs/alr/mujoco/ant_jump/new_mp_wrapper.py deleted file mode 100644 index f6f026b..0000000 --- a/alr_envs/alr/mujoco/ant_jump/new_mp_wrapper.py +++ /dev/null @@ -1,22 +0,0 @@ -from alr_envs.black_box.black_box_wrapper import BlackBoxWrapper -from typing import Union, Tuple -import numpy as np - -from alr_envs.black_box.raw_interface_wrapper import RawInterfaceWrapper - - -class MPWrapper(RawInterfaceWrapper): - - def get_context_mask(self): - return np.hstack([ - [False] * 111, # ant has 111 dimensional observation space !! 
- [True] # goal height - ]) - - @property - def current_pos(self) -> Union[float, int, np.ndarray]: - return self.env.sim.data.qpos[7:15].copy() - - @property - def current_vel(self) -> Union[float, int, np.ndarray, Tuple]: - return self.env.sim.data.qvel[6:14].copy() diff --git a/alr_envs/alr/mujoco/alr_reward_fct.py b/alr_envs/alr/mujoco/ball_in_a_cup/alr_reward_fct.py similarity index 100% rename from alr_envs/alr/mujoco/alr_reward_fct.py rename to alr_envs/alr/mujoco/ball_in_a_cup/alr_reward_fct.py diff --git a/alr_envs/alr/mujoco/ball_in_a_cup/ball_in_a_cup_reward.py b/alr_envs/alr/mujoco/ball_in_a_cup/ball_in_a_cup_reward.py index 4ea4381..b4d0e6e 100644 --- a/alr_envs/alr/mujoco/ball_in_a_cup/ball_in_a_cup_reward.py +++ b/alr_envs/alr/mujoco/ball_in_a_cup/ball_in_a_cup_reward.py @@ -1,5 +1,5 @@ import numpy as np -from alr_envs.alr.mujoco import alr_reward_fct +from alr_envs.alr.mujoco.ball_in_a_cup import alr_reward_fct class BallInACupReward(alr_reward_fct.AlrReward): diff --git a/alr_envs/alr/mujoco/ball_in_a_cup/ball_in_a_cup_reward_simple.py b/alr_envs/alr/mujoco/ball_in_a_cup/ball_in_a_cup_reward_simple.py index 0762e22..8044eb8 100644 --- a/alr_envs/alr/mujoco/ball_in_a_cup/ball_in_a_cup_reward_simple.py +++ b/alr_envs/alr/mujoco/ball_in_a_cup/ball_in_a_cup_reward_simple.py @@ -1,5 +1,5 @@ import numpy as np -from alr_envs.alr.mujoco import alr_reward_fct +from alr_envs.alr.mujoco.ball_in_a_cup import alr_reward_fct class BallInACupReward(alr_reward_fct.AlrReward): diff --git a/alr_envs/alr/mujoco/ball_in_a_cup/biac_pd.py b/alr_envs/alr/mujoco/ball_in_a_cup/biac_pd.py index b047c54..4abfb6c 100644 --- a/alr_envs/alr/mujoco/ball_in_a_cup/biac_pd.py +++ b/alr_envs/alr/mujoco/ball_in_a_cup/biac_pd.py @@ -6,17 +6,6 @@ import mujoco_py.builder import numpy as np from gym import utils -from mp_env_api.mp_wrappers.detpmp_wrapper import DetPMPWrapper -from mp_env_api.utils.policies import PDControllerExtend - - -def make_detpmp_env(**kwargs): - name = kwargs.pop("name") - _env = gym.make(name) - policy = PDControllerExtend(_env, p_gains=kwargs.pop('p_gains'), d_gains=kwargs.pop('d_gains')) - kwargs['policy_type'] = policy - return DetPMPWrapper(_env, **kwargs) - class ALRBallInACupPDEnv(mujoco_env.MujocoEnv, utils.EzPickle): def __init__(self, frame_skip=4, apply_gravity_comp=True, simplified: bool = False, diff --git a/alr_envs/alr/mujoco/beerpong/__init__.py b/alr_envs/alr/mujoco/beerpong/__init__.py index 8a04a02..c5e6d2f 100644 --- a/alr_envs/alr/mujoco/beerpong/__init__.py +++ b/alr_envs/alr/mujoco/beerpong/__init__.py @@ -1 +1 @@ -from .new_mp_wrapper import MPWrapper +from .mp_wrapper import MPWrapper diff --git a/alr_envs/alr/mujoco/beerpong/beerpong.py b/alr_envs/alr/mujoco/beerpong/beerpong.py index 9adab60..fd820eb 100644 --- a/alr_envs/alr/mujoco/beerpong/beerpong.py +++ b/alr_envs/alr/mujoco/beerpong/beerpong.py @@ -1,12 +1,11 @@ import os +from typing import Optional import mujoco_py.builder import numpy as np from gym import utils from gym.envs.mujoco import MujocoEnv -from alr_envs.alr.mujoco.beerpong.deprecated.beerpong_reward_staged import BeerPongReward - # XML Variables ROBOT_COLLISION_OBJ = ["wrist_palm_link_convex_geom", "wrist_pitch_link_convex_decomposition_p1_geom", @@ -76,7 +75,7 @@ class BeerPongEnv(MujocoEnv, utils.EzPickle): def start_vel(self): return self._start_vel - def reset(self): + def reset(self, *, seed: Optional[int] = None, return_info: bool = False, options: Optional[dict] = None): self.dists = [] self.dists_final = [] self.action_costs = [] 
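The reset() change above (mirrored in the reacher and jump environments earlier in this patch) moves BeerPongEnv to the newer Gym reset API, which threads seed, return_info, and options through keyword-only arguments. A minimal sketch of that pattern, assuming gym >= 0.22 (where gym.Env.reset accepts these arguments and seeds np_random); the ToyEnv class is purely illustrative and not part of alr_envs:

from typing import Optional, Tuple, Union

import gym
import numpy as np
from gym import spaces
from gym.core import ObsType


class ToyEnv(gym.Env):
    # Illustrative environment, not part of alr_envs.
    observation_space = spaces.Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float64)
    action_space = spaces.Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float64)

    def reset(self, *, seed: Optional[int] = None, return_info: bool = False,
              options: Optional[dict] = None) -> Union[ObsType, Tuple[ObsType, dict]]:
        super().reset(seed=seed)  # seeds self.np_random (gym >= 0.22 behaviour)
        obs = self.observation_space.sample()
        return (obs, {}) if return_info else obs

    def step(self, action):
        return self.observation_space.sample(), 0.0, False, {}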
diff --git a/alr_envs/alr/mujoco/beerpong/mp_wrapper.py b/alr_envs/alr/mujoco/beerpong/mp_wrapper.py index 20e7532..e69d4f9 100644 --- a/alr_envs/alr/mujoco/beerpong/mp_wrapper.py +++ b/alr_envs/alr/mujoco/beerpong/mp_wrapper.py @@ -1,4 +1,4 @@ -from typing import Tuple, Union +from typing import Union, Tuple import numpy as np @@ -7,34 +7,36 @@ from alr_envs.black_box.raw_interface_wrapper import RawInterfaceWrapper class MPWrapper(RawInterfaceWrapper): - @property - def context_mask(self) -> np.ndarray: + def get_context_mask(self): return np.hstack([ [False] * 7, # cos [False] * 7, # sin - [False] * 7, # joint velocities - [False] * 3, # cup_goal_diff_final - [False] * 3, # cup_goal_diff_top + [False] * 7, # joint velocities + [False] * 3, # cup_goal_diff_final + [False] * 3, # cup_goal_diff_top [True] * 2, # xy position of cup [False] # env steps ]) - @property - def start_pos(self): - return self._start_pos - @property def current_pos(self) -> Union[float, int, np.ndarray, Tuple]: - return self.sim.data.qpos[0:7].copy() + return self.env.sim.data.qpos[0:7].copy() @property def current_vel(self) -> Union[float, int, np.ndarray, Tuple]: - return self.sim.data.qvel[0:7].copy() + return self.env.sim.data.qvel[0:7].copy() - @property - def goal_pos(self): - raise ValueError("Goal position is not available and has to be learnt based on the environment.") + # TODO: Fix this + def _episode_callback(self, action: np.ndarray, mp) -> Tuple[np.ndarray, Union[np.ndarray, None]]: + if mp.learn_tau: + self.env.env.release_step = action[0] / self.env.dt # Tau value + return action, None + else: + return action, None - @property - def dt(self) -> Union[float, int]: - return self.env.dt + def set_context(self, context): + xyz = np.zeros(3) + xyz[:2] = context + xyz[-1] = 0.840 + self.env.env.model.body_pos[self.env.env.cup_table_id] = xyz + return self.get_observation_from_step(self.env.env._get_obs()) diff --git a/alr_envs/alr/mujoco/beerpong/new_mp_wrapper.py b/alr_envs/alr/mujoco/beerpong/new_mp_wrapper.py deleted file mode 100644 index e447a49..0000000 --- a/alr_envs/alr/mujoco/beerpong/new_mp_wrapper.py +++ /dev/null @@ -1,41 +0,0 @@ -from typing import Union, Tuple - -import numpy as np - -from alr_envs.black_box.raw_interface_wrapper import RawInterfaceWrapper - - -class MPWrapper(RawInterfaceWrapper): - @property - def current_pos(self) -> Union[float, int, np.ndarray, Tuple]: - return self.env.sim.data.qpos[0:7].copy() - - @property - def current_vel(self) -> Union[float, int, np.ndarray, Tuple]: - return self.env.sim.data.qvel[0:7].copy() - - def get_context_mask(self): - return np.hstack([ - [False] * 7, # cos - [False] * 7, # sin - [False] * 7, # joint velocities - [False] * 3, # cup_goal_diff_final - [False] * 3, # cup_goal_diff_top - [True] * 2, # xy position of cup - [False] # env steps - ]) - - # TODO: Fix this - def _episode_callback(self, action: np.ndarray, mp) -> Tuple[np.ndarray, Union[np.ndarray, None]]: - if mp.learn_tau: - self.env.env.release_step = action[0] / self.env.dt # Tau value - return action, None - else: - return action, None - - def set_context(self, context): - xyz = np.zeros(3) - xyz[:2] = context - xyz[-1] = 0.840 - self.env.env.model.body_pos[self.env.env.cup_table_id] = xyz - return self.get_observation_from_step(self.env.env._get_obs()) diff --git a/alr_envs/alr/mujoco/half_cheetah_jump/__init__.py b/alr_envs/alr/mujoco/half_cheetah_jump/__init__.py index 8a04a02..c5e6d2f 100644 --- a/alr_envs/alr/mujoco/half_cheetah_jump/__init__.py +++ 
b/alr_envs/alr/mujoco/half_cheetah_jump/__init__.py @@ -1 +1 @@ -from .new_mp_wrapper import MPWrapper +from .mp_wrapper import MPWrapper diff --git a/alr_envs/alr/mujoco/half_cheetah_jump/half_cheetah_jump.py b/alr_envs/alr/mujoco/half_cheetah_jump/half_cheetah_jump.py index 4b53a5d..a90edf6 100644 --- a/alr_envs/alr/mujoco/half_cheetah_jump/half_cheetah_jump.py +++ b/alr_envs/alr/mujoco/half_cheetah_jump/half_cheetah_jump.py @@ -1,4 +1,7 @@ import os +from typing import Tuple, Union, Optional + +from gym.core import ObsType from gym.envs.mujoco.half_cheetah_v3 import HalfCheetahEnv import numpy as np @@ -20,7 +23,7 @@ class ALRHalfCheetahJumpEnv(HalfCheetahEnv): max_episode_steps=100): self.current_step = 0 self.max_height = 0 - self.max_episode_steps = max_episode_steps + # self.max_episode_steps = max_episode_steps self.goal = 0 self.context = context xml_file = os.path.join(os.path.dirname(__file__), "assets", xml_file) @@ -37,15 +40,15 @@ class ALRHalfCheetahJumpEnv(HalfCheetahEnv): ## Didnt use fell_over, because base env also has no done condition - Paul and Marc # fell_over = abs(self.sim.data.qpos[2]) > 2.5 # how to figure out if the cheetah fell over? -> 2.5 oke? - # TODO: Should a fall over be checked herE? + # TODO: Should a fall over be checked here? done = False ctrl_cost = self.control_cost(action) costs = ctrl_cost - if self.current_step == self.max_episode_steps: - height_goal_distance = -10*np.linalg.norm(self.max_height - self.goal) + 1e-8 if self.context \ - else self.max_height + if self.current_step == MAX_EPISODE_STEPS_HALFCHEETAHJUMP: + height_goal_distance = -10 * np.linalg.norm(self.max_height - self.goal) + 1e-8 if self.context \ + else self.max_height rewards = self._forward_reward_weight * height_goal_distance else: rewards = 0 @@ -62,7 +65,8 @@ class ALRHalfCheetahJumpEnv(HalfCheetahEnv): def _get_obs(self): return np.append(super()._get_obs(), self.goal) - def reset(self): + def reset(self, *, seed: Optional[int] = None, return_info: bool = False, + options: Optional[dict] = None, ) -> Union[ObsType, Tuple[ObsType, dict]]: self.max_height = 0 self.current_step = 0 self.goal = np.random.uniform(1.1, 1.6, 1) # 1.1 1.6 @@ -80,21 +84,3 @@ class ALRHalfCheetahJumpEnv(HalfCheetahEnv): observation = self._get_obs() return observation - -if __name__ == '__main__': - render_mode = "human" # "human" or "partial" or "final" - env = ALRHalfCheetahJumpEnv() - obs = env.reset() - - for i in range(2000): - # objective.load_result("/tmp/cma") - # test with random actions - ac = env.action_space.sample() - obs, rew, d, info = env.step(ac) - if i % 10 == 0: - env.render(mode=render_mode) - if d: - print('After ', i, ' steps, done: ', d) - env.reset() - - env.close() \ No newline at end of file diff --git a/alr_envs/alr/mujoco/half_cheetah_jump/mp_wrapper.py b/alr_envs/alr/mujoco/half_cheetah_jump/mp_wrapper.py index 930da6d..298bf53 100644 --- a/alr_envs/alr/mujoco/half_cheetah_jump/mp_wrapper.py +++ b/alr_envs/alr/mujoco/half_cheetah_jump/mp_wrapper.py @@ -10,7 +10,7 @@ class MPWrapper(RawInterfaceWrapper): def context_mask(self) -> np.ndarray: return np.hstack([ [False] * 17, - [True] # goal height + [True] # goal height ]) @property @@ -20,11 +20,3 @@ class MPWrapper(RawInterfaceWrapper): @property def current_vel(self) -> Union[float, int, np.ndarray, Tuple]: return self.env.sim.data.qvel[3:9].copy() - - @property - def goal_pos(self) -> Union[float, int, np.ndarray, Tuple]: - raise ValueError("Goal position is not available and has to be learnt based on the 
environment.") - - @property - def dt(self) -> Union[float, int]: - return self.env.dt diff --git a/alr_envs/alr/mujoco/half_cheetah_jump/new_mp_wrapper.py b/alr_envs/alr/mujoco/half_cheetah_jump/new_mp_wrapper.py deleted file mode 100644 index 9a65952..0000000 --- a/alr_envs/alr/mujoco/half_cheetah_jump/new_mp_wrapper.py +++ /dev/null @@ -1,22 +0,0 @@ -from typing import Tuple, Union - -import numpy as np - -from alr_envs.black_box.raw_interface_wrapper import RawInterfaceWrapper - - -class MPWrapper(RawInterfaceWrapper): - def context_mask(self): - return np.hstack([ - [False] * 17, - [True] # goal height - ]) - - @property - def current_pos(self) -> Union[float, int, np.ndarray]: - return self.env.sim.data.qpos[3:9].copy() - - @property - def current_vel(self) -> Union[float, int, np.ndarray, Tuple]: - return self.env.sim.data.qvel[3:9].copy() - diff --git a/alr_envs/alr/mujoco/hopper_jump/__init__.py b/alr_envs/alr/mujoco/hopper_jump/__init__.py index c5e6d2f..e901144 100644 --- a/alr_envs/alr/mujoco/hopper_jump/__init__.py +++ b/alr_envs/alr/mujoco/hopper_jump/__init__.py @@ -1 +1,2 @@ from .mp_wrapper import MPWrapper + diff --git a/alr_envs/alr/mujoco/hopper_jump/hopper_jump.py b/alr_envs/alr/mujoco/hopper_jump/hopper_jump.py index 78b06d3..7409300 100644 --- a/alr_envs/alr/mujoco/hopper_jump/hopper_jump.py +++ b/alr_envs/alr/mujoco/hopper_jump/hopper_jump.py @@ -1,3 +1,4 @@ +import copy from typing import Optional from gym.envs.mujoco.hopper_v3 import HopperEnv @@ -7,10 +8,11 @@ import os MAX_EPISODE_STEPS_HOPPERJUMP = 250 -class ALRHopperJumpEnv(HopperEnv): +class HopperJumpEnv(HopperEnv): """ Initialization changes to normal Hopper: - terminate_when_unhealthy: True -> False + - healthy_reward: 1.0 -> 2.0 - healthy_z_range: (0.7, float('inf')) -> (0.5, float('inf')) - healthy_angle_range: (-0.2, 0.2) -> (-float('inf'), float('inf')) - exclude_current_positions_from_observation: True -> False @@ -21,24 +23,28 @@ class ALRHopperJumpEnv(HopperEnv): xml_file='hopper_jump.xml', forward_reward_weight=1.0, ctrl_cost_weight=1e-3, - healthy_reward=1.0, - penalty=0.0, + healthy_reward=2.0, # 1 step + contact_weight=2.0, # 0 step + height_weight=10.0, # 3 step + dist_weight=3.0, # 3 step terminate_when_unhealthy=False, healthy_state_range=(-100.0, 100.0), healthy_z_range=(0.5, float('inf')), healthy_angle_range=(-float('inf'), float('inf')), reset_noise_scale=5e-3, exclude_current_positions_from_observation=False, + sparse=False, ): - self._steps = 0 + self.sparse = sparse + self._height_weight = height_weight + self._dist_weight = dist_weight + self._contact_weight = contact_weight + self.max_height = 0 - # self.penalty = penalty self.goal = 0 - self._floor_geom_id = None - self._foot_geom_id = None - + self._steps = 0 self.contact_with_floor = False self.init_floor_contact = False self.has_left_floor = False @@ -49,12 +55,12 @@ class ALRHopperJumpEnv(HopperEnv): healthy_state_range, healthy_z_range, healthy_angle_range, reset_noise_scale, exclude_current_positions_from_observation) + # increase initial height + self.init_qpos[1] = 1.5 + def step(self, action): self._steps += 1 - self._floor_geom_id = self.model.geom_name2id('floor') - self._foot_geom_id = self.model.geom_name2id('foot_geom') - self.do_simulation(action, self.frame_skip) height_after = self.get_body_com("torso")[2] @@ -73,18 +79,19 @@ class ALRHopperJumpEnv(HopperEnv): ctrl_cost = self.control_cost(action) costs = ctrl_cost done = False - goal_dist = np.linalg.norm(site_pos_after - np.array([self.goal, 0, 0])) + goal_dist = 
np.linalg.norm(site_pos_after - np.array([self.goal, 0, 0])) if self.contact_dist is None and self.contact_with_floor: self.contact_dist = goal_dist rewards = 0 - if self._steps >= MAX_EPISODE_STEPS_HOPPERJUMP: - # healthy_reward = 0 if self.context else self.healthy_reward * self._steps - healthy_reward = self.healthy_reward * 2 # * self._steps - contact_dist = self.contact_dist if self.contact_dist is not None else 5 - dist_reward = self._forward_reward_weight * (-3 * goal_dist + 10 * self.max_height - 2 * contact_dist) - rewards = dist_reward + healthy_reward + if not self.sparse or (self.sparse and self._steps >= MAX_EPISODE_STEPS_HOPPERJUMP): + healthy_reward = self.healthy_reward + distance_reward = goal_dist * self._dist_weight + height_reward = (self.max_height if self.sparse else self.get_body_com("torso")[2]) * self._height_weight + contact_reward = (self.contact_dist or 5) * self._contact_weight + # dist_reward = self._forward_reward_weight * (-3 * goal_dist + 10 * self.max_height - 2 * contact_dist) + rewards = self._forward_reward_weight * (distance_reward + height_reward + contact_reward + healthy_reward) observation = self._get_obs() reward = rewards - costs @@ -97,24 +104,40 @@ class ALRHopperJumpEnv(HopperEnv): height_rew=self.max_height, healthy_reward=self.healthy_reward * 2, healthy=self.is_healthy, - contact_dist=self.contact_dist if self.contact_dist is not None else 0 + contact_dist=self.contact_dist or 0 ) return observation, reward, done, info def _get_obs(self): - return np.append(super()._get_obs(), self.goal) + goal_dist = self.data.get_site_xpos('foot_site') - np.array([self.goal, 0, 0]) + return np.concatenate((super(HopperJumpEnv, self)._get_obs(), goal_dist.copy(), self.goal.copy())) - def reset(self, *, seed: Optional[int] = None, return_info: bool = False, options: Optional[dict] = None, ): - self.goal = self.np_random.uniform(1.4, 2.16, 1)[0] # 1.3 2.3 + def reset_model(self): + super(HopperJumpEnv, self).reset_model() + + self.goal = self.np_random.uniform(0.3, 1.35, 1)[0] + self.sim.model.body_pos[self.sim.model.body_name2id('goal_site_body')] = np.array([self.goal, 0, 0]) self.max_height = 0 self._steps = 0 - return super().reset() - # overwrite reset_model to make it deterministic - def reset_model(self): + noise_low = -np.zeros(self.model.nq) + noise_low[3] = -0.5 + noise_low[4] = -0.2 + noise_low[5] = 0 - qpos = self.init_qpos # + self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nq) - qvel = self.init_qvel # + self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nv) + noise_high = np.zeros(self.model.nq) + noise_high[3] = 0 + noise_high[4] = 0 + noise_high[5] = 0.785 + + qpos = ( + self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nq) + + self.init_qpos + ) + qvel = ( + # self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nv) + + self.init_qvel + ) self.set_state(qpos, qvel) @@ -123,6 +146,7 @@ class ALRHopperJumpEnv(HopperEnv): self.contact_with_floor = False self.init_floor_contact = False self.contact_dist = None + return observation def _is_floor_foot_contact(self): @@ -137,184 +161,75 @@ class ALRHopperJumpEnv(HopperEnv): return False -class ALRHopperXYJumpEnv(ALRHopperJumpEnv): +class HopperJumpStepEnv(HopperJumpEnv): + + def __init__(self, + xml_file='hopper_jump.xml', + forward_reward_weight=1.0, + ctrl_cost_weight=1e-3, + healthy_reward=1.0, + height_weight=3, + dist_weight=3, + terminate_when_unhealthy=False, + healthy_state_range=(-100.0, 100.0), + 
healthy_z_range=(0.5, float('inf')), + healthy_angle_range=(-float('inf'), float('inf')), + reset_noise_scale=5e-3, + exclude_current_positions_from_observation=False + ): + + self._height_weight = height_weight + self._dist_weight = dist_weight + super().__init__(xml_file, forward_reward_weight, ctrl_cost_weight, healthy_reward, terminate_when_unhealthy, + healthy_state_range, healthy_z_range, healthy_angle_range, reset_noise_scale, + exclude_current_positions_from_observation) def step(self, action): - self._floor_geom_id = self.model.geom_name2id('floor') - self._foot_geom_id = self.model.geom_name2id('foot_geom') - self._steps += 1 + self.do_simulation(action, self.frame_skip) + height_after = self.get_body_com("torso")[2] - site_pos_after = self.sim.data.site_xpos[self.model.site_name2id('foot_site')].copy() + site_pos_after = self.data.get_site_xpos('foot_site') self.max_height = max(height_after, self.max_height) - # floor_contact = self._contact_checker(self._floor_geom_id, self._foot_geom_id) if not self.contact_with_floor else False - # self.init_floor_contact = floor_contact if not self.init_floor_contact else self.init_floor_contact - # self.has_left_floor = not floor_contact if self.init_floor_contact and not self.has_left_floor else self.has_left_floor - # self.contact_with_floor = floor_contact if not self.contact_with_floor and self.has_left_floor else self.contact_with_floor - - floor_contact = self._is_floor_foot_contact(self._floor_geom_id, - self._foot_geom_id) if not self.contact_with_floor else False - if not self.init_floor_contact: - self.init_floor_contact = floor_contact - if self.init_floor_contact and not self.has_left_floor: - self.has_left_floor = not floor_contact - if not self.contact_with_floor and self.has_left_floor: - self.contact_with_floor = floor_contact - - if self.contact_dist is None and self.contact_with_floor: - self.contact_dist = np.linalg.norm(self.sim.data.site_xpos[self.model.site_name2id('foot_site')] - - np.array([self.goal, 0, 0])) - ctrl_cost = self.control_cost(action) + healthy_reward = self.healthy_reward + height_reward = self._height_weight * height_after + goal_dist = np.linalg.norm(site_pos_after - np.array([self.goal, 0, 0])) + goal_dist_reward = -self._dist_weight * goal_dist + dist_reward = self._forward_reward_weight * (goal_dist_reward + height_reward) + + rewards = dist_reward + healthy_reward costs = ctrl_cost done = False - goal_dist = np.linalg.norm(site_pos_after - np.array([self.goal, 0, 0])) - rewards = 0 - if self._steps >= self.max_episode_steps: - # healthy_reward = 0 if self.context else self.healthy_reward * self._steps - healthy_reward = self.healthy_reward * 2 # * self._steps - contact_dist = self.contact_dist if self.contact_dist is not None else 5 - dist_reward = self._forward_reward_weight * (-3 * goal_dist + 10 * self.max_height - 2 * contact_dist) - rewards = dist_reward + healthy_reward + + # This is only for logging the distance to goal when first having the contact + has_floor_contact = self._is_floor_foot_contact() if not self.contact_with_floor else False + + if not self.init_floor_contact: + self.init_floor_contact = has_floor_contact + if self.init_floor_contact and not self.has_left_floor: + self.has_left_floor = not has_floor_contact + if not self.contact_with_floor and self.has_left_floor: + self.contact_with_floor = has_floor_contact + + if self.contact_dist is None and self.contact_with_floor: + self.contact_dist = goal_dist + + ############################################################## 
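# The remainder of step() builds the return values for the step-based variant:
# the observation, the dense reward (rewards - costs), and an info dict that
# logs the height, the foot-to-goal distance, and copies of max_height, goal,
# and contact_dist.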
observation = self._get_obs() reward = rewards - costs info = { 'height': height_after, 'x_pos': site_pos_after, - 'max_height': self.max_height, - 'goal': self.goal, + 'max_height': copy.copy(self.max_height), + 'goal': copy.copy(self.goal), 'goal_dist': goal_dist, - 'height_rew': self.max_height, - 'healthy_reward': self.healthy_reward * 2, - 'healthy': self.is_healthy, - 'contact_dist': self.contact_dist if self.contact_dist is not None else 0 - } - return observation, reward, done, info - - def reset_model(self): - self.init_qpos[1] = 1.5 - self._floor_geom_id = self.model.geom_name2id('floor') - self._foot_geom_id = self.model.geom_name2id('foot_geom') - noise_low = -np.zeros(self.model.nq) - noise_low[3] = -0.5 - noise_low[4] = -0.2 - noise_low[5] = 0 - - noise_high = np.zeros(self.model.nq) - noise_high[3] = 0 - noise_high[4] = 0 - noise_high[5] = 0.785 - - rnd_vec = self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nq) - qpos = self.init_qpos + rnd_vec - qvel = self.init_qvel - self.set_state(qpos, qvel) - - observation = self._get_obs() - self.has_left_floor = False - self.contact_with_floor = False - self.init_floor_contact = False - self.contact_dist = None - - return observation - - def reset(self): - super().reset() - self.goal = self.np_random.uniform(0.3, 1.35, 1)[0] - self.sim.model.body_pos[self.sim.model.body_name2id('goal_site_body')] = np.array([self.goal, 0, 0]) - return self.reset_model() - - def _get_obs(self): - goal_diff = self.sim.data.site_xpos[self.model.site_name2id('foot_site')].copy() \ - - np.array([self.goal, 0, 0]) - return np.concatenate((super(ALRHopperXYJumpEnv, self)._get_obs(), goal_diff)) - - def set_context(self, context): - # context is 4 dimensional - qpos = self.init_qpos - qvel = self.init_qvel - qpos[-3:] = context[:3] - self.goal = context[-1] - self.set_state(qpos, qvel) - self.sim.model.body_pos[self.sim.model.body_name2id('goal_site_body')] = np.array([self.goal, 0, 0]) - return self._get_obs() - - -class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv): - - def __init__( - self, - xml_file='hopper_jump.xml', - forward_reward_weight=1.0, - ctrl_cost_weight=1e-3, - healthy_reward=0.0, - penalty=0.0, - context=True, - terminate_when_unhealthy=False, - healthy_state_range=(-100.0, 100.0), - healthy_z_range=(0.5, float('inf')), - healthy_angle_range=(-float('inf'), float('inf')), - reset_noise_scale=5e-3, - exclude_current_positions_from_observation=False, - max_episode_steps=250, - height_scale=10, - dist_scale=3, - healthy_scale=2 - ): - self.height_scale = height_scale - self.dist_scale = dist_scale - self.healthy_scale = healthy_scale - super().__init__(xml_file, forward_reward_weight, ctrl_cost_weight, healthy_reward, penalty, context, - terminate_when_unhealthy, healthy_state_range, healthy_z_range, healthy_angle_range, - reset_noise_scale, exclude_current_positions_from_observation, max_episode_steps) - - def step(self, action): - self._floor_geom_id = self.model.geom_name2id('floor') - self._foot_geom_id = self.model.geom_name2id('foot_geom') - - self._steps += 1 - self.do_simulation(action, self.frame_skip) - height_after = self.get_body_com("torso")[2] - site_pos_after = self.sim.data.site_xpos[self.model.site_name2id('foot_site')].copy() - self.max_height = max(height_after, self.max_height) - ctrl_cost = self.control_cost(action) - - healthy_reward = self.healthy_reward * self.healthy_scale - height_reward = self.height_scale * height_after - goal_dist = np.atleast_1d(np.linalg.norm(site_pos_after - 
np.array([self.goal, 0, 0], dtype=object)))[0] - goal_dist_reward = -self.dist_scale * goal_dist - dist_reward = self._forward_reward_weight * (goal_dist_reward + height_reward) - reward = -ctrl_cost + healthy_reward + dist_reward - done = False - observation = self._get_obs() - - ########################################################### - # This is only for logging the distance to goal when first having the contact - ########################################################## - floor_contact = self._is_floor_foot_contact(self._floor_geom_id, - self._foot_geom_id) if not self.contact_with_floor else False - if not self.init_floor_contact: - self.init_floor_contact = floor_contact - if self.init_floor_contact and not self.has_left_floor: - self.has_left_floor = not floor_contact - if not self.contact_with_floor and self.has_left_floor: - self.contact_with_floor = floor_contact - - if self.contact_dist is None and self.contact_with_floor: - self.contact_dist = np.linalg.norm(self.sim.data.site_xpos[self.model.site_name2id('foot_site')] - - np.array([self.goal, 0, 0])) - info = { - 'height': height_after, - 'x_pos': site_pos_after, - 'max_height': self.max_height, - 'goal': self.goal, - 'goal_dist': goal_dist, - 'height_rew': self.max_height, - 'healthy_reward': self.healthy_reward * self.healthy_reward, - 'healthy': self.is_healthy, - 'contact_dist': self.contact_dist if self.contact_dist is not None else 0 + 'height_rew': height_reward, + 'healthy_reward': healthy_reward, + 'healthy': copy.copy(self.is_healthy), + 'contact_dist': copy.copy(self.contact_dist) or 0 } return observation, reward, done, info diff --git a/alr_envs/alr/mujoco/hopper_jump/mp_wrapper.py b/alr_envs/alr/mujoco/hopper_jump/mp_wrapper.py index c7b16db..394893e 100644 --- a/alr_envs/alr/mujoco/hopper_jump/mp_wrapper.py +++ b/alr_envs/alr/mujoco/hopper_jump/mp_wrapper.py @@ -8,6 +8,7 @@ from alr_envs.black_box.raw_interface_wrapper import RawInterfaceWrapper class MPWrapper(RawInterfaceWrapper): # Random x goal + random init pos + @property def context_mask(self): return np.hstack([ [False] * (2 + int(not self.exclude_current_positions_from_observation)), # position diff --git a/alr_envs/alr/mujoco/hopper_throw/mp_wrapper.py b/alr_envs/alr/mujoco/hopper_throw/mp_wrapper.py index 7778e8c..78c2f51 100644 --- a/alr_envs/alr/mujoco/hopper_throw/mp_wrapper.py +++ b/alr_envs/alr/mujoco/hopper_throw/mp_wrapper.py @@ -6,8 +6,9 @@ from alr_envs.black_box.raw_interface_wrapper import RawInterfaceWrapper class MPWrapper(RawInterfaceWrapper): + @property - def context_mask(self) -> np.ndarray: + def context_mask(self): return np.hstack([ [False] * 17, [True] # goal pos @@ -15,16 +16,8 @@ class MPWrapper(RawInterfaceWrapper): @property def current_pos(self) -> Union[float, int, np.ndarray]: - return self.env.sim.data.qpos[3:6].copy() + return self.env.data.qpos[3:6].copy() @property def current_vel(self) -> Union[float, int, np.ndarray, Tuple]: - return self.env.sim.data.qvel[3:6].copy() - - @property - def goal_pos(self) -> Union[float, int, np.ndarray, Tuple]: - raise ValueError("Goal position is not available and has to be learnt based on the environment.") - - @property - def dt(self) -> Union[float, int]: - return self.env.dt + return self.env.data.qvel[3:6].copy() diff --git a/alr_envs/alr/mujoco/hopper_throw/new_mp_wrapper.py b/alr_envs/alr/mujoco/hopper_throw/new_mp_wrapper.py deleted file mode 100644 index 01d87a4..0000000 --- a/alr_envs/alr/mujoco/hopper_throw/new_mp_wrapper.py +++ /dev/null @@ -1,25 +0,0 @@ -from typing 
import Tuple, Union - -import numpy as np - -from alr_envs.black_box.raw_interface_wrapper import RawInterfaceWrapper - - -class MPWrapper(RawInterfaceWrapper): - def context_mask(self): - return np.hstack([ - [False] * 17, - [True] # goal pos - ]) - - @property - def current_pos(self) -> Union[float, int, np.ndarray]: - return self.env.sim.data.qpos[3:6].copy() - - @property - def current_vel(self) -> Union[float, int, np.ndarray, Tuple]: - return self.env.sim.data.qvel[3:6].copy() - - @property - def dt(self) -> Union[float, int]: - return self.env.dt diff --git a/alr_envs/alr/mujoco/reacher/assets/reacher_5links.xml b/alr_envs/alr/mujoco/reacher/assets/reacher_5links.xml index 25a3208..742f45c 100644 --- a/alr_envs/alr/mujoco/reacher/assets/reacher_5links.xml +++ b/alr_envs/alr/mujoco/reacher/assets/reacher_5links.xml @@ -41,7 +41,7 @@ - + diff --git a/alr_envs/alr/mujoco/reacher/mp_wrapper.py b/alr_envs/alr/mujoco/reacher/mp_wrapper.py index de33ae0..37422b1 100644 --- a/alr_envs/alr/mujoco/reacher/mp_wrapper.py +++ b/alr_envs/alr/mujoco/reacher/mp_wrapper.py @@ -15,14 +15,13 @@ class MPWrapper(RawInterfaceWrapper): [True] * 2, # goal position [False] * self.env.n_links, # angular velocity [False] * 3, # goal distance - # self.get_body_com("target"), # only return target to make problem harder - [False], # step + # [False], # step ]) @property def current_pos(self) -> Union[float, int, np.ndarray, Tuple]: - return self.env.sim.data.qpos.flat[:self.env.n_links] + return self.env.data.qpos.flat[:self.env.n_links] @property def current_vel(self) -> Union[float, int, np.ndarray, Tuple]: - return self.env.sim.data.qvel.flat[:self.env.n_links] + return self.env.data.qvel.flat[:self.env.n_links] diff --git a/alr_envs/alr/mujoco/reacher/reacher.py b/alr_envs/alr/mujoco/reacher/reacher.py index a5d34ee..a344879 100644 --- a/alr_envs/alr/mujoco/reacher/reacher.py +++ b/alr_envs/alr/mujoco/reacher/reacher.py @@ -94,12 +94,12 @@ class ReacherEnv(MujocoEnv, utils.EzPickle): return self._get_obs() def _get_obs(self): - theta = self.sim.data.qpos.flat[:self.n_links] + theta = self.data.qpos.flat[:self.n_links] target = self.get_body_com("target") return np.concatenate([ np.cos(theta), np.sin(theta), target[:2], # x-y of goal position - self.sim.data.qvel.flat[:self.n_links], # angular velocity + self.data.qvel.flat[:self.n_links], # angular velocity self.get_body_com("fingertip") - target, # goal distance ]) diff --git a/alr_envs/alr/mujoco/walker_2d_jump/__init__.py b/alr_envs/alr/mujoco/walker_2d_jump/__init__.py index 8a04a02..c5e6d2f 100644 --- a/alr_envs/alr/mujoco/walker_2d_jump/__init__.py +++ b/alr_envs/alr/mujoco/walker_2d_jump/__init__.py @@ -1 +1 @@ -from .new_mp_wrapper import MPWrapper +from .mp_wrapper import MPWrapper diff --git a/alr_envs/alr/mujoco/walker_2d_jump/mp_wrapper.py b/alr_envs/alr/mujoco/walker_2d_jump/mp_wrapper.py index 5e9d0eb..63432a7 100644 --- a/alr_envs/alr/mujoco/walker_2d_jump/mp_wrapper.py +++ b/alr_envs/alr/mujoco/walker_2d_jump/mp_wrapper.py @@ -6,25 +6,18 @@ from alr_envs.black_box.raw_interface_wrapper import RawInterfaceWrapper class MPWrapper(RawInterfaceWrapper): + @property - def context_mask(self) -> np.ndarray: + def context_mask(self): return np.hstack([ [False] * 17, - [True] # goal pos + [True] # goal pos ]) @property def current_pos(self) -> Union[float, int, np.ndarray]: - return self.env.sim.data.qpos[3:9].copy() + return self.env.data.qpos[3:9].copy() @property def current_vel(self) -> Union[float, int, np.ndarray, Tuple]: - return 
self.env.sim.data.qvel[3:9].copy() - - @property - def goal_pos(self) -> Union[float, int, np.ndarray, Tuple]: - raise ValueError("Goal position is not available and has to be learnt based on the environment.") - - @property - def dt(self) -> Union[float, int]: - return self.env.dt + return self.env.data.qvel[3:9].copy() diff --git a/alr_envs/alr/mujoco/walker_2d_jump/new_mp_wrapper.py b/alr_envs/alr/mujoco/walker_2d_jump/new_mp_wrapper.py deleted file mode 100644 index b2bfde9..0000000 --- a/alr_envs/alr/mujoco/walker_2d_jump/new_mp_wrapper.py +++ /dev/null @@ -1,26 +0,0 @@ -from typing import Tuple, Union - -import numpy as np - -from alr_envs.black_box.raw_interface_wrapper import RawInterfaceWrapper - - -class MPWrapper(RawInterfaceWrapper): - def context_mask(self): - return np.hstack([ - [False] * 17, - [True] # goal pos - ]) - - @property - def current_pos(self) -> Union[float, int, np.ndarray]: - return self.env.sim.data.qpos[3:9].copy() - - @property - def current_vel(self) -> Union[float, int, np.ndarray, Tuple]: - return self.env.sim.data.qvel[3:9].copy() - - @property - def goal_pos(self) -> Union[float, int, np.ndarray, Tuple]: - raise ValueError("Goal position is not available and has to be learnt based on the environment.") - diff --git a/alr_envs/black_box/black_box_wrapper.py b/alr_envs/black_box/black_box_wrapper.py index 43cc4f0..cec2943 100644 --- a/alr_envs/black_box/black_box_wrapper.py +++ b/alr_envs/black_box/black_box_wrapper.py @@ -1,4 +1,4 @@ -from typing import Tuple, Union +from typing import Tuple, Union, Optional import gym import numpy as np @@ -19,7 +19,7 @@ class BlackBoxWrapper(gym.ObservationWrapper): duration: float, verbose: int = 1, learn_sub_trajectories: bool = False, - replanning_schedule: Union[None, callable] = None, + replanning_schedule: Optional[callable] = None, reward_aggregation: callable = np.sum ): """ @@ -41,7 +41,8 @@ class BlackBoxWrapper(gym.ObservationWrapper): self.duration = duration self.learn_sub_trajectories = learn_sub_trajectories - self.replanning_schedule = replanning_schedule + self.do_replanning = replanning_schedule is not None + self.replanning_schedule = replanning_schedule or (lambda *x: False) self.current_traj_steps = 0 # trajectory generation @@ -55,12 +56,10 @@ class BlackBoxWrapper(gym.ObservationWrapper): self.reward_aggregation = reward_aggregation # spaces - self.return_context_observation = not (self.learn_sub_trajectories or replanning_schedule) - self.traj_gen_action_space = self.get_traj_gen_action_space() - self.action_space = self.get_action_space() - self.observation_space = spaces.Box(low=self.env.observation_space.low[self.env.context_mask], - high=self.env.observation_space.high[self.env.context_mask], - dtype=self.env.observation_space.dtype) + self.return_context_observation = not (learn_sub_trajectories or self.do_replanning) + self.traj_gen_action_space = self._get_traj_gen_action_space() + self.action_space = self._get_action_space() + self.observation_space = self._get_observation_space() # rendering self.render_kwargs = {} @@ -76,23 +75,24 @@ class BlackBoxWrapper(gym.ObservationWrapper): # TODO: Bruce said DMP, ProMP, ProDMP can have 0 bc_time for sequencing # TODO Check with Bruce for replanning self.traj_gen.set_boundary_conditions( - bc_time=np.zeros((1,)) if not self.replanning_schedule else self.current_traj_steps * self.dt, + bc_time=np.zeros((1,)) if not self.do_replanning else np.array([self.current_traj_steps * self.dt]), bc_pos=self.current_pos, bc_vel=self.current_vel) # TODO: is 
this correct for replanning? Do we need to adjust anything here? - self.traj_gen.set_duration(None if self.learn_sub_trajectories else np.array([self.duration]), np.array([self.dt])) + self.traj_gen.set_duration(None if self.learn_sub_trajectories else np.array([self.duration]), + np.array([self.dt])) traj_dict = self.traj_gen.get_trajs(get_pos=True, get_vel=True) trajectory_tensor, velocity_tensor = traj_dict['pos'], traj_dict['vel'] return get_numpy(trajectory_tensor), get_numpy(velocity_tensor) - def get_traj_gen_action_space(self): + def _get_traj_gen_action_space(self): """This function can be used to set up an individual space for the parameters of the traj_gen.""" min_action_bounds, max_action_bounds = self.traj_gen.get_param_bounds() - mp_action_space = gym.spaces.Box(low=min_action_bounds.numpy(), high=max_action_bounds.numpy(), - dtype=np.float32) - return mp_action_space + action_space = gym.spaces.Box(low=min_action_bounds.numpy(), high=max_action_bounds.numpy(), + dtype=self.env.action_space.dtype) + return action_space - def get_action_space(self): + def _get_action_space(self): """ This function can be used to modify the action space for considering actions which are not learned via motion primitives. E.g. ball releasing time for the beer pong task. By default, it is the parameter space of the @@ -102,12 +102,21 @@ class BlackBoxWrapper(gym.ObservationWrapper): try: return self.traj_gen_action_space except AttributeError: - return self.get_traj_gen_action_space() + return self._get_traj_gen_action_space() + + def _get_observation_space(self): + mask = self.env.context_mask + if not self.return_context_observation: + # return full observation + mask = np.ones_like(mask, dtype=bool) + min_obs_bound = self.env.observation_space.low[mask] + max_obs_bound = self.env.observation_space.high[mask] + return spaces.Box(low=min_obs_bound, high=max_obs_bound, dtype=self.env.observation_space.dtype) def step(self, action: np.ndarray): """ This function generates a trajectory based on a MP and then does the usual loop over reset and step""" - # agent to learn when to release the ball + # TODO remove this part, right now only needed for beer pong mp_params, env_spec_params = self.env._episode_callback(action, self.traj_gen) trajectory, velocity = self.get_trajectory(mp_params) @@ -121,10 +130,8 @@ class BlackBoxWrapper(gym.ObservationWrapper): infos = dict() done = False - for t, pos_vel in enumerate(zip(trajectory, velocity)): - step_action = self.tracking_controller.get_action(pos_vel[0], pos_vel[1], self.current_pos, - self.current_vel) - step_action = self._step_callback(t, env_spec_params, step_action) # include possible callback info + for t, (pos, vel) in enumerate(zip(trajectory, velocity)): + step_action = self.tracking_controller.get_action(pos, vel, self.current_pos, self.current_vel) c_action = np.clip(step_action, self.env.action_space.low, self.env.action_space.high) # print('step/clipped action ratio: ', step_action/c_action) obs, c_reward, done, info = self.env.step(c_action) @@ -142,36 +149,34 @@ class BlackBoxWrapper(gym.ObservationWrapper): if self.render_kwargs: self.render(**self.render_kwargs) - if done: - break - - if self.replanning_schedule and self.replanning_schedule(self.current_pos, self.current_vel, obs, c_action, - t + 1 + self.current_traj_steps): + if done or self.replanning_schedule(self.current_pos, self.current_vel, obs, c_action, + t + 1 + self.current_traj_steps): break infos.update({k: v[:t + 1] for k, v in infos.items()}) self.current_traj_steps += t 
+ 1 if self.verbose >= 2: - infos['trajectory'] = trajectory + infos['positions'] = trajectory + infos['velocities'] = velocity infos['step_actions'] = actions[:t + 1] infos['step_observations'] = observations[:t + 1] infos['step_rewards'] = rewards[:t + 1] infos['trajectory_length'] = t + 1 trajectory_return = self.reward_aggregation(rewards[:t + 1]) - return obs, trajectory_return, done, infos + return self.observation(obs), trajectory_return, done, infos def render(self, **kwargs): """Only set render options here, such that they can be used during the rollout. This only needs to be called once""" - self.render_kwargs = kwargs + self.render_kwargs = kwargs or self.render_kwargs # self.env.render(mode=self.render_mode, **self.render_kwargs) - self.env.render(**kwargs) + self.env.render(**self.render_kwargs) - def reset(self, **kwargs): + def reset(self, *, seed: Optional[int] = None, return_info: bool = False, options: Optional[dict] = None): self.current_traj_steps = 0 - super(BlackBoxWrapper, self).reset(**kwargs) + return super(BlackBoxWrapper, self).reset(seed=seed, return_info=return_info, options=options) def plot_trajs(self, des_trajs, des_vels): import matplotlib.pyplot as plt diff --git a/alr_envs/black_box/raw_interface_wrapper.py b/alr_envs/black_box/raw_interface_wrapper.py index becbfe3..019a268 100644 --- a/alr_envs/black_box/raw_interface_wrapper.py +++ b/alr_envs/black_box/raw_interface_wrapper.py @@ -57,7 +57,8 @@ class RawInterfaceWrapper(gym.Wrapper): # return bool(self.replanning_model(s)) return False - def _episode_callback(self, action: np.ndarray, traj_gen: MPInterface) -> Tuple[np.ndarray, Union[np.ndarray, None]]: + def _episode_callback(self, action: np.ndarray, traj_gen: MPInterface) -> Tuple[ + np.ndarray, Union[np.ndarray, None]]: """ Used to extract the parameters for the motion primitive and other parameters from an action array which might include other actions like ball releasing time for the beer pong environment. @@ -70,20 +71,3 @@ class RawInterfaceWrapper(gym.Wrapper): Tuple: mp_arguments and other arguments """ return action, None - - def _step_callback(self, t: int, env_spec_params: Union[np.ndarray, None], step_action: np.ndarray) -> Union[ - np.ndarray]: - """ - This function can be used to modify the step_action with additional parameters e.g. releasing the ball in the - Beerpong env. The parameters used should not be part of the motion primitive parameters. - Returns step_action by default, can be overwritten in individual mp_wrappers. - Args: - t: the current time step of the episode - env_spec_params: the environment specific parameter, as defined in function _episode_callback - (e.g. 
ball release time in Beer Pong) - step_action: the current step-based action - - Returns: - modified step action - """ - return step_action diff --git a/alr_envs/examples/examples_general.py b/alr_envs/examples/examples_general.py index d043d6d..4f184b8 100644 --- a/alr_envs/examples/examples_general.py +++ b/alr_envs/examples/examples_general.py @@ -87,10 +87,10 @@ if __name__ == '__main__': render = True # Basic gym task - example_general("Pendulum-v0", seed=10, iterations=200, render=render) + example_general("Reacher5d-v0", seed=10, iterations=200, render=render) # # Basis task from framework - example_general("alr_envs:HoleReacher-v0", seed=10, iterations=200, render=render) + example_general("Reacher-v0", seed=10, iterations=200, render=render) # # OpenAI Mujoco task example_general("HalfCheetah-v2", seed=10, render=render) diff --git a/alr_envs/examples/examples_movement_primitives.py b/alr_envs/examples/examples_movement_primitives.py index bf1f950..a2f5d54 100644 --- a/alr_envs/examples/examples_movement_primitives.py +++ b/alr_envs/examples/examples_movement_primitives.py @@ -45,7 +45,7 @@ def example_mp(env_name="alr_envs:HoleReacherDMP-v1", seed=1, iterations=1, rend obs = env.reset() -def example_custom_mp(env_name="alr_envs:HoleReacherDMP-v1", seed=1, iterations=1, render=True): +def example_custom_mp(env_name="Reacher5dProMP-v0", seed=1, iterations=1, render=True): """ Example for running a motion primitive based environment, which is already registered Args: @@ -57,32 +57,12 @@ def example_custom_mp(env_name="alr_envs:HoleReacherDMP-v1", seed=1, iterations= Returns: """ - # Changing the traj_gen_kwargs is possible by providing them to gym. - # E.g. here by providing way to many basis functions - # mp_dict = alr_envs.from_default_config('ALRReacher-v0', {'basis_generator_kwargs': {'num_basis': 10}}) - # mp_dict.update({'basis_generator_kwargs': {'num_basis': 10}}) + # Changing the arguments of the black box env is possible by providing them to gym as with all kwargs. + # E.g. 
here with way too many basis functions
+    env = alr_envs.make(env_name, seed, basis_generator_kwargs={'num_basis': 1000})
     # mp_dict.update({'black_box_kwargs': {'learn_sub_trajectories': True}})
     # mp_dict.update({'black_box_kwargs': {'do_replanning': lambda pos, vel, t: lambda t: t % 100}})
 
-    # default env with promp and no learn_sub_trajectories and replanning
-    # env = alr_envs.make('ALRReacherProMP-v0', 1, n_links=7)
-    env = alr_envs.make('ALRReacherProMP-v0', 1, basis_generator_kwargs={'num_basis': 10}, n_links=7)
-    # env = alr_envs.make('ALRReacher-v0', seed=1, bb_kwargs=mp_dict, n_links=1)
-    # env = alr_envs.make_bb('ALRReacher-v0', **mp_dict)
-
-    mp_kwargs = {
-        "num_dof": 5,
-        "num_basis": 1000,
-        "duration": 2,
-        "learn_goal": True,
-        "alpha_phase": 2,
-        "bandwidth_factor": 2,
-        "policy_type": "velocity",
-        "weights_scale": 50,
-        "goal_scale": 0.1
-    }
-    env = alr_envs.make(env_name, seed, mp_kwargs=mp_kwargs)
-
     # This time rendering every trajectory
     if render:
         env.render(mode="human")
@@ -100,6 +80,7 @@ def example_custom_mp(env_name="alr_envs:HoleReacherDMP-v1", seed=1, iterations=
             print(rewards)
             rewards = 0
             obs = env.reset()
+            print(obs)
 
 
 def example_fully_custom_mp(seed=1, iterations=1, render=True):
@@ -169,7 +150,7 @@ if __name__ == '__main__':
     # example_mp("alr_envs:HoleReacherDetPMP-v1", seed=10, iterations=1, render=render)
 
     # Altered basis functions
-    example_custom_mp("alr_envs:HoleReacherDMP-v1", seed=10, iterations=1, render=render)
+    example_custom_mp("Reacher5dProMP-v0", seed=10, iterations=10, render=render)
 
     # Custom MP
-    example_fully_custom_mp(seed=10, iterations=1, render=render)
+    # example_fully_custom_mp(seed=10, iterations=1, render=render)
diff --git a/alr_envs/utils/make_env_helpers.py b/alr_envs/utils/make_env_helpers.py
index 0051546..ce24205 100644
--- a/alr_envs/utils/make_env_helpers.py
+++ b/alr_envs/utils/make_env_helpers.py
@@ -43,10 +43,10 @@ def make_rank(env_id: str, seed: int, rank: int = 0, return_callable=True, **kwa
 
 
 def make(env_id, seed, **kwargs):
-    # spec = registry.get(env_id)  # TODO: This doesn't work with gym ==0.21.0
-    spec = registry.spec(env_id)
+    # TODO: This doesn't work with gym ==0.21.0
     # This access is required to allow for nested dict updates
-    all_kwargs = deepcopy(spec._kwargs)
+    spec = registry.get(env_id)
+    all_kwargs = deepcopy(spec.kwargs)
     nested_update(all_kwargs, kwargs)
     return _make(env_id, seed, **all_kwargs)
 
@@ -148,7 +148,7 @@ def make_bb(
         phase_kwargs: kwargs for the phase generator
         controller_kwargs: kwargs for the tracking controller
         env_id: base_env_name,
-        wrappers: list of wrappers (at least an BlackBoxWrapper),
+        wrappers: list of wrappers (at least a RawInterfaceWrapper),
         seed: seed of environment
         traj_gen_kwargs: dict of at least {num_dof: int, num_basis: int} for DMP
 
diff --git a/alr_envs/utils/utils.py b/alr_envs/utils/utils.py
index cf09e24..f4fea74 100644
--- a/alr_envs/utils/utils.py
+++ b/alr_envs/utils/utils.py
@@ -1,4 +1,4 @@
-from collections import Mapping, MutableMapping
+from collections.abc import Mapping, MutableMapping
 
 import numpy as np
 import torch as ch
@@ -46,4 +46,5 @@ def nested_update(base: MutableMapping, update):
     """
     for k, v in update.items():
         base[k] = nested_update(base.get(k, {}), v) if isinstance(v, Mapping) else v
+    return base
 
diff --git a/setup.py b/setup.py
index 796c569..055ac81 100644
--- a/setup.py
+++ b/setup.py
@@ -1,24 +1,35 @@
+import itertools
+
 from setuptools import setup
 
+# Environment-specific dependencies for dmc and metaworld
+extras = {
+    "dmc": ["dm_control"],
+    "meta": ["mujoco_py<2.2,>=2.1", "metaworld @ git+https://github.com/rlworkgroup/metaworld.git@master#egg=metaworld"],
+    "mujoco": ["mujoco==2.2.0", "imageio>=2.14.1"],
+}
+
+# All dependencies
+all_groups = set(extras.keys())
+extras["all"] = list(set(itertools.chain.from_iterable(map(lambda group: extras[group], all_groups))))
+
 setup(
-    name='alr_envs',
+    author='Fabian Otto, Onur Celik, Marcel Sandermann, Maximilian Huettenrauch',
+    name='simple_gym',
     version='0.0.1',
     packages=['alr_envs', 'alr_envs.alr', 'alr_envs.open_ai', 'alr_envs.dmc', 'alr_envs.meta', 'alr_envs.utils'],
     install_requires=[
         'gym',
         'PyQt5',
-        #'matplotlib',
-        #'mp_env_api @ git+https://github.com/ALRhub/motion_primitive_env_api.git',
-#        'mp_env_api @ git+ssh://git@github.com/ALRhub/motion_primitive_env_api.git',
+        # 'matplotlib',
+        # 'mp_env_api @ git+https://github.com/ALRhub/motion_primitive_env_api.git',
+        # 'mp_env_api @ git+ssh://git@github.com/ALRhub/motion_primitive_env_api.git',
         'mujoco-py<2.1,>=2.0',
         'dm_control',
         'metaworld @ git+https://github.com/rlworkgroup/metaworld.git@master#egg=metaworld',
     ],
-    url='https://github.com/ALRhub/alr_envs/',
-    license='MIT',
-    author='Fabian Otto, Marcel Sandermann, Maximilian Huettenrauch',
+    # license='AGPL-3.0 license',
     author_email='',
-    description='Custom Gym environments for various (robotics) tasks. integration of DMC environments into the'
-                'gym interface, and support for using motion primitives with gym environments.'
+    description='Simple Gym: Aggregating interface for various RL environments with support for Black Box approaches.'
 )