diff --git a/fancy_gym/black_box/black_box_wrapper.py b/fancy_gym/black_box/black_box_wrapper.py index fd5032b..b997fd5 100644 --- a/fancy_gym/black_box/black_box_wrapper.py +++ b/fancy_gym/black_box/black_box_wrapper.py @@ -1,8 +1,9 @@ -from typing import Tuple, Optional, Callable +from typing import Tuple, Optional, Callable, Dict, Any -import gym +import gymnasium as gym import numpy as np -from gym import spaces +from gymnasium import spaces +from gymnasium.core import ObsType from mp_pytorch.mp.mp_interfaces import MPInterface from fancy_gym.black_box.controller.base_controller import BaseController @@ -59,7 +60,8 @@ class BlackBoxWrapper(gym.ObservationWrapper): self.reward_aggregation = reward_aggregation # spaces - self.return_context_observation = not (learn_sub_trajectories or self.do_replanning) + self.return_context_observation = not ( + learn_sub_trajectories or self.do_replanning) self.traj_gen_action_space = self._get_traj_gen_action_space() self.action_space = self._get_action_space() self.observation_space = self._get_observation_space() @@ -91,14 +93,17 @@ class BlackBoxWrapper(gym.ObservationWrapper): # If we do not do this, the traj_gen assumes we are continuing the trajectory. self.traj_gen.reset() - clipped_params = np.clip(action, self.traj_gen_action_space.low, self.traj_gen_action_space.high) + clipped_params = np.clip( + action, self.traj_gen_action_space.low, self.traj_gen_action_space.high) self.traj_gen.set_params(clipped_params) - init_time = np.array(0 if not self.do_replanning else self.current_traj_steps * self.dt) + init_time = np.array( + 0 if not self.do_replanning else self.current_traj_steps * self.dt) condition_pos = self.condition_pos if self.condition_pos is not None else self.current_pos condition_vel = self.condition_vel if self.condition_vel is not None else self.current_vel - self.traj_gen.set_initial_conditions(init_time, condition_pos, condition_vel) + self.traj_gen.set_initial_conditions( + init_time, condition_pos, condition_vel) self.traj_gen.set_duration(duration, self.dt) position = get_numpy(self.traj_gen.get_traj_pos()) @@ -138,13 +143,15 @@ class BlackBoxWrapper(gym.ObservationWrapper): """ This function generates a trajectory based on a MP and then does the usual loop over reset and step""" # TODO remove this part, right now only needed for beer pong - mp_params, env_spec_params = self.env.episode_callback(action, self.traj_gen) + mp_params, env_spec_params = self.env.episode_callback( + action, self.traj_gen) position, velocity = self.get_trajectory(mp_params) trajectory_length = len(position) rewards = np.zeros(shape=(trajectory_length,)) if self.verbose >= 2: - actions = np.zeros(shape=(trajectory_length,) + self.env.action_space.shape) + actions = np.zeros(shape=(trajectory_length,) + + self.env.action_space.shape) observations = np.zeros(shape=(trajectory_length,) + self.env.observation_space.shape, dtype=self.env.observation_space.dtype) @@ -153,9 +160,12 @@ class BlackBoxWrapper(gym.ObservationWrapper): self.plan_steps += 1 for t, (pos, vel) in enumerate(zip(position, velocity)): - step_action = self.tracking_controller.get_action(pos, vel, self.current_pos, self.current_vel) - c_action = np.clip(step_action, self.env.action_space.low, self.env.action_space.high) - obs, c_reward, done, info = self.env.step(c_action) + step_action = self.tracking_controller.get_action( + pos, vel, self.current_pos, self.current_vel) + c_action = np.clip( + step_action, self.env.action_space.low, self.env.action_space.high) + obs, c_reward, 
terminated, truncated, info = self.env.step( + c_action) rewards[t] = c_reward if self.verbose >= 2: @@ -170,9 +180,9 @@ class BlackBoxWrapper(gym.ObservationWrapper): if self.render_kwargs: self.env.render(**self.render_kwargs) - if done or (self.replanning_schedule(self.current_pos, self.current_vel, obs, c_action, - t + 1 + self.current_traj_steps) - and self.plan_steps < self.max_planning_times): + if terminated or truncated or (self.replanning_schedule(self.current_pos, self.current_vel, obs, c_action, + t + 1 + self.current_traj_steps) + and self.plan_steps < self.max_planning_times): if self.condition_on_desired: self.condition_pos = pos @@ -192,17 +202,18 @@ class BlackBoxWrapper(gym.ObservationWrapper): infos['trajectory_length'] = t + 1 trajectory_return = self.reward_aggregation(rewards[:t + 1]) - return self.observation(obs), trajectory_return, done, infos + return self.observation(obs), trajectory_return, terminated, truncated, infos def render(self, **kwargs): """Only set render options here, such that they can be used during the rollout. This only needs to be called once""" self.render_kwargs = kwargs - def reset(self, *, seed: Optional[int] = None, return_info: bool = False, options: Optional[dict] = None): + def reset(self, *, seed: Optional[int] = None, options: Optional[Dict[str, Any]] = None) \ + -> Tuple[ObsType, Dict[str, Any]]: self.current_traj_steps = 0 self.plan_steps = 0 self.traj_gen.reset() self.condition_pos = None self.condition_vel = None - return super(BlackBoxWrapper, self).reset() + return super(BlackBoxWrapper, self).reset(seed=seed, options=options) diff --git a/fancy_gym/black_box/raw_interface_wrapper.py b/fancy_gym/black_box/raw_interface_wrapper.py index 02945a1..b1a6aaa 100644 --- a/fancy_gym/black_box/raw_interface_wrapper.py +++ b/fancy_gym/black_box/raw_interface_wrapper.py @@ -1,6 +1,6 @@ from typing import Union, Tuple -import gym +import gymnasium as gym import numpy as np from mp_pytorch.mp.mp_interfaces import MPInterface diff --git a/fancy_gym/dmc/__init__.py b/fancy_gym/dmc/__init__.py index 397e6fa..5d7466c 100644 --- a/fancy_gym/dmc/__init__.py +++ b/fancy_gym/dmc/__init__.py @@ -1,14 +1,16 @@ from copy import deepcopy +from gymnasium.wrappers import FlattenObservation + from . 
import manipulation, suite ALL_DMC_MOVEMENT_PRIMITIVE_ENVIRONMENTS = {"DMP": [], "ProMP": [], "ProDMP": []} -from gym.envs.registration import register +from gymnasium.envs.registration import register DEFAULT_BB_DICT_ProMP = { "name": 'EnvName', - "wrappers": [], + "wrappers": [FlattenObservation], "trajectory_generator_kwargs": { 'trajectory_generator_type': 'promp' }, @@ -29,7 +31,7 @@ DEFAULT_BB_DICT_ProMP = { DEFAULT_BB_DICT_DMP = { "name": 'EnvName', - "wrappers": [], + "wrappers": [FlattenObservation], "trajectory_generator_kwargs": { 'trajectory_generator_type': 'dmp' }, @@ -49,7 +51,7 @@ DEFAULT_BB_DICT_DMP = { # DeepMind Control Suite (DMC) kwargs_dict_bic_dmp = deepcopy(DEFAULT_BB_DICT_DMP) -kwargs_dict_bic_dmp['name'] = f"dmc:ball_in_cup-catch" +kwargs_dict_bic_dmp['name'] = f"dm_control/ball_in_cup-catch-v0" kwargs_dict_bic_dmp['wrappers'].append(suite.ball_in_cup.MPWrapper) # bandwidth_factor=2 kwargs_dict_bic_dmp['phase_generator_kwargs']['alpha_phase'] = 2 @@ -62,7 +64,7 @@ register( ALL_DMC_MOVEMENT_PRIMITIVE_ENVIRONMENTS["DMP"].append("dmc_ball_in_cup-catch_dmp-v0") kwargs_dict_bic_promp = deepcopy(DEFAULT_BB_DICT_DMP) -kwargs_dict_bic_promp['name'] = f"dmc:ball_in_cup-catch" +kwargs_dict_bic_promp['name'] = f"dm_control/ball_in_cup-catch-v0" kwargs_dict_bic_promp['wrappers'].append(suite.ball_in_cup.MPWrapper) register( id=f'dmc_ball_in_cup-catch_promp-v0', @@ -72,7 +74,7 @@ register( ALL_DMC_MOVEMENT_PRIMITIVE_ENVIRONMENTS["ProMP"].append("dmc_ball_in_cup-catch_promp-v0") kwargs_dict_reacher_easy_dmp = deepcopy(DEFAULT_BB_DICT_DMP) -kwargs_dict_reacher_easy_dmp['name'] = f"dmc:reacher-easy" +kwargs_dict_reacher_easy_dmp['name'] = f"dm_control/reacher-easy-v0" kwargs_dict_reacher_easy_dmp['wrappers'].append(suite.reacher.MPWrapper) # bandwidth_factor=2 kwargs_dict_reacher_easy_dmp['phase_generator_kwargs']['alpha_phase'] = 2 @@ -86,7 +88,7 @@ register( ALL_DMC_MOVEMENT_PRIMITIVE_ENVIRONMENTS["DMP"].append("dmc_reacher-easy_dmp-v0") kwargs_dict_reacher_easy_promp = deepcopy(DEFAULT_BB_DICT_DMP) -kwargs_dict_reacher_easy_promp['name'] = f"dmc:reacher-easy" +kwargs_dict_reacher_easy_promp['name'] = f"dm_control/reacher-easy-v0" kwargs_dict_reacher_easy_promp['wrappers'].append(suite.reacher.MPWrapper) kwargs_dict_reacher_easy_promp['trajectory_generator_kwargs']['weight_scale'] = 0.2 register( @@ -97,7 +99,7 @@ register( ALL_DMC_MOVEMENT_PRIMITIVE_ENVIRONMENTS["ProMP"].append("dmc_reacher-easy_promp-v0") kwargs_dict_reacher_hard_dmp = deepcopy(DEFAULT_BB_DICT_DMP) -kwargs_dict_reacher_hard_dmp['name'] = f"dmc:reacher-hard" +kwargs_dict_reacher_hard_dmp['name'] = f"dm_control/reacher-hard-v0" kwargs_dict_reacher_hard_dmp['wrappers'].append(suite.reacher.MPWrapper) # bandwidth_factor = 2 kwargs_dict_reacher_hard_dmp['phase_generator_kwargs']['alpha_phase'] = 2 @@ -111,7 +113,7 @@ register( ALL_DMC_MOVEMENT_PRIMITIVE_ENVIRONMENTS["DMP"].append("dmc_reacher-hard_dmp-v0") kwargs_dict_reacher_hard_promp = deepcopy(DEFAULT_BB_DICT_DMP) -kwargs_dict_reacher_hard_promp['name'] = f"dmc:reacher-hard" +kwargs_dict_reacher_hard_promp['name'] = f"dm_control/reacher-hard-v0" kwargs_dict_reacher_hard_promp['wrappers'].append(suite.reacher.MPWrapper) kwargs_dict_reacher_hard_promp['trajectory_generator_kwargs']['weight_scale'] = 0.2 register( @@ -126,7 +128,7 @@ _dmc_cartpole_tasks = ["balance", "balance_sparse", "swingup", "swingup_sparse"] for _task in _dmc_cartpole_tasks: _env_id = f'dmc_cartpole-{_task}_dmp-v0' kwargs_dict_cartpole_dmp = deepcopy(DEFAULT_BB_DICT_DMP) - 
kwargs_dict_cartpole_dmp['name'] = f"dmc:cartpole-{_task}" + kwargs_dict_cartpole_dmp['name'] = f"dm_control/cartpole-{_task}-v0" kwargs_dict_cartpole_dmp['wrappers'].append(suite.cartpole.MPWrapper) # bandwidth_factor = 2 kwargs_dict_cartpole_dmp['phase_generator_kwargs']['alpha_phase'] = 2 @@ -143,7 +145,7 @@ for _task in _dmc_cartpole_tasks: _env_id = f'dmc_cartpole-{_task}_promp-v0' kwargs_dict_cartpole_promp = deepcopy(DEFAULT_BB_DICT_DMP) - kwargs_dict_cartpole_promp['name'] = f"dmc:cartpole-{_task}" + kwargs_dict_cartpole_promp['name'] = f"dm_control/cartpole-{_task}-v0" kwargs_dict_cartpole_promp['wrappers'].append(suite.cartpole.MPWrapper) kwargs_dict_cartpole_promp['controller_kwargs']['p_gains'] = 10 kwargs_dict_cartpole_promp['controller_kwargs']['d_gains'] = 10 @@ -156,7 +158,7 @@ for _task in _dmc_cartpole_tasks: ALL_DMC_MOVEMENT_PRIMITIVE_ENVIRONMENTS["ProMP"].append(_env_id) kwargs_dict_cartpole2poles_dmp = deepcopy(DEFAULT_BB_DICT_DMP) -kwargs_dict_cartpole2poles_dmp['name'] = f"dmc:cartpole-two_poles" +kwargs_dict_cartpole2poles_dmp['name'] = f"dm_control/cartpole-two_poles-v0" kwargs_dict_cartpole2poles_dmp['wrappers'].append(suite.cartpole.TwoPolesMPWrapper) # bandwidth_factor = 2 kwargs_dict_cartpole2poles_dmp['phase_generator_kwargs']['alpha_phase'] = 2 @@ -173,7 +175,7 @@ register( ALL_DMC_MOVEMENT_PRIMITIVE_ENVIRONMENTS["DMP"].append(_env_id) kwargs_dict_cartpole2poles_promp = deepcopy(DEFAULT_BB_DICT_DMP) -kwargs_dict_cartpole2poles_promp['name'] = f"dmc:cartpole-two_poles" +kwargs_dict_cartpole2poles_promp['name'] = f"dm_control/cartpole-two_poles-v0" kwargs_dict_cartpole2poles_promp['wrappers'].append(suite.cartpole.TwoPolesMPWrapper) kwargs_dict_cartpole2poles_promp['controller_kwargs']['p_gains'] = 10 kwargs_dict_cartpole2poles_promp['controller_kwargs']['d_gains'] = 10 @@ -187,7 +189,7 @@ register( ALL_DMC_MOVEMENT_PRIMITIVE_ENVIRONMENTS["ProMP"].append(_env_id) kwargs_dict_cartpole3poles_dmp = deepcopy(DEFAULT_BB_DICT_DMP) -kwargs_dict_cartpole3poles_dmp['name'] = f"dmc:cartpole-three_poles" +kwargs_dict_cartpole3poles_dmp['name'] = f"dm_control/cartpole-three_poles-v0" kwargs_dict_cartpole3poles_dmp['wrappers'].append(suite.cartpole.ThreePolesMPWrapper) # bandwidth_factor = 2 kwargs_dict_cartpole3poles_dmp['phase_generator_kwargs']['alpha_phase'] = 2 @@ -204,7 +206,7 @@ register( ALL_DMC_MOVEMENT_PRIMITIVE_ENVIRONMENTS["DMP"].append(_env_id) kwargs_dict_cartpole3poles_promp = deepcopy(DEFAULT_BB_DICT_DMP) -kwargs_dict_cartpole3poles_promp['name'] = f"dmc:cartpole-three_poles" +kwargs_dict_cartpole3poles_promp['name'] = f"dm_control/cartpole-three_poles-v0" kwargs_dict_cartpole3poles_promp['wrappers'].append(suite.cartpole.ThreePolesMPWrapper) kwargs_dict_cartpole3poles_promp['controller_kwargs']['p_gains'] = 10 kwargs_dict_cartpole3poles_promp['controller_kwargs']['d_gains'] = 10 @@ -219,7 +221,7 @@ ALL_DMC_MOVEMENT_PRIMITIVE_ENVIRONMENTS["ProMP"].append(_env_id) # DeepMind Manipulation kwargs_dict_mani_reach_site_features_dmp = deepcopy(DEFAULT_BB_DICT_DMP) -kwargs_dict_mani_reach_site_features_dmp['name'] = f"dmc:manipulation-reach_site_features" +kwargs_dict_mani_reach_site_features_dmp['name'] = f"dm_control/reach_site_features-v0" kwargs_dict_mani_reach_site_features_dmp['wrappers'].append(manipulation.reach_site.MPWrapper) kwargs_dict_mani_reach_site_features_dmp['phase_generator_kwargs']['alpha_phase'] = 2 # TODO: weight scale 50, but goal scale 0.1 @@ -233,7 +235,7 @@ register( 
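The registrations above are consumed through `fancy_gym.make`, whose signature after this patch is `make(env_id, seed, **kwargs)`. A minimal rollout sketch for one of the ProMP variants registered here, assuming dm_control and shimmy are installed (with the MP wrappers, a single black-box step executes the whole generated trajectory):

```python
import fancy_gym

# 'dmc_ball_in_cup-catch_promp-v0' is registered above; requires dm_control + shimmy.
env = fancy_gym.make('dmc_ball_in_cup-catch_promp-v0', seed=1)
obs, info = env.reset()
# One step = one full trajectory; the action is the MP parameter vector.
obs, trajectory_return, terminated, truncated, info = env.step(env.action_space.sample())
env.close()
```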
ALL_DMC_MOVEMENT_PRIMITIVE_ENVIRONMENTS["DMP"].append("dmc_manipulation-reach_site_dmp-v0") kwargs_dict_mani_reach_site_features_promp = deepcopy(DEFAULT_BB_DICT_DMP) -kwargs_dict_mani_reach_site_features_promp['name'] = f"dmc:manipulation-reach_site_features" +kwargs_dict_mani_reach_site_features_promp['name'] = f"dm_control/reach_site_features-v0" kwargs_dict_mani_reach_site_features_promp['wrappers'].append(manipulation.reach_site.MPWrapper) kwargs_dict_mani_reach_site_features_promp['trajectory_generator_kwargs']['weight_scale'] = 0.2 kwargs_dict_mani_reach_site_features_promp['controller_kwargs']['controller_type'] = 'velocity' diff --git a/fancy_gym/dmc/dmc_wrapper.py b/fancy_gym/dmc/dmc_wrapper.py index b1522c3..d1e5f0d 100644 --- a/fancy_gym/dmc/dmc_wrapper.py +++ b/fancy_gym/dmc/dmc_wrapper.py @@ -3,15 +3,15 @@ # Copyright (c) 2020 Denis Yarats import collections from collections.abc import MutableMapping -from typing import Any, Dict, Tuple, Optional, Union, Callable +from typing import Any, Dict, Tuple, Optional, Union, Callable, SupportsFloat -import gym +import gymnasium as gym import numpy as np from dm_control import composer from dm_control.rl import control from dm_env import specs -from gym import spaces -from gym.core import ObsType +from gymnasium import spaces +from gymnasium.core import ObsType, ActType def _spec_to_box(spec): @@ -100,23 +100,23 @@ class DMCWrapper(gym.Env): self._action_space.seed(seed) self._observation_space.seed(seed) - def step(self, action) -> Tuple[np.ndarray, float, bool, Dict[str, Any]]: + def step(self, action: ActType) -> Tuple[ObsType, SupportsFloat, bool, bool, Dict[str, Any]]: assert self._action_space.contains(action) extra = {'internal_state': self._env.physics.get_state().copy()} - time_step = self._env.step(action) reward = time_step.reward or 0. 
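For reference, dm_env encodes episode ends in the final `TimeStep`: a true terminal state arrives with `discount == 0`, while a time-limit cut-off ends the episode with `discount > 0`, which is what the `truncated` expression below relies on. A small sketch of the general mapping (the helper name is ours, not part of the patch):

```python
from dm_env import StepType


def split_done(time_step):
    """Map a dm_env TimeStep onto gymnasium's (terminated, truncated) pair."""
    if time_step.step_type is not StepType.LAST:
        return False, False
    # discount == 0 marks a real terminal state, discount > 0 a time-limit cut-off
    terminated = bool(time_step.discount == 0.0)
    return terminated, not terminated
```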
- done = time_step.last() + terminated = False + truncated = time_step.last() and time_step.discount > 0 obs = self._get_obs(time_step) extra['discount'] = time_step.discount - return obs, reward, done, extra + return obs, reward, terminated, truncated, extra - def reset(self, *, seed: Optional[int] = None, return_info: bool = False, - options: Optional[dict] = None, ) -> Union[ObsType, Tuple[ObsType, dict]]: + def reset(self, *, seed: Optional[int] = None, options: Optional[Dict[str, Any]] = None) \ + -> Tuple[ObsType, Dict[str, Any]]: time_step = self._env.reset() obs = self._get_obs(time_step) - return obs + return obs, {} def render(self, mode='rgb_array', height=240, width=320, camera_id=-1, overlays=(), depth=False, segmentation=False, scene_option=None, render_flag_overrides=None): diff --git a/fancy_gym/dmc/manipulation/reach_site/mp_wrapper.py b/fancy_gym/dmc/manipulation/reach_site/mp_wrapper.py index f64ac4a..908cee1 100644 --- a/fancy_gym/dmc/manipulation/reach_site/mp_wrapper.py +++ b/fancy_gym/dmc/manipulation/reach_site/mp_wrapper.py @@ -35,4 +35,4 @@ class MPWrapper(RawInterfaceWrapper): @property def dt(self) -> Union[float, int]: - return self.env.dt + return self.env.control_timestep() diff --git a/fancy_gym/dmc/suite/ball_in_cup/mp_wrapper.py b/fancy_gym/dmc/suite/ball_in_cup/mp_wrapper.py index dc6a539..94f9041 100644 --- a/fancy_gym/dmc/suite/ball_in_cup/mp_wrapper.py +++ b/fancy_gym/dmc/suite/ball_in_cup/mp_wrapper.py @@ -31,4 +31,4 @@ class MPWrapper(RawInterfaceWrapper): @property def dt(self) -> Union[float, int]: - return self.env.dt + return self.env.control_timestep() diff --git a/fancy_gym/dmc/suite/cartpole/mp_wrapper.py b/fancy_gym/dmc/suite/cartpole/mp_wrapper.py index 7edd51f..85afa83 100644 --- a/fancy_gym/dmc/suite/cartpole/mp_wrapper.py +++ b/fancy_gym/dmc/suite/cartpole/mp_wrapper.py @@ -35,7 +35,7 @@ class MPWrapper(RawInterfaceWrapper): @property def dt(self) -> Union[float, int]: - return self.env.dt + return self.env.control_timestep() class TwoPolesMPWrapper(MPWrapper): diff --git a/fancy_gym/dmc/suite/reacher/mp_wrapper.py b/fancy_gym/dmc/suite/reacher/mp_wrapper.py index 5ac52e5..2d0aee5 100644 --- a/fancy_gym/dmc/suite/reacher/mp_wrapper.py +++ b/fancy_gym/dmc/suite/reacher/mp_wrapper.py @@ -30,4 +30,4 @@ class MPWrapper(RawInterfaceWrapper): @property def dt(self) -> Union[float, int]: - return self.env.dt + return self.env.control_timestep() diff --git a/fancy_gym/envs/__init__.py b/fancy_gym/envs/__init__.py index d504990..af66cec 100644 --- a/fancy_gym/envs/__init__.py +++ b/fancy_gym/envs/__init__.py @@ -1,7 +1,7 @@ from copy import deepcopy import numpy as np -from gym import register +from gymnasium import register from . 
import classic_control, mujoco from .classic_control.hole_reacher.hole_reacher import HoleReacherEnv diff --git a/fancy_gym/envs/classic_control/base_reacher/base_reacher.py b/fancy_gym/envs/classic_control/base_reacher/base_reacher.py index f2ba135..18305fd 100644 --- a/fancy_gym/envs/classic_control/base_reacher/base_reacher.py +++ b/fancy_gym/envs/classic_control/base_reacher/base_reacher.py @@ -1,10 +1,10 @@ -from typing import Union, Tuple, Optional +from typing import Union, Tuple, Optional, Any, Dict -import gym +import gymnasium as gym import numpy as np -from gym import spaces -from gym.core import ObsType -from gym.utils import seeding +from gymnasium import spaces +from gymnasium.core import ObsType +from gymnasium.utils import seeding from fancy_gym.envs.classic_control.utils import intersect @@ -55,7 +55,6 @@ class BaseReacherEnv(gym.Env): self.fig = None self._steps = 0 - self.seed() @property def dt(self) -> Union[float, int]: @@ -69,10 +68,15 @@ class BaseReacherEnv(gym.Env): def current_vel(self): return self._angle_velocity.copy() - def reset(self, *, seed: Optional[int] = None, return_info: bool = False, - options: Optional[dict] = None, ) -> Union[ObsType, Tuple[ObsType, dict]]: + def reset(self, *, seed: Optional[int] = None, options: Optional[Dict[str, Any]] = None) \ + -> Tuple[ObsType, Dict[str, Any]]: # Sample only orientation of first link, i.e. the arm is always straight. - if self.random_start: + super(BaseReacherEnv, self).reset(seed=seed, options=options) + try: + random_start = options.get('random_start', self.random_start) + except AttributeError: + random_start = self.random_start + if random_start: first_joint = self.np_random.uniform(np.pi / 4, 3 * np.pi / 4) self._joint_angles = np.hstack([[first_joint], np.zeros(self.n_links - 1)]) self._start_pos = self._joint_angles.copy() @@ -84,7 +88,7 @@ class BaseReacherEnv(gym.Env): self._update_joints() self._steps = 0 - return self._get_obs().copy() + return self._get_obs().copy(), {} def _update_joints(self): """ @@ -124,10 +128,6 @@ class BaseReacherEnv(gym.Env): def _terminate(self, info) -> bool: raise NotImplementedError - def seed(self, seed=None): - self.np_random, seed = seeding.np_random(seed) - return [seed] - def close(self): super(BaseReacherEnv, self).close() del self.fig diff --git a/fancy_gym/envs/classic_control/base_reacher/base_reacher_direct.py b/fancy_gym/envs/classic_control/base_reacher/base_reacher_direct.py index ab21b39..6878922 100644 --- a/fancy_gym/envs/classic_control/base_reacher/base_reacher_direct.py +++ b/fancy_gym/envs/classic_control/base_reacher/base_reacher_direct.py @@ -1,5 +1,5 @@ import numpy as np -from gym import spaces +from gymnasium import spaces from fancy_gym.envs.classic_control.base_reacher.base_reacher import BaseReacherEnv @@ -32,6 +32,7 @@ class BaseReacherDirectEnv(BaseReacherEnv): reward, info = self._get_reward(action) self._steps += 1 - done = self._terminate(info) + terminated = self._terminate(info) + truncated = False - return self._get_obs().copy(), reward, done, info + return self._get_obs().copy(), reward, terminated, truncated, info diff --git a/fancy_gym/envs/classic_control/base_reacher/base_reacher_torque.py b/fancy_gym/envs/classic_control/base_reacher/base_reacher_torque.py index 7364948..c9a7d4f 100644 --- a/fancy_gym/envs/classic_control/base_reacher/base_reacher_torque.py +++ b/fancy_gym/envs/classic_control/base_reacher/base_reacher_torque.py @@ -1,5 +1,5 @@ import numpy as np -from gym import spaces +from gymnasium import spaces from 
fancy_gym.envs.classic_control.base_reacher.base_reacher import BaseReacherEnv @@ -31,6 +31,7 @@ class BaseReacherTorqueEnv(BaseReacherEnv): reward, info = self._get_reward(action) self._steps += 1 - done = False + terminated = False + truncated = False - return self._get_obs().copy(), reward, done, info + return self._get_obs().copy(), reward, terminated, truncated, info diff --git a/fancy_gym/envs/classic_control/hole_reacher/hole_reacher.py b/fancy_gym/envs/classic_control/hole_reacher/hole_reacher.py index 5563ea6..0ed03f2 100644 --- a/fancy_gym/envs/classic_control/hole_reacher/hole_reacher.py +++ b/fancy_gym/envs/classic_control/hole_reacher/hole_reacher.py @@ -1,9 +1,10 @@ -from typing import Union, Optional, Tuple +from typing import Union, Optional, Tuple, Any, Dict -import gym +import gymnasium as gym import matplotlib.pyplot as plt import numpy as np -from gym.core import ObsType +from gymnasium import spaces +from gymnasium.core import ObsType from matplotlib import patches from fancy_gym.envs.classic_control.base_reacher.base_reacher_direct import BaseReacherDirectEnv @@ -40,7 +41,7 @@ class HoleReacherEnv(BaseReacherDirectEnv): [np.inf] # env steps, because reward start after n steps TODO: Maybe ]) # self.action_space = gym.spaces.Box(low=-action_bound, high=action_bound, shape=action_bound.shape) - self.observation_space = gym.spaces.Box(low=-state_bound, high=state_bound, shape=state_bound.shape) + self.observation_space = spaces.Box(low=-state_bound, high=state_bound, shape=state_bound.shape) if rew_fct == "simple": from fancy_gym.envs.classic_control.hole_reacher.hr_simple_reward import HolereacherReward @@ -54,13 +55,18 @@ class HoleReacherEnv(BaseReacherDirectEnv): else: raise ValueError("Unknown reward function {}".format(rew_fct)) - def reset(self, *, seed: Optional[int] = None, return_info: bool = False, - options: Optional[dict] = None, ) -> Union[ObsType, Tuple[ObsType, dict]]: + def reset(self, *, seed: Optional[int] = None, options: Optional[Dict[str, Any]] = None) \ + -> Tuple[ObsType, Dict[str, Any]]: + + # initialize seed here as the random goal needs to be generated before the super reset() + gym.Env.reset(self, seed=seed, options=options) + self._generate_hole() self._set_patches() self.reward_function.reset() - return super().reset() + # do not provide seed to avoid setting it twice + return super(HoleReacherEnv, self).reset(options=options) def _get_reward(self, action: np.ndarray) -> (float, dict): return self.reward_function.get_reward(self) @@ -223,16 +229,3 @@ class HoleReacherEnv(BaseReacherDirectEnv): self.fig.gca().add_patch(left_block) self.fig.gca().add_patch(right_block) self.fig.gca().add_patch(hole_floor) - - -if __name__ == "__main__": - - env = HoleReacherEnv(5) - env.reset() - - for i in range(10000): - ac = env.action_space.sample() - obs, rew, done, info = env.step(ac) - env.render() - if done: - env.reset() diff --git a/fancy_gym/envs/classic_control/simple_reacher/simple_reacher.py b/fancy_gym/envs/classic_control/simple_reacher/simple_reacher.py index 9b03147..4ef25ea 100644 --- a/fancy_gym/envs/classic_control/simple_reacher/simple_reacher.py +++ b/fancy_gym/envs/classic_control/simple_reacher/simple_reacher.py @@ -1,9 +1,9 @@ -from typing import Iterable, Union, Optional, Tuple +from typing import Iterable, Union, Optional, Tuple, Any, Dict import matplotlib.pyplot as plt import numpy as np -from gym import spaces -from gym.core import ObsType +from gymnasium import spaces +from gymnasium.core import ObsType from 
fancy_gym.envs.classic_control.base_reacher.base_reacher_torque import BaseReacherTorqueEnv @@ -42,11 +42,10 @@ class SimpleReacherEnv(BaseReacherTorqueEnv): # def start_pos(self): # return self._start_pos - def reset(self, *, seed: Optional[int] = None, return_info: bool = False, - options: Optional[dict] = None, ) -> Union[ObsType, Tuple[ObsType, dict]]: + def reset(self, *, seed: Optional[int] = None, options: Optional[Dict[str, Any]] = None) \ + -> Tuple[ObsType, Dict[str, Any]]: self._generate_goal() - - return super().reset() + return super().reset(seed=seed, options=options) def _get_reward(self, action: np.ndarray): diff = self.end_effector - self._goal @@ -128,14 +127,3 @@ class SimpleReacherEnv(BaseReacherTorqueEnv): self.fig.canvas.draw() self.fig.canvas.flush_events() - -if __name__ == "__main__": - env = SimpleReacherEnv(5) - env.reset() - for i in range(200): - ac = env.action_space.sample() - obs, rew, done, info = env.step(ac) - - env.render() - if done: - break diff --git a/fancy_gym/envs/classic_control/viapoint_reacher/viapoint_reacher.py b/fancy_gym/envs/classic_control/viapoint_reacher/viapoint_reacher.py index f3412ac..ba5efd2 100644 --- a/fancy_gym/envs/classic_control/viapoint_reacher/viapoint_reacher.py +++ b/fancy_gym/envs/classic_control/viapoint_reacher/viapoint_reacher.py @@ -1,9 +1,10 @@ -from typing import Iterable, Union, Tuple, Optional +from typing import Iterable, Union, Tuple, Optional, Any, Dict -import gym +import gymnasium as gym import matplotlib.pyplot as plt import numpy as np -from gym.core import ObsType +from gymnasium import spaces +from gymnasium.core import ObsType from fancy_gym.envs.classic_control.base_reacher.base_reacher_direct import BaseReacherDirectEnv @@ -34,16 +35,16 @@ class ViaPointReacherEnv(BaseReacherDirectEnv): [np.inf] * 2, # x-y coordinates of target distance [np.inf] # env steps, because reward start after n steps ]) - self.observation_space = gym.spaces.Box(low=-state_bound, high=state_bound, shape=state_bound.shape) + self.observation_space = spaces.Box(low=-state_bound, high=state_bound, shape=state_bound.shape) # @property # def start_pos(self): # return self._start_pos - def reset(self, *, seed: Optional[int] = None, return_info: bool = False, - options: Optional[dict] = None, ) -> Union[ObsType, Tuple[ObsType, dict]]: + def reset(self, *, seed: Optional[int] = None, options: Optional[Dict[str, Any]] = None) \ + -> Tuple[ObsType, Dict[str, Any]]: self._generate_goal() - return super().reset() + return super().reset(seed=seed, options=options) def _generate_goal(self): # TODO: Maybe improve this later, this can yield quite a lot of invalid settings @@ -185,14 +186,3 @@ class ViaPointReacherEnv(BaseReacherDirectEnv): plt.pause(0.01) -if __name__ == "__main__": - - env = ViaPointReacherEnv(5) - env.reset() - - for i in range(10000): - ac = env.action_space.sample() - obs, rew, done, info = env.step(ac) - env.render() - if done: - env.reset() diff --git a/fancy_gym/envs/mujoco/ant_jump/ant_jump.py b/fancy_gym/envs/mujoco/ant_jump/ant_jump.py index 9311ae1..fbf0804 100644 --- a/fancy_gym/envs/mujoco/ant_jump/ant_jump.py +++ b/fancy_gym/envs/mujoco/ant_jump/ant_jump.py @@ -1,8 +1,8 @@ -from typing import Tuple, Union, Optional +from typing import Tuple, Union, Optional, Any, Dict import numpy as np -from gym.core import ObsType -from gym.envs.mujoco.ant_v4 import AntEnv +from gymnasium.core import ObsType +from gymnasium.envs.mujoco.ant_v4 import AntEnv MAX_EPISODE_STEPS_ANTJUMP = 200 @@ -61,9 +61,10 @@ class 
AntJumpEnv(AntEnv): costs = ctrl_cost + contact_cost - done = bool(height < 0.3) # fall over -> is the 0.3 value from healthy_z_range? TODO change 0.3 to the value of healthy z angle + terminated = bool( + height < 0.3) # fall over -> is the 0.3 value from healthy_z_range? TODO change 0.3 to the value of healthy z angle - if self.current_step == MAX_EPISODE_STEPS_ANTJUMP or done: + if self.current_step == MAX_EPISODE_STEPS_ANTJUMP or terminated: # -10 for scaling the value of the distance between the max_height and the goal height; only used when context is enabled # height_reward = -10 * (np.linalg.norm(self.max_height - self.goal)) height_reward = -10 * np.linalg.norm(self.max_height - self.goal) @@ -80,19 +81,20 @@ class AntJumpEnv(AntEnv): 'max_height': self.max_height, 'goal': self.goal } + truncated = False - return obs, reward, done, info + return obs, reward, terminated, truncated, info def _get_obs(self): return np.append(super()._get_obs(), self.goal) - def reset(self, *, seed: Optional[int] = None, return_info: bool = False, - options: Optional[dict] = None, ) -> Union[ObsType, Tuple[ObsType, dict]]: + def reset(self, *, seed: Optional[int] = None, options: Optional[Dict[str, Any]] = None) \ + -> Tuple[ObsType, Dict[str, Any]]: self.current_step = 0 self.max_height = 0 # goal heights from 1.0 to 2.5; can be increased, but didnt work well with CMORE self.goal = self.np_random.uniform(1.0, 2.5, 1) - return super().reset() + return super().reset(seed=seed, options=options) # reset_model had to be implemented in every env to make it deterministic def reset_model(self): diff --git a/fancy_gym/envs/mujoco/beerpong/beerpong.py b/fancy_gym/envs/mujoco/beerpong/beerpong.py index 368425d..6a37e66 100644 --- a/fancy_gym/envs/mujoco/beerpong/beerpong.py +++ b/fancy_gym/envs/mujoco/beerpong/beerpong.py @@ -1,9 +1,10 @@ import os -from typing import Optional +from typing import Optional, Any, Dict, Tuple import numpy as np -from gym import utils -from gym.envs.mujoco import MujocoEnv +from gymnasium import utils +from gymnasium.core import ObsType +from gymnasium.envs.mujoco import MujocoEnv MAX_EPISODE_STEPS_BEERPONG = 300 FIXED_RELEASE_STEP = 62 # empirically evaluated for frame_skip=2! @@ -30,7 +31,7 @@ CUP_COLLISION_OBJ = ["cup_geom_table3", "cup_geom_table4", "cup_geom_table5", "c class BeerPongEnv(MujocoEnv, utils.EzPickle): - def __init__(self): + def __init__(self, **kwargs): self._steps = 0 # Small Context -> Easier. Todo: Should we do different versions? 
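The `reset` signature adopted above (and throughout the patch) follows the gymnasium contract: `seed` and `options` are keyword-only and the method returns an `(obs, info)` tuple. Some envs in this patch additionally read per-reset settings from `options`, e.g. `BaseReacherEnv`'s `random_start`. A minimal consumer sketch; the env id is only illustrative:

```python
import gymnasium as gym

env = gym.make("Hopper-v4")
obs, info = env.reset(seed=1, options=None)   # reset now returns (obs, info)
obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
env.close()
```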
# self.xml_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "beerpong_wo_cup.xml") @@ -65,7 +66,13 @@ class BeerPongEnv(MujocoEnv, utils.EzPickle): self.ball_in_cup = False self.dist_ground_cup = -1 # distance floor to cup if first floor contact - MujocoEnv.__init__(self, model_path=self.xml_path, frame_skip=1, mujoco_bindings="mujoco") + MujocoEnv.__init__( + self, + self.xml_path, + frame_skip=1, + observation_space=self.observation_space, + **kwargs + ) utils.EzPickle.__init__(self) @property @@ -76,7 +83,8 @@ class BeerPongEnv(MujocoEnv, utils.EzPickle): def start_vel(self): return self._start_vel - def reset(self, *, seed: Optional[int] = None, return_info: bool = False, options: Optional[dict] = None): + def reset(self, *, seed: Optional[int] = None, options: Optional[Dict[str, Any]] = None) \ + -> Tuple[ObsType, Dict[str, Any]]: self.dists = [] self.dists_final = [] self.action_costs = [] @@ -86,7 +94,7 @@ class BeerPongEnv(MujocoEnv, utils.EzPickle): self.ball_cup_contact = False self.ball_in_cup = False self.dist_ground_cup = -1 # distance floor to cup if first floor contact - return super().reset() + return super().reset(seed=seed, options=options) def reset_model(self): init_pos_all = self.init_qpos.copy() @@ -128,11 +136,11 @@ class BeerPongEnv(MujocoEnv, utils.EzPickle): if not crash: reward, reward_infos = self._get_reward(applied_action) is_collided = reward_infos['is_collided'] # TODO: Remove if self collision does not make a difference - done = is_collided + terminated = is_collided self._steps += 1 else: reward = -30 - done = True + terminated = True reward_infos = {"success": False, "ball_pos": np.zeros(3), "ball_vel": np.zeros(3), "is_collided": False} infos = dict( @@ -142,7 +150,10 @@ class BeerPongEnv(MujocoEnv, utils.EzPickle): q_vel=self.data.qvel[0:7].ravel().copy(), sim_crash=crash, ) infos.update(reward_infos) - return ob, reward, done, infos + + truncated = False + + return ob, reward, terminated, truncated, infos def _get_obs(self): theta = self.data.qpos.flat[:7].copy() @@ -258,9 +269,9 @@ class BeerPongEnvStepBasedEpisodicReward(BeerPongEnv): return super(BeerPongEnvStepBasedEpisodicReward, self).step(a) else: reward = 0 - done = True + terminated, truncated = True, False while self._steps < MAX_EPISODE_STEPS_BEERPONG: - obs, sub_reward, done, infos = super(BeerPongEnvStepBasedEpisodicReward, self).step( + obs, sub_reward, terminated, truncated, infos = super(BeerPongEnvStepBasedEpisodicReward, self).step( np.zeros(a.shape)) reward += sub_reward - return obs, reward, done, infos + return obs, reward, terminated, truncated, infos diff --git a/fancy_gym/envs/mujoco/beerpong/deprecated/beerpong.py b/fancy_gym/envs/mujoco/beerpong/deprecated/beerpong.py index 015e887..2fc98ba 100644 --- a/fancy_gym/envs/mujoco/beerpong/deprecated/beerpong.py +++ b/fancy_gym/envs/mujoco/beerpong/deprecated/beerpong.py @@ -2,8 +2,8 @@ import os import mujoco_py.builder import numpy as np -from gym import utils -from gym.envs.mujoco import MujocoEnv +from gymnasium import utils +from gymnasium.envs.mujoco import MujocoEnv from fancy_gym.envs.mujoco.beerpong.deprecated.beerpong_reward_staged import BeerPongReward @@ -90,11 +90,11 @@ class BeerPongEnv(MujocoEnv, utils.EzPickle): if not crash: reward, reward_infos = self.reward_function.compute_reward(self, applied_action) is_collided = reward_infos['is_collided'] - done = is_collided or self._steps == self.ep_length - 1 + terminated = is_collided or self._steps == self.ep_length - 1 self._steps += 1 else: 
reward = -30 - done = True + terminated = True reward_infos = {"success": False, "ball_pos": np.zeros(3), "ball_vel": np.zeros(3), "is_collided": False} infos = dict( @@ -104,7 +104,7 @@ class BeerPongEnv(MujocoEnv, utils.EzPickle): q_vel=self.sim.data.qvel[0:7].ravel().copy(), sim_crash=crash, ) infos.update(reward_infos) - return ob, reward, done, infos + return ob, reward, terminated, infos def _get_obs(self): theta = self.sim.data.qpos.flat[:7] @@ -143,16 +143,16 @@ class BeerPongEnvStepBasedEpisodicReward(BeerPongEnv): return super(BeerPongEnvStepBasedEpisodicReward, self).step(a) else: reward = 0 - done = False - while not done: - sub_ob, sub_reward, done, sub_infos = super(BeerPongEnvStepBasedEpisodicReward, self).step( - np.zeros(a.shape)) + terminated, truncated = False, False + while not (terminated or truncated): + sub_ob, sub_reward, terminated, truncated, sub_infos = super(BeerPongEnvStepBasedEpisodicReward, + self).step(np.zeros(a.shape)) reward += sub_reward infos = sub_infos ob = sub_ob ob[-1] = self.release_step + 1 # Since we simulate until the end of the episode, PPO does not see the # internal steps and thus, the observation also needs to be set correctly - return ob, reward, done, infos + return ob, reward, terminated, truncated, infos # class BeerBongEnvStepBased(BeerBongEnv): @@ -186,27 +186,3 @@ class BeerPongEnvStepBasedEpisodicReward(BeerPongEnv): # ob[-1] = self.release_step + 1 # Since we simulate until the end of the episode, PPO does not see the # # internal steps and thus, the observation also needs to be set correctly # return ob, reward, done, infos - - -if __name__ == "__main__": - env = BeerPongEnv(frame_skip=2) - env.seed(0) - # env = BeerBongEnvStepBased(frame_skip=2) - # env = BeerBongEnvStepBasedEpisodicReward(frame_skip=2) - # env = BeerBongEnvFixedReleaseStep(frame_skip=2) - import time - - env.reset() - env.render("human") - for i in range(600): - # ac = 10 * env.action_space.sample() - ac = 0.05 * np.ones(7) - obs, rew, d, info = env.step(ac) - env.render("human") - - if d: - print('reward:', rew) - print('RESETTING') - env.reset() - time.sleep(1) - env.close() diff --git a/fancy_gym/envs/mujoco/half_cheetah_jump/half_cheetah_jump.py b/fancy_gym/envs/mujoco/half_cheetah_jump/half_cheetah_jump.py index e0a5982..853c5e7 100644 --- a/fancy_gym/envs/mujoco/half_cheetah_jump/half_cheetah_jump.py +++ b/fancy_gym/envs/mujoco/half_cheetah_jump/half_cheetah_jump.py @@ -1,9 +1,9 @@ import os -from typing import Tuple, Union, Optional +from typing import Tuple, Union, Optional, Any, Dict import numpy as np -from gym.core import ObsType -from gym.envs.mujoco.half_cheetah_v4 import HalfCheetahEnv +from gymnasium.core import ObsType +from gymnasium.envs.mujoco.half_cheetah_v4 import HalfCheetahEnv MAX_EPISODE_STEPS_HALFCHEETAHJUMP = 100 @@ -44,7 +44,8 @@ class HalfCheetahJumpEnv(HalfCheetahEnv): ## Didnt use fell_over, because base env also has no done condition - Paul and Marc # fell_over = abs(self.sim.data.qpos[2]) > 2.5 # how to figure out if the cheetah fell over? -> 2.5 oke? # TODO: Should a fall over be checked here? 
- done = False + terminated = False + truncated = False ctrl_cost = self.control_cost(action) costs = ctrl_cost @@ -63,17 +64,17 @@ class HalfCheetahJumpEnv(HalfCheetahEnv): 'max_height': self.max_height } - return observation, reward, done, info + return observation, reward, terminated, truncated, info def _get_obs(self): return np.append(super()._get_obs(), self.goal) - def reset(self, *, seed: Optional[int] = None, return_info: bool = False, - options: Optional[dict] = None, ) -> Union[ObsType, Tuple[ObsType, dict]]: + def reset(self, *, seed: Optional[int] = None, options: Optional[Dict[str, Any]] = None) \ + -> Tuple[ObsType, Dict[str, Any]]: self.max_height = 0 self.current_step = 0 self.goal = self.np_random.uniform(1.1, 1.6, 1) # 1.1 1.6 - return super().reset() + return super().reset(seed=seed, options=options) # overwrite reset_model to make it deterministic def reset_model(self): diff --git a/fancy_gym/envs/mujoco/hopper_jump/hopper_jump.py b/fancy_gym/envs/mujoco/hopper_jump/hopper_jump.py index da9ac4d..8ee4b11 100644 --- a/fancy_gym/envs/mujoco/hopper_jump/hopper_jump.py +++ b/fancy_gym/envs/mujoco/hopper_jump/hopper_jump.py @@ -1,7 +1,7 @@ import os import numpy as np -from gym.envs.mujoco.hopper_v4 import HopperEnv +from gymnasium.envs.mujoco.hopper_v4 import HopperEnv MAX_EPISODE_STEPS_HOPPERJUMP = 250 @@ -73,7 +73,7 @@ class HopperJumpEnv(HopperEnv): self.do_simulation(action, self.frame_skip) height_after = self.get_body_com("torso")[2] - #site_pos_after = self.data.get_site_xpos('foot_site') + # site_pos_after = self.data.get_site_xpos('foot_site') site_pos_after = self.data.site('foot_site').xpos self.max_height = max(height_after, self.max_height) @@ -88,7 +88,8 @@ class HopperJumpEnv(HopperEnv): ctrl_cost = self.control_cost(action) costs = ctrl_cost - done = False + terminated = False + truncated = False goal_dist = np.linalg.norm(site_pos_after - self.goal) if self.contact_dist is None and self.contact_with_floor: @@ -115,7 +116,7 @@ class HopperJumpEnv(HopperEnv): healthy=self.is_healthy, contact_dist=self.contact_dist or 0 ) - return observation, reward, done, info + return observation, reward, terminated, truncated, info def _get_obs(self): # goal_dist = self.data.get_site_xpos('foot_site') - self.goal diff --git a/fancy_gym/envs/mujoco/hopper_jump/hopper_jump_on_box.py b/fancy_gym/envs/mujoco/hopper_jump/hopper_jump_on_box.py index f9834bd..a31e8ee 100644 --- a/fancy_gym/envs/mujoco/hopper_jump/hopper_jump_on_box.py +++ b/fancy_gym/envs/mujoco/hopper_jump/hopper_jump_on_box.py @@ -1,7 +1,9 @@ import os +from typing import Optional, Dict, Any, Tuple import numpy as np -from gym.envs.mujoco.hopper_v4 import HopperEnv +from gymnasium.core import ObsType +from gymnasium.envs.mujoco.hopper_v4 import HopperEnv MAX_EPISODE_STEPS_HOPPERJUMPONBOX = 250 @@ -74,10 +76,10 @@ class HopperJumpOnBoxEnv(HopperEnv): costs = ctrl_cost - done = fell_over or self.hopper_on_box + terminated = fell_over or self.hopper_on_box - if self.current_step >= self.max_episode_steps or done: - done = False + if self.current_step >= self.max_episode_steps or terminated: + done = False # TODO why are we doing this??? 
max_height = self.max_height.copy() min_distance = self.min_distance.copy() @@ -122,12 +124,14 @@ 'goal': self.box_x, } - return observation, reward, done, info + truncated = False + return observation, reward, terminated, truncated, info def _get_obs(self): return np.append(super()._get_obs(), self.box_x) - def reset(self): + def reset(self, *, seed: Optional[int] = None, options: Optional[Dict[str, Any]] = None) \ + -> Tuple[ObsType, Dict[str, Any]]: self.max_height = 0 self.min_distance = 5000 @@ -136,7 +140,7 @@ if self.context: self.box_x = self.np_random.uniform(1, 3, 1) self.model.body("box").pos = [self.box_x[0], 0, 0] - return super().reset() + return super().reset(seed=seed, options=options) # overwrite reset_model to make it deterministic def reset_model(self): @@ -151,20 +155,5 @@ observation = self._get_obs() return observation -if __name__ == '__main__': - render_mode = "human" # "human" or "partial" or "final" - env = HopperJumpOnBoxEnv() - obs = env.reset() - for i in range(2000): - # objective.load_result("/tmp/cma") - # test with random actions - ac = env.action_space.sample() - obs, rew, d, info = env.step(ac) - if i % 10 == 0: - env.render(mode=render_mode) - if d: - print('After ', i, ' steps, done: ', d) - env.reset() - env.close() \ No newline at end of file diff --git a/fancy_gym/envs/mujoco/hopper_throw/hopper_throw.py b/fancy_gym/envs/mujoco/hopper_throw/hopper_throw.py index e69cea6..ed2bf96 100644 --- a/fancy_gym/envs/mujoco/hopper_throw/hopper_throw.py +++ b/fancy_gym/envs/mujoco/hopper_throw/hopper_throw.py @@ -1,8 +1,9 @@ import os -from typing import Optional +from typing import Optional, Any, Dict, Tuple import numpy as np -from gym.envs.mujoco.hopper_v4 import HopperEnv +from gymnasium.core import ObsType +from gymnasium.envs.mujoco.hopper_v4 import HopperEnv MAX_EPISODE_STEPS_HOPPERTHROW = 250 @@ -56,14 +57,14 @@ class HopperThrowEnv(HopperEnv): # done = self.done TODO We should use this, not sure why there is no other termination; ball_landed should be enough, because we only look at the throw itself?
- Paul and Marc ball_landed = bool(self.get_body_com("ball")[2] <= 0.05) - done = ball_landed + terminated = ball_landed ctrl_cost = self.control_cost(action) costs = ctrl_cost rewards = 0 - if self.current_step >= self.max_episode_steps or done: + if self.current_step >= self.max_episode_steps or terminated: distance_reward = -np.linalg.norm(ball_pos_after - self.goal) if self.context else \ self._forward_reward_weight * ball_pos_after healthy_reward = 0 if self.context else self.healthy_reward * self.current_step @@ -78,16 +79,18 @@ class HopperThrowEnv(HopperEnv): '_steps': self.current_step, 'goal': self.goal, } + truncated = False - return observation, reward, done, info + return observation, reward, terminated, truncated, info def _get_obs(self): return np.append(super()._get_obs(), self.goal) - def reset(self, *, seed: Optional[int] = None, return_info: bool = False, options: Optional[dict] = None): + def reset(self, *, seed: Optional[int] = None, options: Optional[Dict[str, Any]] = None) \ + -> Tuple[ObsType, Dict[str, Any]]: self.current_step = 0 self.goal = self.goal = self.np_random.uniform(2.0, 6.0, 1) # 0.5 8.0 - return super().reset() + return super().reset(seed=seed, options=options) # overwrite reset_model to make it deterministic def reset_model(self): @@ -103,20 +106,3 @@ class HopperThrowEnv(HopperEnv): return observation -if __name__ == '__main__': - render_mode = "human" # "human" or "partial" or "final" - env = HopperThrowEnv() - obs = env.reset() - - for i in range(2000): - # objective.load_result("/tmp/cma") - # test with random actions - ac = env.action_space.sample() - obs, rew, d, info = env.step(ac) - if i % 10 == 0: - env.render(mode=render_mode) - if d: - print('After ', i, ' steps, done: ', d) - env.reset() - - env.close() diff --git a/fancy_gym/envs/mujoco/hopper_throw/hopper_throw_in_basket.py b/fancy_gym/envs/mujoco/hopper_throw/hopper_throw_in_basket.py index 76ef861..439a677 100644 --- a/fancy_gym/envs/mujoco/hopper_throw/hopper_throw_in_basket.py +++ b/fancy_gym/envs/mujoco/hopper_throw/hopper_throw_in_basket.py @@ -1,8 +1,9 @@ import os -from typing import Optional +from typing import Optional, Any, Dict, Tuple import numpy as np -from gym.envs.mujoco.hopper_v4 import HopperEnv +from gymnasium.envs.mujoco.hopper_v4 import HopperEnv +from gymnasium.core import ObsType MAX_EPISODE_STEPS_HOPPERTHROWINBASKET = 250 @@ -72,7 +73,7 @@ class HopperThrowInBasketEnv(HopperEnv): self.ball_in_basket = True ball_landed = self.get_body_com("ball")[2] <= 0.05 - done = bool(ball_landed or is_in_basket) + terminated = bool(ball_landed or is_in_basket) rewards = 0 @@ -80,7 +81,7 @@ class HopperThrowInBasketEnv(HopperEnv): costs = ctrl_cost - if self.current_step >= self.max_episode_steps or done: + if self.current_step >= self.max_episode_steps or terminated: if is_in_basket: if not self.context: @@ -101,13 +102,16 @@ class HopperThrowInBasketEnv(HopperEnv): info = { 'ball_pos': ball_pos[0], } + truncated = False - return observation, reward, done, info + return observation, reward, terminated, truncated, info def _get_obs(self): return np.append(super()._get_obs(), self.basket_x) - def reset(self, *, seed: Optional[int] = None, return_info: bool = False, options: Optional[dict] = None): + def reset(self, *, seed: Optional[int] = None, options: Optional[Dict[str, Any]] = None) \ + -> Tuple[ObsType, Dict[str, Any]]: + if self.max_episode_steps == 10: # We have to initialize this here, because the spec is only added after creating the env. 
self.max_episode_steps = self.spec.max_episode_steps @@ -117,7 +121,7 @@ class HopperThrowInBasketEnv(HopperEnv): if self.context: self.basket_x = self.np_random.uniform(low=3, high=7, size=1) self.model.body("basket_ground").pos[:] = [self.basket_x[0], 0, 0] - return super().reset() + return super().reset(seed=seed, options=options) # overwrite reset_model to make it deterministic def reset_model(self): @@ -134,20 +138,4 @@ class HopperThrowInBasketEnv(HopperEnv): return observation -if __name__ == '__main__': - render_mode = "human" # "human" or "partial" or "final" - env = HopperThrowInBasketEnv() - obs = env.reset() - for i in range(2000): - # objective.load_result("/tmp/cma") - # test with random actions - ac = env.action_space.sample() - obs, rew, d, info = env.step(ac) - if i % 10 == 0: - env.render(mode=render_mode) - if d: - print('After ', i, ' steps, done: ', d) - env.reset() - - env.close() diff --git a/fancy_gym/envs/mujoco/reacher/reacher.py b/fancy_gym/envs/mujoco/reacher/reacher.py index ccd0073..8f5f893 100644 --- a/fancy_gym/envs/mujoco/reacher/reacher.py +++ b/fancy_gym/envs/mujoco/reacher/reacher.py @@ -1,8 +1,9 @@ import os import numpy as np -from gym import utils -from gym.envs.mujoco import MujocoEnv +from gymnasium import utils +from gymnasium.envs.mujoco import MujocoEnv +from gymnasium.spaces import Box MAX_EPISODE_STEPS_REACHER = 200 @@ -12,7 +13,17 @@ class ReacherEnv(MujocoEnv, utils.EzPickle): More general version of the gym mujoco Reacher environment """ - def __init__(self, sparse: bool = False, n_links: int = 5, reward_weight: float = 1, ctrl_cost_weight: float = 1): + metadata = { + "render_modes": [ + "human", + "rgb_array", + "depth_array", + ], + "render_fps": 50, + } + + def __init__(self, sparse: bool = False, n_links: int = 5, reward_weight: float = 1, ctrl_cost_weight: float = 1., + **kwargs): utils.EzPickle.__init__(**locals()) self._steps = 0 @@ -25,10 +36,16 @@ class ReacherEnv(MujocoEnv, utils.EzPickle): file_name = f'reacher_{n_links}links.xml' + # sin, cos, velocity * n_Links + goal position (2) and goal distance (3) + shape = (self.n_links * 3 + 5,) + observation_space = Box(low=-np.inf, high=np.inf, shape=shape, dtype=np.float64) + MujocoEnv.__init__(self, model_path=os.path.join(os.path.dirname(__file__), "assets", file_name), frame_skip=2, - mujoco_bindings="mujoco") + observation_space=observation_space, + **kwargs + ) def step(self, action): self._steps += 1 @@ -45,10 +62,14 @@ class ReacherEnv(MujocoEnv, utils.EzPickle): reward = reward_dist + reward_ctrl + angular_vel self.do_simulation(action, self.frame_skip) - ob = self._get_obs() - done = False + if self.render_mode == "human": + self.render() - infos = dict( + ob = self._get_obs() + terminated = False + truncated = False + + info = dict( reward_dist=reward_dist, reward_ctrl=reward_ctrl, velocity=angular_vel, @@ -56,7 +77,7 @@ class ReacherEnv(MujocoEnv, utils.EzPickle): goal=self.goal if hasattr(self, "goal") else None ) - return ob, reward, done, infos + return ob, reward, terminated, truncated, info def distance_reward(self): vec = self.get_body_com("fingertip") - self.get_body_com("target") @@ -66,6 +87,7 @@ class ReacherEnv(MujocoEnv, utils.EzPickle): return -10 * np.square(self.data.qvel.flat[:self.n_links]).sum() if self.sparse else 0.0 def viewer_setup(self): + assert self.viewer is not None self.viewer.cam.trackbodyid = 0 def reset_model(self): diff --git a/fancy_gym/envs/mujoco/walker_2d_jump/walker_2d_jump.py b/fancy_gym/envs/mujoco/walker_2d_jump/walker_2d_jump.py 
index ed663d2..cc9f2b4 100644 --- a/fancy_gym/envs/mujoco/walker_2d_jump/walker_2d_jump.py +++ b/fancy_gym/envs/mujoco/walker_2d_jump/walker_2d_jump.py @@ -1,8 +1,9 @@ import os -from typing import Optional +from typing import Optional, Any, Dict, Tuple import numpy as np -from gym.envs.mujoco.walker2d_v4 import Walker2dEnv +from gymnasium.envs.mujoco.walker2d_v4 import Walker2dEnv +from gymnasium.core import ObsType MAX_EPISODE_STEPS_WALKERJUMP = 300 @@ -54,13 +55,13 @@ class Walker2dJumpEnv(Walker2dEnv): self.max_height = max(height, self.max_height) - done = bool(height < 0.2) + terminated = bool(height < 0.2) ctrl_cost = self.control_cost(action) costs = ctrl_cost rewards = 0 - if self.current_step >= self.max_episode_steps or done: - done = True + if self.current_step >= self.max_episode_steps or terminated: + terminated = True height_goal_distance = -10 * (np.linalg.norm(self.max_height - self.goal)) healthy_reward = self.healthy_reward * self.current_step @@ -73,17 +74,19 @@ class Walker2dJumpEnv(Walker2dEnv): 'max_height': self.max_height, 'goal': self.goal, } + truncated = False - return observation, reward, done, info + return observation, reward, terminated, truncated, info def _get_obs(self): return np.append(super()._get_obs(), self.goal) - def reset(self, *, seed: Optional[int] = None, return_info: bool = False, options: Optional[dict] = None): + def reset(self, *, seed: Optional[int] = None, options: Optional[Dict[str, Any]] = None) \ + -> Tuple[ObsType, Dict[str, Any]]: self.current_step = 0 self.max_height = 0 self.goal = self.np_random.uniform(1.5, 2.5, 1) # 1.5 3.0 - return super().reset() + return super().reset(seed=seed, options=options) # overwrite reset_model to make it deterministic def reset_model(self): @@ -98,20 +101,3 @@ class Walker2dJumpEnv(Walker2dEnv): observation = self._get_obs() return observation - -if __name__ == '__main__': - render_mode = "human" # "human" or "partial" or "final" - env = Walker2dJumpEnv() - obs = env.reset() - - for i in range(6000): - # test with random actions - ac = env.action_space.sample() - obs, rew, d, info = env.step(ac) - if i % 10 == 0: - env.render(mode=render_mode) - if d: - print('After ', i, ' steps, done: ', d) - env.reset() - - env.close() diff --git a/fancy_gym/examples/examples_dmc.py b/fancy_gym/examples/examples_dmc.py index 75648b7..243bd70 100644 --- a/fancy_gym/examples/examples_dmc.py +++ b/fancy_gym/examples/examples_dmc.py @@ -26,10 +26,10 @@ def example_dmc(env_id="dmc:fish-swim", seed=1, iterations=1000, render=True): ac = env.action_space.sample() if render: env.render(mode="human") - obs, reward, done, info = env.step(ac) + obs, reward, terminated, truncated, info = env.step(ac) rewards += reward - if done: + if terminated or truncated: print(env_id, rewards) rewards = 0 obs = env.reset() @@ -102,10 +102,10 @@ def example_custom_dmc_and_mp(seed=1, iterations=1, render=True): # number of samples/full trajectories (multiple environment steps) for i in range(iterations): ac = env.action_space.sample() - obs, reward, done, info = env.step(ac) + obs, reward, terminated, truncated, info = env.step(ac) rewards += reward - if done: + if terminated or truncated: print(base_env_id, rewards) rewards = 0 obs = env.reset() diff --git a/fancy_gym/examples/examples_general.py b/fancy_gym/examples/examples_general.py index 1a89e30..383c4cf 100644 --- a/fancy_gym/examples/examples_general.py +++ b/fancy_gym/examples/examples_general.py @@ -1,6 +1,6 @@ from collections import defaultdict -import gym +import gymnasium as 
gym import numpy as np import fancy_gym @@ -29,13 +29,13 @@ def example_general(env_id="Pendulum-v1", seed=1, iterations=1000, render=True): # number of environment steps for i in range(iterations): - obs, reward, done, info = env.step(env.action_space.sample()) + obs, reward, terminated, truncated, info = env.step(env.action_space.sample()) rewards += reward if render: env.render() - if done: + if terminated or truncated: print(rewards) rewards = 0 obs = env.reset() @@ -69,12 +69,15 @@ def example_async(env_id="HoleReacher-v0", n_cpu=4, seed=int('533D', 16), n_samp # this would generate more samples than requested if n_samples % num_envs != 0 repeat = int(np.ceil(n_samples / env.num_envs)) for i in range(repeat): - obs, reward, done, info = env.step(env.action_space.sample()) + obs, reward, terminated, truncated, info = env.step(env.action_space.sample()) buffer['obs'].append(obs) buffer['reward'].append(reward) - buffer['done'].append(done) + buffer['terminated'].append(terminated) + buffer['truncated'].append(truncated) buffer['info'].append(info) rewards += reward + + done = np.logical_or(terminated, truncated) if np.any(done): print(f"Reward at iteration {i}: {rewards[done]}") rewards[done] = 0 diff --git a/fancy_gym/examples/examples_metaworld.py b/fancy_gym/examples/examples_metaworld.py index 0fa7066..0c38bff 100644 --- a/fancy_gym/examples/examples_metaworld.py +++ b/fancy_gym/examples/examples_metaworld.py @@ -29,9 +29,9 @@ def example_dmc(env_id="fish-swim", seed=1, iterations=1000, render=True): # THIS NEEDS TO BE SET TO FALSE FOR NOW, BECAUSE THE INTERFACE FOR RENDERING IS DIFFERENT TO BASIC GYM # TODO: Remove this, when Metaworld fixes its interface. env.render(False) - obs, reward, done, info = env.step(ac) + obs, reward, terminated, truncated, info = env.step(ac) rewards += reward - if done: + if terminated or truncated: print(env_id, rewards) rewards = 0 obs = env.reset() @@ -103,10 +103,10 @@ def example_custom_dmc_and_mp(seed=1, iterations=1, render=True): # number of samples/full trajectories (multiple environment steps) for i in range(iterations): ac = env.action_space.sample() - obs, reward, done, info = env.step(ac) + obs, reward, terminated, truncated, info = env.step(ac) rewards += reward - if done: + if terminated or truncated: print(base_env_id, rewards) rewards = 0 obs = env.reset() @@ -131,4 +131,3 @@ if __name__ == '__main__': # # # Custom MetaWorld task example_custom_dmc_and_mp(seed=10, iterations=1, render=render) - diff --git a/fancy_gym/examples/examples_movement_primitives.py b/fancy_gym/examples/examples_movement_primitives.py index 745e4e8..885cc93 100644 --- a/fancy_gym/examples/examples_movement_primitives.py +++ b/fancy_gym/examples/examples_movement_primitives.py @@ -41,11 +41,11 @@ def example_mp(env_name="HoleReacherProMP-v0", seed=1, iterations=1, render=True # This executes a full trajectory and gives back the context (obs) of the last step in the trajectory, or the # full observation space of the last step, if replanning/sub-trajectory learning is used. The 'reward' is equal # to the return of a trajectory. Default is the sum over the step-wise rewards.
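As in `example_async` above, vectorized gymnasium envs return `terminated` and `truncated` as per-sub-env boolean arrays, so episode ends have to be combined element-wise. A sketch with placeholder env id and count:

```python
import gymnasium as gym
import numpy as np

envs = gym.vector.SyncVectorEnv([lambda: gym.make("Pendulum-v1") for _ in range(4)])
obs, info = envs.reset(seed=0)
for _ in range(100):
    obs, reward, terminated, truncated, info = envs.step(envs.action_space.sample())
    done = np.logical_or(terminated, truncated)   # one boolean per sub-env
    if np.any(done):
        # finished sub-envs auto-reset; their last obs/info are reported via `info`
        pass
envs.close()
```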
-        obs, reward, done, info = env.step(ac)
+        obs, reward, terminated, truncated, info = env.step(ac)

         # Aggregated returns
         returns += reward

-        if done:
+        if terminated or truncated:
             print(reward)
             obs = env.reset()
@@ -79,10 +79,10 @@ def example_custom_mp(env_name="Reacher5dProMP-v0", seed=1, iterations=1, render
     # number of samples/full trajectories (multiple environment steps)
     for i in range(iterations):
         ac = env.action_space.sample()
-        obs, reward, done, info = env.step(ac)
+        obs, reward, terminated, truncated, info = env.step(ac)
         returns += reward

-        if done:
+        if terminated or truncated:
             print(i, reward)
             obs = env.reset()

@@ -145,10 +145,10 @@ def example_fully_custom_mp(seed=1, iterations=1, render=True):
     # number of samples/full trajectories (multiple environment steps)
     for i in range(iterations):
         ac = env.action_space.sample()
-        obs, reward, done, info = env.step(ac)
+        obs, reward, terminated, truncated, info = env.step(ac)
         rewards += reward

-        if done:
+        if terminated or truncated:
             print(rewards)
             rewards = 0
             obs = env.reset()
diff --git a/fancy_gym/examples/examples_open_ai.py b/fancy_gym/examples/examples_open_ai.py
index 789271f..a79a44b 100644
--- a/fancy_gym/examples/examples_open_ai.py
+++ b/fancy_gym/examples/examples_open_ai.py
@@ -24,10 +24,10 @@ def example_mp(env_name, seed=1, render=True):
         else:
             env.render()
         ac = env.action_space.sample()
-        obs, reward, done, info = env.step(ac)
+        obs, reward, terminated, truncated, info = env.step(ac)
         returns += reward

-        if done:
+        if terminated or truncated:
             print(returns)
             obs = env.reset()

diff --git a/fancy_gym/examples/pd_control_gain_tuning.py b/fancy_gym/examples/pd_control_gain_tuning.py
index 407bfa1..4cfae39 100644
--- a/fancy_gym/examples/pd_control_gain_tuning.py
+++ b/fancy_gym/examples/pd_control_gain_tuning.py
@@ -34,7 +34,7 @@ fig.show()
 for t, pos_vel in enumerate(zip(pos, vel)):
     actions = env.tracking_controller.get_action(pos_vel[0], pos_vel[1], env.current_vel, env.current_pos)
     actions = np.clip(actions, env.env.action_space.low, env.env.action_space.high)
-    _, _, _, _ = env.env.step(actions)
+    env.env.step(actions)
     if t % 15 == 0:
         img.set_data(env.env.render(mode="rgb_array"))
         fig.canvas.draw()
diff --git a/fancy_gym/meta/__init__.py b/fancy_gym/meta/__init__.py
index 401fc44..2bcbfd8 100644
--- a/fancy_gym/meta/__init__.py
+++ b/fancy_gym/meta/__init__.py
@@ -1,6 +1,6 @@
 from copy import deepcopy

-from gym import register
+from gymnasium import register

 from . import goal_object_change_mp_wrapper, goal_change_mp_wrapper, goal_endeffector_change_mp_wrapper, \
     object_change_mp_wrapper
diff --git a/fancy_gym/open_ai/__init__.py b/fancy_gym/open_ai/__init__.py
index ca87c84..e4e80ee 100644
--- a/fancy_gym/open_ai/__init__.py
+++ b/fancy_gym/open_ai/__init__.py
@@ -1,6 +1,6 @@
 from copy import deepcopy

-from gym import register
+from gymnasium import register

 from . import mujoco
 from .deprecated_needs_gym_robotics import robotics
diff --git a/fancy_gym/utils/env_compatibility.py b/fancy_gym/utils/env_compatibility.py
new file mode 100644
index 0000000..a278451
--- /dev/null
+++ b/fancy_gym/utils/env_compatibility.py
@@ -0,0 +1,11 @@
+import gymnasium as gym
+
+
+class EnvCompatibility(gym.wrappers.EnvCompatibility):
+    def __getattr__(self, item):
+        """Propagate only non-existent properties to wrapped env."""
+        if item.startswith('_'):
+            raise AttributeError("attempted to get missing private attribute '{}'".format(item))
+        if item in self.__dict__:
+            return getattr(self, item)
+        return getattr(self.env, item)
diff --git a/fancy_gym/utils/make_env_helpers.py b/fancy_gym/utils/make_env_helpers.py
index 3c73ba9..709a4f7 100644
--- a/fancy_gym/utils/make_env_helpers.py
+++ b/fancy_gym/utils/make_env_helpers.py
@@ -1,17 +1,19 @@
 import logging
-import re
 import uuid
 from collections.abc import MutableMapping
 from copy import deepcopy
 from math import ceil
-from typing import Iterable, Type, Union
+from typing import Iterable, Type, Union, Optional

-import gym
+import gymnasium as gym
 import numpy as np
-from gym.envs.registration import register, registry
+from gymnasium.envs.registration import register, registry
+
+from fancy_gym.utils.env_compatibility import EnvCompatibility

 try:
     from dm_control import suite, manipulation
+    from shimmy.dm_control_compatibility import EnvType
 except ImportError:
     pass

@@ -82,13 +84,20 @@ def make(env_id: str, seed: int, **kwargs):
     if framework == 'metaworld':
         # MetaWorld environment
         env = make_metaworld(env_id, seed, **kwargs)
-    elif framework == 'dmc':
-        # DeepMind Control environment
-        env = make_dmc(env_id, seed, **kwargs)
+    # elif framework == 'dmc':
+    # Deprecated: With shimmy gym now has native support for deepmind envs
+    #     # DeepMind Control environment
+    #     env = make_dmc(env_id, seed, **kwargs)
     else:
         env = make_gym(env_id, seed, **kwargs)

-    env.seed(seed)
+    # try:
+    env.reset(seed=seed)
+    # except TypeError:
+    #     # Support for older gym envs that do not have seeding
+    #     # env.seed(seed)
+    #     np_random, _ = seeding.np_random(seed)
+    #     env.np_random = np_random
     env.action_space.seed(seed)
     env.observation_space.seed(seed)

@@ -158,7 +167,7 @@ def make_bb(
     traj_gen_kwargs['action_dim'] = traj_gen_kwargs.get('action_dim', np.prod(env.action_space.shape).item())

     if black_box_kwargs.get('duration') is None:
-        black_box_kwargs['duration'] = env.spec.max_episode_steps * env.dt
+        black_box_kwargs['duration'] = get_env_duration(env)
     if phase_kwargs.get('tau') is None:
         phase_kwargs['tau'] = black_box_kwargs['duration']

@@ -186,6 +195,24 @@ def make_bb(
     return bb_env


+def get_env_duration(env: gym.Env):
+    try:
+        duration = env.spec.max_episode_steps * env.dt
+    except (AttributeError, TypeError) as e:
+        # TODO Remove if this information is in the compatibility class
+        logging.error(f'Attributes env.spec.max_episode_steps and env.dt are not available. '
+                      f'Assuming you are using dm_control. Please make sure you have ran '
+                      f'"pip install shimmy[dm_control]" for that.')
+        if env.env_type is EnvType.COMPOSER:
+            max_episode_steps = ceil(env.unwrapped._time_limit / env.dt)
+        elif env.env_type is EnvType.RL_CONTROL:
+            max_episode_steps = int(env.unwrapped._step_limit)
+        else:
+            raise e
+        duration = max_episode_steps * env.control_timestep()
+    return duration
+
+
 def make_bb_env_helper(**kwargs):
     """
     Helper function for registering a black box gym environment.
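A minimal sketch of the seeding and duration logic introduced above, assuming gymnasium's Pendulum-v1 with its 0.05 s control timestep (the environment id and values are illustrative only, not taken from the patch):

import gymnasium as gym

# Seeding in the new API: the first reset() takes the seed; spaces are seeded separately,
# mirroring what make() does after constructing the environment.
env = gym.make("Pendulum-v1")        # example id, not part of the patch
obs, info = env.reset(seed=1)
env.action_space.seed(1)
env.observation_space.seed(1)

# Duration as computed by get_env_duration() for plain gymnasium envs:
# allowed episode steps times the control timestep exposed as `dt`.
duration = env.spec.max_episode_steps * env.unwrapped.dt   # 200 * 0.05 = 10.0 s
print(duration)

For shimmy-wrapped dm_control environments the same helper falls back to the private time or step limit of the underlying task, as shown in the hunk above.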
@@ -235,55 +262,56 @@ def make_bb_env_helper(**kwargs):
                    basis_kwargs=basis_kwargs, **kwargs, seed=seed)


-def make_dmc(
-        env_id: str,
-        seed: int = None,
-        visualize_reward: bool = True,
-        time_limit: Union[None, float] = None,
-        **kwargs
-):
-    if not re.match(r"\w+-\w+", env_id):
-        raise ValueError("env_id does not have the following structure: 'domain_name-task_name'")
-    domain_name, task_name = env_id.split("-")
-
-    if task_name.endswith("_vision"):
-        # TODO
-        raise ValueError("The vision interface for manipulation tasks is currently not supported.")
-
-    if (domain_name, task_name) not in suite.ALL_TASKS and task_name not in manipulation.ALL:
-        raise ValueError(f'Specified domain "{domain_name}" and task "{task_name}" combination does not exist.')
-
-    # env_id = f'dmc_{domain_name}_{task_name}_{seed}-v1'
-    gym_id = uuid.uuid4().hex + '-v1'
-
-    task_kwargs = {'random': seed}
-    if time_limit is not None:
-        task_kwargs['time_limit'] = time_limit
-
-    # create task
-    # Accessing private attribute because DMC does not expose time_limit or step_limit.
-    # Only the current time_step/time as well as the control_timestep can be accessed.
-    if domain_name == "manipulation":
-        env = manipulation.load(environment_name=task_name, seed=seed)
-        max_episode_steps = ceil(env._time_limit / env.control_timestep())
-    else:
-        env = suite.load(domain_name=domain_name, task_name=task_name, task_kwargs=task_kwargs,
-                         visualize_reward=visualize_reward, environment_kwargs=kwargs)
-        max_episode_steps = int(env._step_limit)
-
-    register(
-        id=gym_id,
-        entry_point='fancy_gym.dmc.dmc_wrapper:DMCWrapper',
-        kwargs={'env': lambda: env},
-        max_episode_steps=max_episode_steps,
-    )
-
-    env = gym.make(gym_id)
-    env.seed(seed)
-    return env
+# Deprecated: With shimmy gym now has native support for deepmind envs
+# def make_dmc(
+#         env_id: str,
+#         seed: int = None,
+#         visualize_reward: bool = True,
+#         time_limit: Union[None, float] = None,
+#         **kwargs
+# ):
+#     if not re.match(r"\w+-\w+", env_id):
+#         raise ValueError("env_id does not have the following structure: 'domain_name-task_name'")
+#     domain_name, task_name = env_id.split("-")
+#
+#     if task_name.endswith("_vision"):
+#         # TODO
+#         raise ValueError("The vision interface for manipulation tasks is currently not supported.")
+#
+#     if (domain_name, task_name) not in suite.ALL_TASKS and task_name not in manipulation.ALL:
+#         raise ValueError(f'Specified domain "{domain_name}" and task "{task_name}" combination does not exist.')
+#
+#     # env_id = f'dmc_{domain_name}_{task_name}_{seed}-v1'
+#     gym_id = uuid.uuid4().hex + '-v1'
+#
+#     task_kwargs = {'random': seed}
+#     if time_limit is not None:
+#         task_kwargs['time_limit'] = time_limit
+#
+#     # create task
+#     # Accessing private attribute because DMC does not expose time_limit or step_limit.
+#     # Only the current time_step/time as well as the control_timestep can be accessed.
+#     if domain_name == "manipulation":
+#         env = manipulation.load(environment_name=task_name, seed=seed)
+#         max_episode_steps = ceil(env._time_limit / env.control_timestep())
+#     else:
+#         env = suite.load(domain_name=domain_name, task_name=task_name, task_kwargs=task_kwargs,
+#                          visualize_reward=visualize_reward, environment_kwargs=kwargs)
+#         max_episode_steps = int(env._step_limit)
+#
+#     register(
+#         id=gym_id,
+#         entry_point='fancy_gym.dmc.dmc_wrapper:DMCWrapper',
+#         kwargs={'env': lambda: env},
+#         max_episode_steps=max_episode_steps,
+#     )
+#
+#     env = gym.make(gym_id)
+#     env.seed(seed)
+#     return env


-def make_metaworld(env_id: str, seed: int, **kwargs):
+def make_metaworld(env_id: str, seed: int, render_mode: Optional[str] = None, **kwargs):
     if env_id not in metaworld.ML1.ENV_NAMES:
         raise ValueError(f'Specified environment "{env_id}" not present in metaworld ML1.')
@@ -294,12 +322,17 @@ def make_metaworld(env_id: str, seed: int, **kwargs):
     # New argument to use global seeding
     _env.seeded_rand_vec = True

+    max_episode_steps = _env.max_path_length
+
+    # TODO remove this as soon as there is support for the new API
+    _env = EnvCompatibility(_env, render_mode)
+
     gym_id = uuid.uuid4().hex + '-v1'

     register(
         id=gym_id,
         entry_point=lambda: _env,
-        max_episode_steps=_env.max_path_length,
+        max_episode_steps=max_episode_steps,
     )

     # TODO enable checker when the incorrect dtype of obs and observation space are fixed by metaworld
diff --git a/fancy_gym/utils/time_aware_observation.py b/fancy_gym/utils/time_aware_observation.py
index b2cbc78..192138d 100644
--- a/fancy_gym/utils/time_aware_observation.py
+++ b/fancy_gym/utils/time_aware_observation.py
@@ -1,45 +1,11 @@
-"""
-Adapted from: https://github.com/openai/gym/blob/907b1b20dd9ac0cba5803225059b9c6673702467/gym/wrappers/time_aware_observation.py
-License: MIT
-Copyright (c) 2016 OpenAI (https://openai.com)
-
-Wrapper for adding time aware observations to environment observation.
-"""
-import gym
+import gymnasium as gym
 import numpy as np
-from gym.spaces import Box


-class TimeAwareObservation(gym.ObservationWrapper):
-    """Augment the observation with the current time step in the episode.
-
-    The observation space of the wrapped environment is assumed to be a flat :class:`Box`.
-    In particular, pixel observations are not supported. This wrapper will append the current timestep
-    within the current episode to the observation.
-
-    Example:
-        >>> import gym
-        >>> env = gym.make('CartPole-v1')
-        >>> env = TimeAwareObservation(env)
-        >>> env.reset()
-        array([ 0.03810719,  0.03522411,  0.02231044, -0.01088205,  0.        ])
-        >>> env.step(env.action_space.sample())[0]
-        array([ 0.03881167, -0.16021058,  0.0220928 ,  0.28875574,  1.        ])
-    """
+class TimeAwareObservation(gym.wrappers.TimeAwareObservation):

     def __init__(self, env: gym.Env):
-        """Initialize :class:`TimeAwareObservation` that requires an environment with a flat :class:`Box`
-        observation space.
-
-        Args:
-            env: The environment to apply the wrapper
-        """
         super().__init__(env)
-        assert isinstance(env.observation_space, Box)
-        low = np.append(self.observation_space.low, 0.0)
-        high = np.append(self.observation_space.high, 1.0)
-        self.observation_space = Box(low, high, dtype=self.observation_space.dtype)
-        self.t = 0
         self._max_episode_steps = env.spec.max_episode_steps

     def observation(self, observation):
@@ -52,27 +18,3 @@ class TimeAwareObservation(gym.ObservationWrapper):
             The observation with the time step appended to
         """
         return np.append(observation, self.t / self._max_episode_steps)
-
-    def step(self, action):
-        """Steps through the environment, incrementing the time step.
-
-        Args:
-            action: The action to take
-
-        Returns:
-            The environment's step using the action.
-        """
-        self.t += 1
-        return super().step(action)
-
-    def reset(self, **kwargs):
-        """Reset the environment setting the time to zero.
-
-        Args:
-            **kwargs: Kwargs to apply to env.reset()
-
-        Returns:
-            The reset environment
-        """
-        self.t = 0
-        return super().reset(**kwargs)
diff --git a/setup.py b/setup.py
index 5720ca1..7afd526 100644
--- a/setup.py
+++ b/setup.py
@@ -12,6 +12,7 @@ extras = {
     ],
     'box2d': ['gymnasium[box2d]>=0.26.0'],
     'testing': ['pytest'],
+    "mujoco": ["gymnasium[mujoco]"],
 }

 # All dependencies
diff --git a/test/test_dmc_envs.py b/test/test_dmc_envs.py
index 410f3c1..266a12f 100644
--- a/test/test_dmc_envs.py
+++ b/test/test_dmc_envs.py
@@ -1,48 +1,54 @@
 from itertools import chain
+from typing import Callable

+import gymnasium as gym
 import pytest
 from dm_control import suite, manipulation

 import fancy_gym
 from test.utils import run_env, run_env_determinism

-SUITE_IDS = [f'dmc:{env}-{task}' for env, task in suite.ALL_TASKS if env != "lqr"]
-MANIPULATION_IDS = [f'dmc:manipulation-{task}' for task in manipulation.ALL if task.endswith('_features')]
-DMC_MP_IDS = chain(*fancy_gym.ALL_DMC_MOVEMENT_PRIMITIVE_ENVIRONMENTS.values())
+# SUITE_IDS = [f'dmc:{env}-{task}' for env, task in suite.ALL_TASKS if env != "lqr"]
+# MANIPULATION_IDS = [f'dmc:manipulation-{task}' for task in manipulation.ALL if task.endswith('_features')]
+DM_CONTROL_IDS = [spec.id for spec in gym.envs.registry.values() if
+                  spec.id.startswith('dm_control/')
+                  and 'compatibility-env-v0' not in spec.id
+                  and 'lqr-lqr' not in spec.id]
+DM_control_MP_IDS = list(chain(*fancy_gym.ALL_DMC_MOVEMENT_PRIMITIVE_ENVIRONMENTS.values()))
 SEED = 1


-@pytest.mark.parametrize('env_id', SUITE_IDS)
-def test_step_suite_functionality(env_id: str):
+@pytest.mark.parametrize('env_id', DM_CONTROL_IDS)
+def test_step_dm_control_functionality(env_id: str):
     """Tests that suite step environments run without errors using random actions."""
-    run_env(env_id)
+    run_env(env_id, 5000, wrappers=[gym.wrappers.FlattenObservation])


-@pytest.mark.parametrize('env_id', SUITE_IDS)
-def test_step_suite_determinism(env_id: str):
+@pytest.mark.parametrize('env_id', DM_CONTROL_IDS)
+def test_step_dm_control_determinism(env_id: str):
     """Tests that for step environments identical seeds produce identical trajectories."""
-    run_env_determinism(env_id, SEED)
+    run_env_determinism(env_id, SEED, 5000, wrappers=[gym.wrappers.FlattenObservation])


-@pytest.mark.parametrize('env_id', MANIPULATION_IDS)
-def test_step_manipulation_functionality(env_id: str):
-    """Tests that manipulation step environments run without errors using random actions."""
-    run_env(env_id)
+# @pytest.mark.parametrize('env_id', MANIPULATION_IDS)
+# def test_step_manipulation_functionality(env_id: str):
+#     """Tests that manipulation step environments run without errors using random actions."""
+#     run_env(env_id)
+#
+#
+# @pytest.mark.parametrize('env_id', MANIPULATION_IDS)
+# def test_step_manipulation_determinism(env_id: str):
+#     """Tests that for step environments identical seeds produce identical trajectories."""
+#     run_env_determinism(env_id, SEED)


-@pytest.mark.parametrize('env_id', MANIPULATION_IDS)
-def test_step_manipulation_determinism(env_id: str):
-    """Tests that for step environments identical seeds produce identical trajectories."""
-    run_env_determinism(env_id, SEED)
-
-
-@pytest.mark.parametrize('env_id', DMC_MP_IDS)
+@pytest.mark.parametrize('env_id', DM_control_MP_IDS)
 def test_bb_dmc_functionality(env_id: str):
     """Tests that black box environments run without errors using random actions."""
     run_env(env_id)


-@pytest.mark.parametrize('env_id', DMC_MP_IDS)
+@pytest.mark.parametrize('env_id', DM_control_MP_IDS)
 def test_bb_dmc_determinism(env_id: str):
     """Tests that for black box environment identical seeds produce identical trajectories."""
     run_env_determinism(env_id, SEED)
diff --git a/test/test_fancy_envs.py b/test/test_fancy_envs.py
index 9acd696..898cc08 100644
--- a/test/test_fancy_envs.py
+++ b/test/test_fancy_envs.py
@@ -1,14 +1,16 @@
-import itertools
+from itertools import chain
+from typing import Callable

 import fancy_gym
-import gym
+import gymnasium as gym
 import pytest

 from test.utils import run_env, run_env_determinism

-CUSTOM_IDS = [spec.id for spec in gym.envs.registry.all() if
+CUSTOM_IDS = [id for id, spec in gym.envs.registry.items() if
+              not isinstance(spec.entry_point, Callable) and
               "fancy_gym" in spec.entry_point and 'make_bb_env_helper' not in spec.entry_point]
-CUSTOM_MP_IDS = itertools.chain(*fancy_gym.ALL_FANCY_MOVEMENT_PRIMITIVE_ENVIRONMENTS.values())
+CUSTOM_MP_IDS = list(chain(*fancy_gym.ALL_FANCY_MOVEMENT_PRIMITIVE_ENVIRONMENTS.values()))
 SEED = 1
diff --git a/test/test_gym_envs.py b/test/test_gym_envs.py
index dae5944..76b5c85 100644
--- a/test/test_gym_envs.py
+++ b/test/test_gym_envs.py
@@ -1,14 +1,20 @@
+import re
 from itertools import chain
+from typing import Callable

-import gym
+import gymnasium as gym
 import pytest

 import fancy_gym
 from test.utils import run_env, run_env_determinism

-GYM_IDS = [spec.id for spec in gym.envs.registry.all() if
-           "fancy_gym" not in spec.entry_point and 'make_bb_env_helper' not in spec.entry_point]
-GYM_MP_IDS = chain(*fancy_gym.ALL_DMC_MOVEMENT_PRIMITIVE_ENVIRONMENTS.values())
+GYM_IDS = [spec.id for spec in gym.envs.registry.values() if
+           not isinstance(spec.entry_point, Callable) and
+           "fancy_gym" not in spec.entry_point and 'make_bb_env_helper' not in spec.entry_point
+           and 'jax' not in spec.id.lower()
+           and not re.match(r'GymV2.Environment', spec.id)
+           ]
+GYM_MP_IDS = list(chain(*fancy_gym.ALL_DMC_MOVEMENT_PRIMITIVE_ENVIRONMENTS.values()))
 SEED = 1
diff --git a/test/test_metaworld_envs.py b/test/test_metaworld_envs.py
index ed300f4..77d0c35 100644
--- a/test/test_metaworld_envs.py
+++ b/test/test_metaworld_envs.py
@@ -8,7 +8,7 @@ from test.utils import run_env, run_env_determinism
 METAWORLD_IDS = [f'metaworld:{env.split("-goal-observable")[0]}' for env, _ in
                  ALL_V2_ENVIRONMENTS_GOAL_OBSERVABLE.items()]
-METAWORLD_MP_IDS = chain(*fancy_gym.ALL_METAWORLD_MOVEMENT_PRIMITIVE_ENVIRONMENTS.values())
+METAWORLD_MP_IDS = list(chain(*fancy_gym.ALL_METAWORLD_MOVEMENT_PRIMITIVE_ENVIRONMENTS.values()))
 SEED = 1
diff --git a/test/utils.py b/test/utils.py
index dff2292..0855f37 100644
--- a/test/utils.py
+++ b/test/utils.py
@@ -1,9 +1,12 @@
-import gym
+from typing import List, Type
+
+import gymnasium as gym
 import numpy as np

 from fancy_gym import make


-def run_env(env_id, iterations=None, seed=0, render=False):
+def run_env(env_id: str, iterations: int = None, seed: int = 0, wrappers: List[Type[gym.Wrapper]] = [],
+            render: bool = False):
     """
     Example for running a DMC based env in the step based setting.
     The env_id has to be specified as `dmc:domain_name-task_name` or
@@ -13,17 +16,21 @@ def run_env(env_id, iterations=None, seed=0, render=False):
         env_id: Either `dmc:domain_name-task_name` or `dmc:manipulation-environment_name`
         iterations: Number of rollout steps to run
         seed: random seeding
+        wrappers: List of Wrappers to apply to the environment
         render: Render the episode

-    Returns: observations, rewards, dones, actions
+    Returns: observations, rewards, terminations, truncations, actions

     """
     env: gym.Env = make(env_id, seed=seed)
+    for w in wrappers:
+        env = w(env)
     rewards = []
     observations = []
     actions = []
-    dones = []
-    obs = env.reset()
+    terminations = []
+    truncations = []
+    obs, _ = env.reset()
     verify_observations(obs, env.observation_space, "reset()")

     iterations = iterations or (env.spec.max_episode_steps or 1)
@@ -35,38 +42,49 @@ def run_env(env_id, iterations=None, seed=0, render=False):
         ac = env.action_space.sample()
         actions.append(ac)
         # ac = np.random.uniform(env.action_space.low, env.action_space.high, env.action_space.shape)
-        obs, reward, done, info = env.step(ac)
+        obs, reward, terminated, truncated, info = env.step(ac)

         verify_observations(obs, env.observation_space, "step()")
         verify_reward(reward)
-        verify_done(done)
+        verify_done(terminated)
+        verify_done(truncated)

         rewards.append(reward)
-        dones.append(done)
+        terminations.append(terminated)
+        truncations.append(truncated)

         if render:
             env.render("human")

-        if done:
+        if terminated or truncated:
            break

     if not hasattr(env, "replanning_schedule"):
-        assert done, "Done flag is not True after end of episode."
+        assert terminated or truncated, f"Termination or truncation flag is not True after {i + 1} iterations."
+
     observations.append(obs)
     env.close()
     del env
-    return np.array(observations), np.array(rewards), np.array(dones), np.array(actions)
+    return np.array(observations), np.array(rewards), np.array(terminations), np.array(truncations), np.array(actions)


-def run_env_determinism(env_id: str, seed: int):
-    traj1 = run_env(env_id, seed=seed)
-    traj2 = run_env(env_id, seed=seed)
+def run_env_determinism(env_id: str, seed: int, iterations: int = None, wrappers: List[Type[gym.Wrapper]] = []):
+    traj1 = run_env(env_id, iterations=iterations,
+                    seed=seed, wrappers=wrappers)
+    traj2 = run_env(env_id, iterations=iterations,
+                    seed=seed, wrappers=wrappers)
     # Iterate over two trajectories, which should have the same state and action sequence
     for i, time_step in enumerate(zip(*traj1, *traj2)):
-        obs1, rwd1, done1, ac1, obs2, rwd2, done2, ac2 = time_step
-        assert np.array_equal(obs1, obs2), f"Observations [{i}] {obs1} and {obs2} do not match."
-        assert np.array_equal(ac1, ac2), f"Actions [{i}] {ac1} and {ac2} do not match."
-        assert np.array_equal(rwd1, rwd2), f"Rewards [{i}] {rwd1} and {rwd2} do not match."
-        assert np.array_equal(done1, done2), f"Dones [{i}] {done1} and {done2} do not match."
+        obs1, rwd1, term1, trunc1, ac1, obs2, rwd2, term2, trunc2, ac2 = time_step
+        assert np.allclose(
+            obs1, obs2), f"Observations [{i}] {obs1} and {obs2} do not match."
+        assert np.array_equal(
+            ac1, ac2), f"Actions [{i}] {ac1} and {ac2} do not match."
+        assert np.array_equal(
+            rwd1, rwd2), f"Rewards [{i}] {rwd1} and {rwd2} do not match."
+        assert np.array_equal(
+            term1, term2), f"Terminateds [{i}] {term1} and {term2} do not match."
+        assert np.array_equal(
+            trunc1, trunc2), f"Truncateds [{i}] {trunc1} and {trunc2} do not match."


 def verify_observations(obs, observation_space: gym.Space, obs_type="reset()"):
@@ -75,8 +93,10 @@ def verify_observations(obs, observation_space: gym.Space, obs_type="reset()"):


 def verify_reward(reward):
-    assert isinstance(reward, (float, int)), f"Returned type {type(reward)} as reward, expected float or int."
+    assert isinstance(
+        reward, (float, int)), f"Returned type {type(reward)} as reward, expected float or int."


 def verify_done(done):
-    assert isinstance(done, bool), f"Returned {done} as done flag, expected bool."
+    assert isinstance(
+        done, bool), f"Returned {done} as done flag, expected bool."
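The test utilities above settle on the conventions the whole patch migrates to: reset() returns an (obs, info) pair and step() returns a five-tuple with separate terminated and truncated flags. A minimal rollout in that style, sketched with a standard gymnasium environment (the id and step count are illustrative only, not taken from the patch):

import gymnasium as gym

env = gym.make("CartPole-v1")   # any gymnasium env follows the same protocol
obs, info = env.reset(seed=1)
for _ in range(500):
    obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
    # terminated: the task itself ended; truncated: an external limit (e.g. TimeLimit) cut the episode off
    if terminated or truncated:
        obs, info = env.reset()
env.close()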