From 9ad6fbe712e66e4815adc22357132aeb3d4c28e5 Mon Sep 17 00:00:00 2001
From: Fabian Otto
Date: Tue, 28 Jun 2022 16:05:09 +0200
Subject: [PATCH] first clean up and some non working ideas sketched

---
 README.md | 38 +++---
 .../alr/mujoco/ball_in_a_cup/ball_in_a_cup.py | 15 +--
 .../ball_in_a_cup_reward_simple.py | 2 +-
 alr_envs/alr/mujoco/beerpong/beerpong.py | 114 ++++++----------
 .../mujoco/beerpong/beerpong_reward_staged.py | 48 +++----
 .../alr/mujoco/beerpong/new_mp_wrapper.py | 42 ++----
 .../alr/mujoco/hopper_jump/hopper_jump.py | 123 +++++++++---------
 alr_envs/mp/episodic_wrapper.py | 67 ++++++----
 alr_envs/utils/__init__.py | 9 +-
 alr_envs/utils/make_env_helpers.py | 27 ++--
 10 files changed, 225 insertions(+), 260 deletions(-)

diff --git a/README.md b/README.md
index edd1aac..c08a1d4 100644
--- a/README.md
+++ b/README.md
@@ -1,26 +1,32 @@
 ## ALR Robotics Control Environments
 
 This project offers a large variety of reinforcement learning environments under the unifying interface of
 [OpenAI gym](https://gym.openai.com/).
-Besides, we also provide support (under the OpenAI interface) for the benchmark suites
+We provide support (under the OpenAI interface) for the benchmark suites
 [DeepMind Control](https://deepmind.com/research/publications/2020/dm-control-Software-and-Tasks-for-Continuous-Control)
-(DMC) and [Metaworld](https://meta-world.github.io/). Custom (Mujoco) gym environments can be created according
-to [this guide](https://github.com/openai/gym/blob/master/docs/creating-environments.md). Unlike existing libraries, we
-additionally support to control agents with Dynamic Movement Primitives (DMPs) and Probabilistic Movement Primitives (ProMP,
-we only consider the mean usually).
+(DMC) and [Metaworld](https://meta-world.github.io/).
+Custom (Mujoco) gym environments can be created according
+to [this guide](https://www.gymlibrary.ml/content/environment_creation/).
+Unlike existing libraries, we additionally support controlling agents with movement primitives, such as
+Dynamic Movement Primitives (DMPs) and Probabilistic Movement Primitives (ProMPs, of which we usually only consider the mean).
 
-## Motion Primitive Environments (Episodic environments)
+## Movement Primitive Environments (Episode-Based/Black-Box Environments)
 
-Unlike step-based environments, motion primitive (MP) environments are closer related to stochastic search, black box
-optimization, and methods that are often used in robotics. MP environments are trajectory-based and always execute a full
-trajectory, which is generated by a Dynamic Motion Primitive (DMP) or a Probabilistic Motion Primitive (ProMP). The
-generated trajectory is translated into individual step-wise actions by a controller. The exact choice of controller is,
-however, dependent on the type of environment. We currently support position, velocity, and PD-Controllers for position,
-velocity, and torque control, respectively. The goal of all MP environments is still to learn a policy. Yet, an action
+Unlike step-based environments, movement primitive (MP) environments are more closely related to stochastic search,
+black-box optimization, and methods that are often used in traditional robotics and control.
+MP environments are episode-based and always execute a full trajectory, which is generated by a trajectory generator,
+such as a Dynamic Movement Primitive (DMP) or a Probabilistic Movement Primitive (ProMP).
+The generated trajectory is translated into individual step-wise actions by a trajectory tracking controller.
+The exact choice of controller, however, depends on the type of environment.
+We currently support position, velocity, and PD-Controllers for position, velocity, and torque control, respectively,
+as well as a special controller for the MetaWorld control suite.
+The goal of all MP environments is still to learn an optimal policy. Yet, an action
 represents the parametrization of the motion primitives to generate a suitable trajectory. Additionally, in this
-framework we support all of this also for the contextual setting, for which we expose all changing substates of the
-task as a single observation in the beginning. This requires to predict a new action/MP parametrization for each
-trajectory. All environments provide next to the cumulative episode reward all collected information from each
-step as part of the info dictionary. This information should, however, mainly be used for debugging and logging.
+framework we support all of this also for the contextual setting, i.e., we expose a subset of the observation space
+as a single context at the beginning of the episode. This requires predicting a new action/MP parametrization for each
+context.
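+
+The following minimal sketch illustrates this interaction. The environment id below is only a placeholder, and it is
+assumed here that `make` is exposed at the package level (see `alr_envs/utils/make_env_helpers.py`):
+
+```python
+import alr_envs
+
+# An MP environment expects a single action per episode: the parametrization of the
+# trajectory generator (e.g. ProMP weights and, optionally, the trajectory duration tau).
+env = alr_envs.make("SomeTaskProMP-v0", seed=1)  # placeholder id, use any registered MP environment
+
+context = env.reset()                  # context observation exposed at the beginning of the episode
+mp_params = env.action_space.sample()  # in practice predicted by the policy from the context
+obs, episode_reward, done, info = env.step(mp_params)  # internally executes the full trajectory
+```
+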
+In addition to the cumulative episode reward, all environments provide all information collected at each
+step as part of the info dictionary. This information is, however, mainly meant for debugging and logging,
+not for training.
 
 |Key| Description|
 |---|---|
diff --git a/alr_envs/alr/mujoco/ball_in_a_cup/ball_in_a_cup.py b/alr_envs/alr/mujoco/ball_in_a_cup/ball_in_a_cup.py index 7cff670..7b286bc 100644 --- a/alr_envs/alr/mujoco/ball_in_a_cup/ball_in_a_cup.py +++ b/alr_envs/alr/mujoco/ball_in_a_cup/ball_in_a_cup.py @@ -4,10 +4,11 @@ import numpy as np from gym.envs.mujoco import MujocoEnv - class ALRBallInACupEnv(MujocoEnv, utils.EzPickle): - def __init__(self, n_substeps=4, apply_gravity_comp=True, simplified: bool = False, - reward_type: str = None, context: np.ndarray = None): + def __init__( + self, n_substeps=4, apply_gravity_comp=True, simplified: bool = False, + reward_type: str = None, context: np.ndarray = None + ): utils.EzPickle.__init__(**locals()) self._steps = 0 @@ -23,9 +24,7 @@ class ALRBallInACupEnv(MujocoEnv, utils.EzPickle): self.context = context - alr_mujoco_env.AlrMujocoEnv.__init__(self, - self.xml_path, - apply_gravity_comp=apply_gravity_comp, + alr_mujoco_env.AlrMujocoEnv.__init__(self, self.xml_path, apply_gravity_comp=apply_gravity_comp, n_substeps=n_substeps) self._start_pos = np.array([0.0, 0.58760536, 0.0, 1.36004913, 0.0, -0.32072943, -1.57]) self._start_vel = np.zeros(7) @@ -129,7 +128,7 @@ class ALRBallInACupEnv(MujocoEnv, utils.EzPickle): np.sin(theta), # self.get_body_com("target"), # only return target to make problem harder [self._steps], - ]) + ]) # TODO @property @@ -139,7 +138,7 @@ class ALRBallInACupEnv(MujocoEnv, utils.EzPickle): [False] * 7, # sin # [True] * 2, # x-y coordinates of target distance [False] # env steps - ]) + ]) # These functions are for the task with 3 joint actuations def extend_des_pos(self, des_pos): diff --git a/alr_envs/alr/mujoco/ball_in_a_cup/ball_in_a_cup_reward_simple.py b/alr_envs/alr/mujoco/ball_in_a_cup/ball_in_a_cup_reward_simple.py index a147d89..0762e22 100644 --- a/alr_envs/alr/mujoco/ball_in_a_cup/ball_in_a_cup_reward_simple.py +++ b/alr_envs/alr/mujoco/ball_in_a_cup/ball_in_a_cup_reward_simple.py @@ -67,7 +67,7 @@ class BallInACupReward(alr_reward_fct.AlrReward): action_cost = np.sum(np.square(action)) self.action_costs.append(action_cost) - self._is_collided =
self.check_collision(self.env.sim) or self.env.check_traj_in_joint_limits() + self._is_collided = self.check_collision(self.env.sim) or self.env._check_traj_in_joint_limits() if self.env._steps == self.env.ep_length - 1 or self._is_collided: t_min_dist = np.argmin(self.dists) diff --git a/alr_envs/alr/mujoco/beerpong/beerpong.py b/alr_envs/alr/mujoco/beerpong/beerpong.py index 5040ec7..b7d376e 100644 --- a/alr_envs/alr/mujoco/beerpong/beerpong.py +++ b/alr_envs/alr/mujoco/beerpong/beerpong.py @@ -1,11 +1,11 @@ -import mujoco_py.builder import os +import mujoco_py.builder import numpy as np -from gym import utils, spaces +from gym import utils from gym.envs.mujoco import MujocoEnv -from alr_envs.alr.mujoco.beerpong.beerpong_reward_staged import BeerPongReward +from alr_envs.alr.mujoco.beerpong.beerpong_reward_staged import BeerPongReward CUP_POS_MIN = np.array([-1.42, -4.05]) CUP_POS_MAX = np.array([1.42, -1.25]) @@ -18,12 +18,14 @@ CUP_POS_MAX = np.array([1.42, -1.25]) # CUP_POS_MIN = np.array([-0.16, -2.2]) # CUP_POS_MAX = np.array([0.16, -1.7]) - class ALRBeerBongEnv(MujocoEnv, utils.EzPickle): - def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, - rndm_goal=False, cup_goal_pos=None): + def __init__( + self, frame_skip=1, apply_gravity_comp=True, noisy=False, + rndm_goal=False, cup_goal_pos=None + ): + cup_goal_pos = np.array(cup_goal_pos if cup_goal_pos is not None else [-0.3, -1.2, 0.840]) - if cup_goal_pos.shape[0]==2: + if cup_goal_pos.shape[0] == 2: cup_goal_pos = np.insert(cup_goal_pos, 2, 0.840) self.cup_goal_pos = np.array(cup_goal_pos) @@ -50,7 +52,7 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle): # self._release_step = 130 # time step of ball release self.release_step = 100 # time step of ball release - self.ep_length = 600//frame_skip + self.ep_length = 600 // frame_skip self.cup_table_id = 10 if noisy: @@ -71,14 +73,6 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle): def start_vel(self): return self._start_vel - @property - def current_pos(self): - return self.sim.data.qpos[0:7].copy() - - @property - def current_vel(self): - return self.sim.data.qvel[0:7].copy() - def reset(self): self.reward_function.reset(self.add_noise) return super().reset() @@ -122,7 +116,7 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle): self.sim.data.qpos[7::] = self.sim.data.site_xpos[self.ball_site_id, :].copy() self.sim.data.qvel[7::] = self.sim.data.site_xvelp[self.ball_site_id, :].copy() elif self._steps == self.release_step and self.add_noise: - self.sim.data.qvel[7::] += self.noise_std * np.random.randn(3) + self.sim.data.qvel[7::] += self.noise_std * np.random.randn(3) crash = False except mujoco_py.builder.MujocoException: crash = True @@ -147,29 +141,32 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle): ball_pos = np.zeros(3) ball_vel = np.zeros(3) - infos = dict(reward_dist=reward_dist, - reward=reward, - velocity=angular_vel, - # traj=self._q_pos, - action=a, - q_pos=self.sim.data.qpos[0:7].ravel().copy(), - q_vel=self.sim.data.qvel[0:7].ravel().copy(), - ball_pos=ball_pos, - ball_vel=ball_vel, - success=success, - is_collided=is_collided, sim_crash=crash, - table_contact_first=int(not self.reward_function.ball_ground_contact_first)) + infos = dict( + reward_dist=reward_dist, + reward=reward, + velocity=angular_vel, + # traj=self._q_pos, + action=a, + q_pos=self.sim.data.qpos[0:7].ravel().copy(), + q_vel=self.sim.data.qvel[0:7].ravel().copy(), + ball_pos=ball_pos, + ball_vel=ball_vel, + success=success, + is_collided=is_collided, sim_crash=crash, + 
table_contact_first=int(not self.reward_function.ball_ground_contact_first) + ) infos.update(reward_infos) return ob, reward, done, infos - def check_traj_in_joint_limits(self): + def _check_traj_in_joint_limits(self): return any(self.current_pos > self.j_max) or any(self.current_pos < self.j_min) def _get_obs(self): theta = self.sim.data.qpos.flat[:7] theta_dot = self.sim.data.qvel.flat[:7] ball_pos = self.sim.data.body_xpos[self.sim.model._body_name2id["ball"]].copy() - cup_goal_diff_final = ball_pos - self.sim.data.site_xpos[self.sim.model._site_name2id["cup_goal_final_table"]].copy() + cup_goal_diff_final = ball_pos - self.sim.data.site_xpos[ + self.sim.model._site_name2id["cup_goal_final_table"]].copy() cup_goal_diff_top = ball_pos - self.sim.data.site_xpos[self.sim.model._site_name2id["cup_goal_table"]].copy() return np.concatenate([ np.cos(theta), @@ -179,17 +176,21 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle): cup_goal_diff_top, self.sim.model.body_pos[self.cup_table_id][:2].copy(), [self._steps], - ]) + ]) + + def compute_reward(self): @property def dt(self): - return super(ALRBeerBongEnv, self).dt*self.repeat_action + return super(ALRBeerBongEnv, self).dt * self.repeat_action + class ALRBeerBongEnvFixedReleaseStep(ALRBeerBongEnv): def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None): super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos) self.release_step = 62 # empirically evaluated for frame_skip=2! + class ALRBeerBongEnvStepBasedEpisodicReward(ALRBeerBongEnv): def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None): super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos) @@ -202,54 +203,21 @@ class ALRBeerBongEnvStepBasedEpisodicReward(ALRBeerBongEnv): reward = 0 done = False while not done: - sub_ob, sub_reward, done, sub_infos = super(ALRBeerBongEnvStepBasedEpisodicReward, self).step(np.zeros(a.shape)) + sub_ob, sub_reward, done, sub_infos = super(ALRBeerBongEnvStepBasedEpisodicReward, self).step( + np.zeros(a.shape)) reward += sub_reward infos = sub_infos ob = sub_ob - ob[-1] = self.release_step + 1 # Since we simulate until the end of the episode, PPO does not see the - # internal steps and thus, the observation also needs to be set correctly + ob[-1] = self.release_step + 1 # Since we simulate until the end of the episode, PPO does not see the + # internal steps and thus, the observation also needs to be set correctly return ob, reward, done, infos -# class ALRBeerBongEnvStepBasedEpisodicReward(ALRBeerBongEnv): -# def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None): -# super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos) -# self.release_step = 62 # empirically evaluated for frame_skip=2! 
-# -# def step(self, a): -# if self._steps < self.release_step: -# return super(ALRBeerBongEnvStepBasedEpisodicReward, self).step(a) -# else: -# sub_ob, sub_reward, done, sub_infos = super(ALRBeerBongEnvStepBasedEpisodicReward, self).step(np.zeros(a.shape)) -# reward = sub_reward -# infos = sub_infos -# ob = sub_ob -# ob[-1] = self.release_step + 1 # Since we simulate until the end of the episode, PPO does not see the -# # internal steps and thus, the observation also needs to be set correctly -# return ob, reward, done, infos - - class ALRBeerBongEnvStepBased(ALRBeerBongEnv): def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None): super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos) self.release_step = 62 # empirically evaluated for frame_skip=2! - # def _set_action_space(self): - # bounds = super(ALRBeerBongEnvStepBased, self)._set_action_space() - # min_bound = np.concatenate(([-1], bounds.low), dtype=bounds.dtype) - # max_bound = np.concatenate(([1], bounds.high), dtype=bounds.dtype) - # self.action_space = spaces.Box(low=min_bound, high=max_bound, dtype=bounds.dtype) - # return self.action_space - - # def step(self, a): - # self.release_step = self._steps if a[0]>=0 and self.release_step >= self._steps else self.release_step - # return super(ALRBeerBongEnvStepBased, self).step(a[1:]) - # - # def reset(self): - # ob = super(ALRBeerBongEnvStepBased, self).reset() - # self.release_step = self.ep_length + 1 - # return ob - def step(self, a): if self._steps < self.release_step: return super(ALRBeerBongEnvStepBased, self).step(a) @@ -267,9 +235,9 @@ class ALRBeerBongEnvStepBased(ALRBeerBongEnv): cup_goal_dist_top = np.linalg.norm(ball_pos - self.sim.data.site_xpos[ self.sim.model._site_name2id["cup_goal_table"]].copy()) if sub_infos['success']: - dist_rew = -cup_goal_dist_final**2 + dist_rew = -cup_goal_dist_final ** 2 else: - dist_rew = -0.5*cup_goal_dist_final**2 - cup_goal_dist_top**2 + dist_rew = -0.5 * cup_goal_dist_final ** 2 - cup_goal_dist_top ** 2 reward = reward - sub_infos['action_cost'] + dist_rew infos = sub_infos ob = sub_ob @@ -278,13 +246,13 @@ class ALRBeerBongEnvStepBased(ALRBeerBongEnv): return ob, reward, done, infos - if __name__ == "__main__": # env = ALRBeerBongEnv(rndm_goal=True) # env = ALRBeerBongEnvStepBased(frame_skip=2, rndm_goal=True) # env = ALRBeerBongEnvStepBasedEpisodicReward(frame_skip=2, rndm_goal=True) env = ALRBeerBongEnvFixedReleaseStep(frame_skip=2, rndm_goal=True) import time + env.reset() env.render("human") for i in range(1500): diff --git a/alr_envs/alr/mujoco/beerpong/beerpong_reward_staged.py b/alr_envs/alr/mujoco/beerpong/beerpong_reward_staged.py index 4629461..7e5fda6 100644 --- a/alr_envs/alr/mujoco/beerpong/beerpong_reward_staged.py +++ b/alr_envs/alr/mujoco/beerpong/beerpong_reward_staged.py @@ -29,52 +29,38 @@ class BeerPongReward: # "cup_base_table" ] - - self.ball_traj = None self.dists = None self.dists_final = None - self.costs = None self.action_costs = None - self.angle_rewards = None - self.cup_angles = None - self.cup_z_axes = None - self.collision_penalty = 500 - self.reset(None) + self.reset() self.is_initialized = False - def reset(self, noisy): - self.ball_traj = [] + def reset(self): self.dists = [] self.dists_final = [] - self.costs = [] self.action_costs = [] - self.angle_rewards = [] - self.cup_angles = [] - self.cup_z_axes = [] self.ball_ground_contact_first = False self.ball_table_contact = False self.ball_wall_contact = False 
self.ball_cup_contact = False self.ball_in_cup = False - self.dist_ground_cup = -1 # distance floor to cup if first floor contact - self.noisy_bp = noisy - self._t_min_final_dist = -1 + self.dist_ground_cup = -1 # distance floor to cup if first floor contact def initialize(self, env): if not self.is_initialized: self.is_initialized = True self.ball_id = env.sim.model._body_name2id["ball"] - self.ball_collision_id = env.sim.model._geom_name2id["ball_geom"] self.goal_id = env.sim.model._site_name2id["cup_goal_table"] self.goal_final_id = env.sim.model._site_name2id["cup_goal_final_table"] - self.cup_collision_ids = [env.sim.model._geom_name2id[name] for name in self.cup_collision_objects] self.cup_table_id = env.sim.model._body_name2id["cup_table"] + + self.ball_collision_id = env.sim.model._geom_name2id["ball_geom"] self.table_collision_id = env.sim.model._geom_name2id["table_contact_geom"] self.wall_collision_id = env.sim.model._geom_name2id["wall"] self.cup_table_collision_id = env.sim.model._geom_name2id["cup_base_table_contact"] - self.init_ball_pos_site_id = env.sim.model._site_name2id["init_ball_pos_site"] self.ground_collision_id = env.sim.model._geom_name2id["ground"] + self.cup_collision_ids = [env.sim.model._geom_name2id[name] for name in self.cup_collision_objects] self.robot_collision_ids = [env.sim.model._geom_name2id[name] for name in self.robot_collision_objects] def compute_reward(self, env, action): @@ -87,7 +73,7 @@ class BeerPongReward: self.check_contacts(env.sim) self.dists.append(np.linalg.norm(goal_pos - ball_pos)) self.dists_final.append(np.linalg.norm(goal_final_pos - ball_pos)) - self.dist_ground_cup = np.linalg.norm(ball_pos-goal_pos) \ + self.dist_ground_cup = np.linalg.norm(ball_pos - goal_pos) \ if self.ball_ground_contact_first and self.dist_ground_cup == -1 else self.dist_ground_cup action_cost = np.sum(np.square(action)) self.action_costs.append(np.copy(action_cost)) @@ -99,7 +85,7 @@ class BeerPongReward: min_dist = np.min(self.dists) final_dist = self.dists_final[-1] if self.ball_ground_contact_first: - min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 1, 0.5, 2, -4 # relative rew offset when first bounding on ground + min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 1, 0.5, 2, -4 # relative rew offset when first bounding on ground # min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 1, 0.5, 0, -6 # absolute rew offset when first bouncing on ground else: if not self.ball_in_cup: @@ -108,18 +94,18 @@ class BeerPongReward: else: min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 1, 0.5, 0, -2 else: - min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 0, 1, 0 ,0 + min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 0, 1, 0, 0 # dist_ground_cup = 1 * self.dist_ground_cup action_cost = 1e-4 * np.mean(action_cost) reward = rew_offset - min_dist_coeff * min_dist ** 2 - final_dist_coeff * final_dist ** 2 - \ - action_cost - ground_contact_dist_coeff*self.dist_ground_cup ** 2 + action_cost - ground_contact_dist_coeff * self.dist_ground_cup ** 2 # 1e-7*np.mean(action_cost) # release step punishment min_time_bound = 0.1 max_time_bound = 1.0 - release_time = env.release_step*env.dt - release_time_rew = int(release_timemax_time_bound)*(-30-10*(release_time-max_time_bound)**2) + release_time = env.release_step * env.dt + release_time_rew = int(release_time < min_time_bound) * (-30 - 10 * (release_time - min_time_bound) ** 2) \ + + 
int(release_time > max_time_bound) * (-30 - 10 * (release_time - max_time_bound) ** 2) reward += release_time_rew success = self.ball_in_cup # print('release time :', release_time) @@ -147,17 +133,17 @@ class BeerPongReward: self.table_collision_id) if not self.ball_cup_contact: self.ball_cup_contact = self._check_collision_with_set_of_objects(sim, self.ball_collision_id, - self.cup_collision_ids) + self.cup_collision_ids) if not self.ball_wall_contact: self.ball_wall_contact = self._check_collision_single_objects(sim, self.ball_collision_id, - self.wall_collision_id) + self.wall_collision_id) if not self.ball_in_cup: self.ball_in_cup = self._check_collision_single_objects(sim, self.ball_collision_id, self.cup_table_collision_id) if not self.ball_ground_contact_first: if not self.ball_table_contact and not self.ball_cup_contact and not self.ball_wall_contact and not self.ball_in_cup: self.ball_ground_contact_first = self._check_collision_single_objects(sim, self.ball_collision_id, - self.ground_collision_id) + self.ground_collision_id) def _check_collision_single_objects(self, sim, id_1, id_2): for coni in range(0, sim.data.ncon): @@ -190,4 +176,4 @@ class BeerPongReward: if collision or collision_trans: return True - return False \ No newline at end of file + return False diff --git a/alr_envs/alr/mujoco/beerpong/new_mp_wrapper.py b/alr_envs/alr/mujoco/beerpong/new_mp_wrapper.py index 2a2d4f9..53d7a1a 100644 --- a/alr_envs/alr/mujoco/beerpong/new_mp_wrapper.py +++ b/alr_envs/alr/mujoco/beerpong/new_mp_wrapper.py @@ -1,10 +1,15 @@ -from alr_envs.mp.episodic_wrapper import EpisodicWrapper -from typing import Union, Tuple +from typing import Tuple, Union + import numpy as np -import gym + +from alr_envs.mp.episodic_wrapper import EpisodicWrapper class NewMPWrapper(EpisodicWrapper): + + # def __init__(self, replanning_model): + # self.replanning_model = replanning_model + @property def current_pos(self) -> Union[float, int, np.ndarray, Tuple]: return self.env.sim.data.qpos[0:7].copy() @@ -22,26 +27,16 @@ class NewMPWrapper(EpisodicWrapper): [False] * 3, # cup_goal_diff_top [True] * 2, # xy position of cup [False] # env steps - ]) + ]) - # def set_mp_action_space(self): - # min_action_bounds, max_action_bounds = self.mp.get_param_bounds() - # if self.mp.learn_tau: - # min_action_bounds[0] = 20*self.env.dt - # max_action_bounds[0] = 260*self.env.dt - # mp_action_space = gym.spaces.Box(low=min_action_bounds.numpy(), high=max_action_bounds.numpy(), - # dtype=np.float32) - # return mp_action_space - - # def _step_callback(self, t: int, env_spec_params: Union[np.ndarray, None], step_action: np.ndarray) -> Union[np.ndarray]: - # if self.mp.learn_tau: - # return np.concatenate((step_action, np.atleast_1d(env_spec_params))) - # else: - # return step_action + def do_replanning(self, pos, vel, s, a, t, last_replan_step): + return False + # const = np.arange(0, 1000, 10) + # return bool(self.replanning_model(s)) def _episode_callback(self, action: np.ndarray) -> Tuple[np.ndarray, Union[np.ndarray, None]]: if self.mp.learn_tau: - self.env.env.release_step = action[0]/self.env.dt # Tau value + self.env.env.release_step = action[0] / self.env.dt # Tau value return action, None else: return action, None @@ -52,12 +47,3 @@ class NewMPWrapper(EpisodicWrapper): xyz[-1] = 0.840 self.env.env.model.body_pos[self.env.env.cup_table_id] = xyz return self.get_observation_from_step(self.env.env._get_obs()) - # def set_action_space(self): - # if self.mp.learn_tau: - # min_action_bounds, max_action_bounds = 
self.mp.get_param_bounds() - # min_action_bounds = np.concatenate((min_action_bounds.numpy(), [self.env.action_space.low[-1]])) - # max_action_bounds = np.concatenate((max_action_bounds.numpy(), [self.env.action_space.high[-1]])) - # self.action_space = gym.spaces.Box(low=min_action_bounds, high=max_action_bounds, dtype=np.float32) - # return self.action_space - # else: - # return super(NewMPWrapper, self).set_action_space() diff --git a/alr_envs/alr/mujoco/hopper_jump/hopper_jump.py b/alr_envs/alr/mujoco/hopper_jump/hopper_jump.py index 20ede40..146b039 100644 --- a/alr_envs/alr/mujoco/hopper_jump/hopper_jump.py +++ b/alr_envs/alr/mujoco/hopper_jump/hopper_jump.py @@ -14,20 +14,23 @@ class ALRHopperJumpEnv(HopperEnv): - exclude current positions from observatiosn is set to False """ - def __init__(self, - xml_file='hopper_jump.xml', - forward_reward_weight=1.0, - ctrl_cost_weight=1e-3, - healthy_reward=0.0, - penalty=0.0, - context=True, - terminate_when_unhealthy=False, - healthy_state_range=(-100.0, 100.0), - healthy_z_range=(0.5, float('inf')), - healthy_angle_range=(-float('inf'), float('inf')), - reset_noise_scale=5e-3, - exclude_current_positions_from_observation=False, - max_episode_steps=250): + def __init__( + self, + xml_file='hopper_jump.xml', + forward_reward_weight=1.0, + ctrl_cost_weight=1e-3, + healthy_reward=0.0, + penalty=0.0, + context=True, + terminate_when_unhealthy=False, + healthy_state_range=(-100.0, 100.0), + healthy_z_range=(0.5, float('inf')), + healthy_angle_range=(-float('inf'), float('inf')), + reset_noise_scale=5e-3, + exclude_current_positions_from_observation=False, + max_episode_steps=250 + ): + self.current_step = 0 self.max_height = 0 self.max_episode_steps = max_episode_steps @@ -47,7 +50,7 @@ class ALRHopperJumpEnv(HopperEnv): exclude_current_positions_from_observation) def step(self, action): - + self.current_step += 1 self.do_simulation(action, self.frame_skip) height_after = self.get_body_com("torso")[2] @@ -59,22 +62,14 @@ class ALRHopperJumpEnv(HopperEnv): done = False rewards = 0 if self.current_step >= self.max_episode_steps: - hight_goal_distance = -10*np.linalg.norm(self.max_height - self.goal) if self.context else self.max_height - healthy_reward = 0 if self.context else self.healthy_reward * 2 # self.current_step - height_reward = self._forward_reward_weight * hight_goal_distance # maybe move reward calculation into if structure and define two different _forward_reward_weight variables for context and episodic seperatley + hight_goal_distance = -10 * np.linalg.norm(self.max_height - self.goal) if self.context else self.max_height + healthy_reward = 0 if self.context else self.healthy_reward * 2 # self.current_step + height_reward = self._forward_reward_weight * hight_goal_distance # maybe move reward calculation into if structure and define two different _forward_reward_weight variables for context and episodic seperatley rewards = height_reward + healthy_reward - # else: - # # penalty for wrong start direction of first two joints; not needed, could be removed - # rewards = ((action[:2] > 0) * self.penalty).sum() if self.current_step < 10 else 0 - observation = self._get_obs() reward = rewards - costs - # info = { - # 'height' : height_after, - # 'max_height': self.max_height, - # 'goal' : self.goal - # } + info = { 'height': height_after, 'x_pos': site_pos_after, @@ -82,7 +77,7 @@ class ALRHopperJumpEnv(HopperEnv): 'height_rew': self.max_height, 'healthy_reward': self.healthy_reward * 2, 'healthy': self.is_healthy - } + } return 
observation, reward, done, info @@ -90,7 +85,7 @@ class ALRHopperJumpEnv(HopperEnv): return np.append(super()._get_obs(), self.goal) def reset(self): - self.goal = self.np_random.uniform(1.4, 2.16, 1)[0] # 1.3 2.3 + self.goal = self.np_random.uniform(1.4, 2.16, 1)[0] # 1.3 2.3 self.max_height = 0 self.current_step = 0 return super().reset() @@ -98,8 +93,8 @@ class ALRHopperJumpEnv(HopperEnv): # overwrite reset_model to make it deterministic def reset_model(self): - qpos = self.init_qpos # + self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nq) - qvel = self.init_qvel # + self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nv) + qpos = self.init_qpos # + self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nq) + qvel = self.init_qvel # + self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nv) self.set_state(qpos, qvel) @@ -147,7 +142,7 @@ class ALRHopperXYJumpEnv(ALRHopperJumpEnv): self.contact_with_floor = floor_contact if self.contact_dist is None and self.contact_with_floor: - self.contact_dist = np.linalg.norm(self.sim.data.site_xpos[self.model.site_name2id('foot_site')] + self.contact_dist = np.linalg.norm(self.sim.data.site_xpos[self.model.site_name2id('foot_site')] - np.array([self.goal, 0, 0])) ctrl_cost = self.control_cost(action) @@ -157,9 +152,9 @@ class ALRHopperXYJumpEnv(ALRHopperJumpEnv): rewards = 0 if self.current_step >= self.max_episode_steps: # healthy_reward = 0 if self.context else self.healthy_reward * self.current_step - healthy_reward = self.healthy_reward * 2 #* self.current_step + healthy_reward = self.healthy_reward * 2 # * self.current_step contact_dist = self.contact_dist if self.contact_dist is not None else 5 - dist_reward = self._forward_reward_weight * (-3*goal_dist + 10*self.max_height - 2*contact_dist) + dist_reward = self._forward_reward_weight * (-3 * goal_dist + 10 * self.max_height - 2 * contact_dist) rewards = dist_reward + healthy_reward observation = self._get_obs() @@ -174,7 +169,7 @@ class ALRHopperXYJumpEnv(ALRHopperJumpEnv): 'healthy_reward': self.healthy_reward * 2, 'healthy': self.is_healthy, 'contact_dist': self.contact_dist if self.contact_dist is not None else 0 - } + } return observation, reward, done, info def reset_model(self): @@ -225,25 +220,28 @@ class ALRHopperXYJumpEnv(ALRHopperJumpEnv): self.sim.model.body_pos[self.sim.model.body_name2id('goal_site_body')] = np.array([self.goal, 0, 0]) return self._get_obs() + class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv): - def __init__(self, - xml_file='hopper_jump.xml', - forward_reward_weight=1.0, - ctrl_cost_weight=1e-3, - healthy_reward=0.0, - penalty=0.0, - context=True, - terminate_when_unhealthy=False, - healthy_state_range=(-100.0, 100.0), - healthy_z_range=(0.5, float('inf')), - healthy_angle_range=(-float('inf'), float('inf')), - reset_noise_scale=5e-3, - exclude_current_positions_from_observation=False, - max_episode_steps=250, - height_scale = 10, - dist_scale = 3, - healthy_scale = 2): + def __init__( + self, + xml_file='hopper_jump.xml', + forward_reward_weight=1.0, + ctrl_cost_weight=1e-3, + healthy_reward=0.0, + penalty=0.0, + context=True, + terminate_when_unhealthy=False, + healthy_state_range=(-100.0, 100.0), + healthy_z_range=(0.5, float('inf')), + healthy_angle_range=(-float('inf'), float('inf')), + reset_noise_scale=5e-3, + exclude_current_positions_from_observation=False, + max_episode_steps=250, + height_scale=10, + dist_scale=3, + healthy_scale=2 + ): self.height_scale = height_scale 
self.dist_scale = dist_scale self.healthy_scale = healthy_scale @@ -263,15 +261,14 @@ class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv): ctrl_cost = self.control_cost(action) healthy_reward = self.healthy_reward * self.healthy_scale - height_reward = self.height_scale*height_after + height_reward = self.height_scale * height_after goal_dist = np.atleast_1d(np.linalg.norm(site_pos_after - np.array([self.goal, 0, 0], dtype=object)))[0] - goal_dist_reward = -self.dist_scale*goal_dist + goal_dist_reward = -self.dist_scale * goal_dist dist_reward = self._forward_reward_weight * (goal_dist_reward + height_reward) reward = -ctrl_cost + healthy_reward + dist_reward done = False observation = self._get_obs() - ########################################################### # This is only for logging the distance to goal when first having the contact ########################################################## @@ -297,9 +294,10 @@ class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv): 'healthy_reward': self.healthy_reward * self.healthy_reward, 'healthy': self.is_healthy, 'contact_dist': self.contact_dist if self.contact_dist is not None else 0 - } + } return observation, reward, done, info + class ALRHopperJumpRndmPosEnv(ALRHopperJumpEnv): def __init__(self, max_episode_steps=250): super(ALRHopperJumpRndmPosEnv, self).__init__(exclude_current_positions_from_observation=False, @@ -309,14 +307,14 @@ class ALRHopperJumpRndmPosEnv(ALRHopperJumpEnv): def reset_model(self): self._floor_geom_id = self.model.geom_name2id('floor') self._foot_geom_id = self.model.geom_name2id('foot_geom') - noise_low = -np.ones(self.model.nq)*self._reset_noise_scale + noise_low = -np.ones(self.model.nq) * self._reset_noise_scale noise_low[1] = 0 noise_low[2] = 0 noise_low[3] = -0.2 noise_low[4] = -0.2 noise_low[5] = -0.1 - noise_high = np.ones(self.model.nq)*self._reset_noise_scale + noise_high = np.ones(self.model.nq) * self._reset_noise_scale noise_high[1] = 0 noise_high[2] = 0 noise_high[3] = 0 @@ -325,7 +323,7 @@ class ALRHopperJumpRndmPosEnv(ALRHopperJumpEnv): rnd_vec = self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nq) # rnd_vec[2] *= 0.05 # the angle around the y axis shouldn't be too high as the agent then falls down quickly and - # can not recover + # can not recover # rnd_vec[1] = np.clip(rnd_vec[1], 0, 0.3) qpos = self.init_qpos + rnd_vec qvel = self.init_qvel @@ -341,7 +339,7 @@ class ALRHopperJumpRndmPosEnv(ALRHopperJumpEnv): self.do_simulation(action, self.frame_skip) self.contact_with_floor = self._contact_checker(self._floor_geom_id, self._foot_geom_id) if not \ - self.contact_with_floor else True + self.contact_with_floor else True height_after = self.get_body_com("torso")[2] self.max_height = max(height_after, self.max_height) if self.contact_with_floor else 0 @@ -365,12 +363,11 @@ class ALRHopperJumpRndmPosEnv(ALRHopperJumpEnv): 'height': height_after, 'max_height': self.max_height, 'goal': self.goal - } + } return observation, reward, done, info - if __name__ == '__main__': render_mode = "human" # "human" or "partial" or "final" # env = ALRHopperJumpEnv() @@ -396,4 +393,4 @@ if __name__ == '__main__': print('After ', i, ' steps, done: ', d) env.reset() - env.close() \ No newline at end of file + env.close() diff --git a/alr_envs/mp/episodic_wrapper.py b/alr_envs/mp/episodic_wrapper.py index 456c0c9..b2eb391 100644 --- a/alr_envs/mp/episodic_wrapper.py +++ b/alr_envs/mp/episodic_wrapper.py @@ -9,7 +9,7 @@ from mp_pytorch.mp.mp_interfaces import MPInterface from 
alr_envs.mp.controllers.base_controller import BaseController -class EpisodicWrapper(gym.Env, ABC): +class EpisodicWrapper(gym.Env, gym.wrappers.TransformReward, ABC): """ Base class for movement primitive based gym.Wrapper implementations. @@ -22,14 +22,19 @@ class EpisodicWrapper(gym.Env, ABC): weight_scale: Scaling parameter for the actions given to this wrapper render_mode: Equivalent to gym render mode """ - def __init__(self, - env: gym.Env, - mp: MPInterface, - controller: BaseController, - duration: float, - render_mode: str = None, - verbose: int = 1, - weight_scale: float = 1): + + def __init__( + self, + env: gym.Env, + mp: MPInterface, + controller: BaseController, + duration: float, + render_mode: str = None, + verbose: int = 1, + weight_scale: float = 1, + sequencing=True, + reward_aggregation=np.mean, + ): super().__init__() self.env = env @@ -40,11 +45,11 @@ class EpisodicWrapper(gym.Env, ABC): self.duration = duration self.traj_steps = int(duration / self.dt) self.post_traj_steps = self.env.spec.max_episode_steps - self.traj_steps + # duration = self.env.max_episode_steps * self.dt - self.controller = controller self.mp = mp self.env = env - self.verbose = verbose + self.controller = controller self.weight_scale = weight_scale # rendering @@ -52,15 +57,18 @@ class EpisodicWrapper(gym.Env, ABC): self.render_kwargs = {} self.time_steps = np.linspace(0, self.duration, self.traj_steps) self.mp.set_mp_times(self.time_steps) + # self.mp.set_mp_duration(self.time_steps, dt) # action_bounds = np.inf * np.ones((np.prod(self.mp.num_params))) - self.mp_action_space = self.set_mp_action_space() + self.mp_action_space = self.get_mp_action_space() - self.action_space = self.set_action_space() + self.action_space = self.get_action_space() self.active_obs = self.set_active_obs() self.observation_space = spaces.Box(low=self.env.observation_space.low[self.active_obs], high=self.env.observation_space.high[self.active_obs], dtype=self.env.observation_space.dtype) + self.verbose = verbose + def get_trajectory(self, action: np.ndarray) -> Tuple: # TODO: this follows the implementation of the mp_pytorch library which includes the parameters tau and delay at # the beginning of the array. @@ -69,33 +77,37 @@ class EpisodicWrapper(gym.Env, ABC): scaled_mp_params[ignore_indices:] *= self.weight_scale self.mp.set_params(np.clip(scaled_mp_params, self.mp_action_space.low, self.mp_action_space.high)) self.mp.set_boundary_conditions(bc_time=self.time_steps[:1], bc_pos=self.current_pos, bc_vel=self.current_vel) - traj_dict = self.mp.get_mp_trajs(get_pos = True, get_vel = True) + traj_dict = self.mp.get_mp_trajs(get_pos=True, get_vel=True) trajectory_tensor, velocity_tensor = traj_dict['pos'], traj_dict['vel'] trajectory = trajectory_tensor.numpy() velocity = velocity_tensor.numpy() + # TODO: Do we need this or does mp_pytorch have this? 
if self.post_traj_steps > 0: trajectory = np.vstack([trajectory, np.tile(trajectory[-1, :], [self.post_traj_steps, 1])]) velocity = np.vstack([velocity, np.zeros(shape=(self.post_traj_steps, self.mp.num_dof))]) return trajectory, velocity - def set_mp_action_space(self): + def get_mp_action_space(self): """This function can be used to set up an individual space for the parameters of the mp.""" min_action_bounds, max_action_bounds = self.mp.get_param_bounds() mp_action_space = gym.spaces.Box(low=min_action_bounds.numpy(), high=max_action_bounds.numpy(), - dtype=np.float32) + dtype=np.float32) return mp_action_space - def set_action_space(self): + def get_action_space(self): """ This function can be used to modify the action space for considering actions which are not learned via motion primitives. E.g. ball releasing time for the beer pong task. By default, it is the parameter space of the motion primitive. Only needs to be overwritten if the action space needs to be modified. """ - return self.mp_action_space + try: + return self.mp_action_space + except AttributeError: + return self.get_mp_action_space() def _episode_callback(self, action: np.ndarray) -> Tuple[np.ndarray, Union[np.ndarray, None]]: """ @@ -111,7 +123,8 @@ class EpisodicWrapper(gym.Env, ABC): """ return action, None - def _step_callback(self, t: int, env_spec_params: Union[np.ndarray, None], step_action: np.ndarray) -> Union[np.ndarray]: + def _step_callback(self, t: int, env_spec_params: Union[np.ndarray, None], step_action: np.ndarray) -> Union[ + np.ndarray]: """ This function can be used to modify the step_action with additional parameters e.g. releasing the ball in the Beerpong env. The parameters used should not be part of the motion primitive parameters. @@ -169,11 +182,15 @@ class EpisodicWrapper(gym.Env, ABC): mp_params, env_spec_params = self._episode_callback(action) trajectory, velocity = self.get_trajectory(mp_params) + # TODO + # self.time_steps = np.linspace(0, learned_duration, self.traj_steps) + # self.mp.set_mp_times(self.time_steps) + trajectory_length = len(trajectory) - if self.verbose >=2 : + if self.verbose >= 2: actions = np.zeros(shape=(trajectory_length,) + self.env.action_space.shape) observations = np.zeros(shape=(trajectory_length,) + self.env.observation_space.shape, - dtype=self.env.observation_space.dtype) + dtype=self.env.observation_space.dtype) rewards = np.zeros(shape=(trajectory_length,)) trajectory_return = 0 @@ -181,7 +198,7 @@ class EpisodicWrapper(gym.Env, ABC): for t, pos_vel in enumerate(zip(trajectory, velocity)): step_action = self.controller.get_action(pos_vel[0], pos_vel[1], self.current_pos, self.current_vel) - step_action = self._step_callback(t, env_spec_params, step_action) # include possible callback info + step_action = self._step_callback(t, env_spec_params, step_action) # include possible callback info c_action = np.clip(step_action, self.env.action_space.low, self.env.action_space.high) # print('step/clipped action ratio: ', step_action/c_action) obs, c_reward, done, info = self.env.step(c_action) @@ -197,7 +214,7 @@ class EpisodicWrapper(gym.Env, ABC): # infos['step_infos'].append(info) if self.render_mode: self.render(mode=self.render_mode, **self.render_kwargs) - if done: + if done or do_replanning(kwargs): break infos.update({k: v[:t + 1] for k, v in infos.items()}) if self.verbose >= 2: @@ -235,10 +252,10 @@ class EpisodicWrapper(gym.Env, ABC): for i in range(des_trajs.shape[1]): plt.figure(pos_fig.number) plt.subplot(des_trajs.shape[1], 1, i + 1) - 
plt.plot(np.ones(des_trajs.shape[0])*self.current_pos[i]) + plt.plot(np.ones(des_trajs.shape[0]) * self.current_pos[i]) plt.plot(des_trajs[:, i]) plt.figure(vel_fig.number) plt.subplot(des_vels.shape[1], 1, i + 1) - plt.plot(np.ones(des_trajs.shape[0])*self.current_vel[i]) + plt.plot(np.ones(des_trajs.shape[0]) * self.current_vel[i]) plt.plot(des_vels[:, i]) diff --git a/alr_envs/utils/__init__.py b/alr_envs/utils/__init__.py index bb4b0bc..a96d882 100644 --- a/alr_envs/utils/__init__.py +++ b/alr_envs/utils/__init__.py @@ -20,12 +20,13 @@ def make_dmc( environment_kwargs: dict = {}, time_limit: Union[None, float] = None, channels_first: bool = True -): + ): # Adopted from: https://github.com/denisyarats/dmc2gym/blob/master/dmc2gym/__init__.py # License: MIT # Copyright (c) 2020 Denis Yarats - assert re.match(r"\w+-\w+", id), "env_id does not have the following structure: 'domain_name-task_name'" + if not re.match(r"\w+-\w+", id): + raise ValueError("env_id does not have the following structure: 'domain_name-task_name'") domain_name, task_name = id.split("-") env_id = f'dmc_{domain_name}_{task_name}_{seed}-v1' @@ -60,7 +61,7 @@ def make_dmc( camera_id=camera_id, frame_skip=frame_skip, channels_first=channels_first, - ), + ), max_episode_steps=max_episode_steps, - ) + ) return gym.make(env_id) diff --git a/alr_envs/utils/make_env_helpers.py b/alr_envs/utils/make_env_helpers.py index 301a5aa..dbefeee 100644 --- a/alr_envs/utils/make_env_helpers.py +++ b/alr_envs/utils/make_env_helpers.py @@ -55,7 +55,7 @@ def make(env_id: str, seed, **kwargs): Returns: Gym environment """ - if any([det_pmp in env_id for det_pmp in ["DetPMP", "detpmp"]]): + if any(deprec in env_id for deprec in ["DetPMP", "detpmp"]): warnings.warn("DetPMP is deprecated and converted to ProMP") env_id = env_id.replace("DetPMP", "ProMP") env_id = env_id.replace("detpmp", "promp") @@ -92,14 +92,17 @@ def make(env_id: str, seed, **kwargs): from alr_envs import make_dmc env = make_dmc(env_id, seed=seed, **kwargs) - assert env.base_step_limit == env.spec.max_episode_steps, \ - f"The specified 'episode_length' of {env.spec.max_episode_steps} steps for gym is different from " \ - f"the DMC environment specification of {env.base_step_limit} steps." + if not env.base_step_limit == env.spec.max_episode_steps: + raise ValueError(f"The specified 'episode_length' of {env.spec.max_episode_steps} steps for gym " + f"is different from the DMC environment specification of {env.base_step_limit} steps.") return env -def _make_wrapped_env(env_id: str, wrappers: Iterable[Type[gym.Wrapper]], mp: MPInterface, controller: BaseController, - ep_wrapper_kwargs: Mapping, seed=1, **kwargs): + +def _make_wrapped_env( + env_id: str, wrappers: Iterable[Type[gym.Wrapper]], mp: MPInterface, controller: BaseController, + ep_wrapper_kwargs: Mapping, seed=1, **kwargs + ): """ Helper function for creating a wrapped gym environment using MPs. It adds all provided wrappers to the specified environment and verifies at least one MPEnvWrapper is @@ -120,13 +123,14 @@ def _make_wrapped_env(env_id: str, wrappers: Iterable[Type[gym.Wrapper]], mp: MP # only wrap the environment if not EpisodicWrapper, e.g. 
for vision if not issubclass(w, EpisodicWrapper): _env = w(_env) - else: # if EpisodicWrapper, use specific constructor + else: # if EpisodicWrapper, use specific constructor has_episodic_wrapper = True _env = w(env=_env, mp=mp, controller=controller, **ep_wrapper_kwargs) - assert has_episodic_wrapper, \ - "At least one MPEnvWrapper is required in order to leverage motion primitive environments." + if not has_episodic_wrapper: + raise ValueError("An EpisodicWrapper is required in order to leverage movement primitive environments.") return _env + def make_mp_from_kwargs( env_id: str, wrappers: Iterable, ep_wrapper_kwargs: MutableMapping, mp_kwargs: MutableMapping, controller_kwargs: MutableMapping, phase_kwargs: MutableMapping, basis_kwargs: MutableMapping, seed=1, @@ -152,7 +156,7 @@ def make_mp_from_kwargs( _verify_time_limit(mp_kwargs.get("duration", None), kwargs.get("time_limit", None)) dummy_env = make(env_id, seed) if ep_wrapper_kwargs.get('duration', None) is None: - ep_wrapper_kwargs['duration'] = dummy_env.spec.max_episode_steps*dummy_env.dt + ep_wrapper_kwargs['duration'] = dummy_env.spec.max_episode_steps * dummy_env.dt if phase_kwargs.get('tau', None) is None: phase_kwargs['tau'] = ep_wrapper_kwargs['duration'] mp_kwargs['action_dim'] = mp_kwargs.get('action_dim', np.prod(dummy_env.action_space.shape).item()) @@ -207,10 +211,11 @@ def make_mp_env_helper(**kwargs): phase_kwargs = kwargs.pop("phase_generator_kwargs") basis_kwargs = kwargs.pop("basis_generator_kwargs") - return make_mp_from_kwargs(env_id=kwargs.pop("name"), wrappers=wrappers, ep_wrapper_kwargs=ep_wrapper_kwargs, + return make_mp_from_kwargs(env_id=kwargs.pop("name"), wrappers=wrappers, ep_wrapper_kwargs=ep_wrapper_kwargs, mp_kwargs=mp_kwargs, controller_kwargs=contr_kwargs, phase_kwargs=phase_kwargs, basis_kwargs=basis_kwargs, **kwargs, seed=seed) + def make_dmp_env(env_id: str, wrappers: Iterable, seed=1, mp_kwargs={}, **kwargs): """ This can also be used standalone for manually building a custom DMP environment.