first clean up and some non working ideas sketched
parent 7bd9848c31
commit 9ad6fbe712
@@ -1,26 +1,32 @@
## ALR Robotics Control Environments

This project offers a large variety of reinforcement learning environments under the unifying interface of [OpenAI gym](https://gym.openai.com/).
Besides, we also provide support (under the OpenAI interface) for the benchmark suites
We provide support (under the OpenAI interface) for the benchmark suites
[DeepMind Control](https://deepmind.com/research/publications/2020/dm-control-Software-and-Tasks-for-Continuous-Control)
(DMC) and [Metaworld](https://meta-world.github.io/). Custom (Mujoco) gym environments can be created according
to [this guide](https://github.com/openai/gym/blob/master/docs/creating-environments.md). Unlike existing libraries, we
additionally support to control agents with Dynamic Movement Primitives (DMPs) and Probabilistic Movement Primitives (ProMP,
we only consider the mean usually).
(DMC) and [Metaworld](https://meta-world.github.io/).
Custom (Mujoco) gym environments can be created according
to [this guide](https://www.gymlibrary.ml/content/environment_creation/).
Unlike existing libraries, we additionally support controlling agents with movement primitives, such as
Dynamic Movement Primitives (DMPs) and Probabilistic Movement Primitives (ProMP; we usually only consider the mean).

## Motion Primitive Environments (Episodic environments)
## Movement Primitive Environments (Episode-Based/Black-Box Environments)

Unlike step-based environments, motion primitive (MP) environments are closer related to stochastic search, black box
optimization, and methods that are often used in robotics. MP environments are trajectory-based and always execute a full
trajectory, which is generated by a Dynamic Motion Primitive (DMP) or a Probabilistic Motion Primitive (ProMP). The
generated trajectory is translated into individual step-wise actions by a controller. The exact choice of controller is,
however, dependent on the type of environment. We currently support position, velocity, and PD-Controllers for position,
velocity, and torque control, respectively. The goal of all MP environments is still to learn a policy. Yet, an action
Unlike step-based environments, movement primitive (MP) environments are more closely related to stochastic search, black-box
optimization, and methods that are often used in traditional robotics and control.
MP environments are episode-based and always execute a full trajectory, which is generated by a trajectory generator,
such as a Dynamic Movement Primitive (DMP) or a Probabilistic Movement Primitive (ProMP).
The generated trajectory is translated into individual step-wise actions by a trajectory tracking controller.
The exact choice of controller is, however, dependent on the type of environment.
We currently support position, velocity, and PD-controllers for position, velocity, and torque control, respectively,
as well as a special controller for the MetaWorld control suite.
The goal of all MP environments is still to learn an optimal policy. Yet, an action
represents the parametrization of the motion primitives to generate a suitable trajectory. Additionally, in this
framework we support all of this also for the contextual setting, for which we expose all changing substates of the
task as a single observation in the beginning. This requires to predict a new action/MP parametrization for each
trajectory. All environments provide next to the cumulative episode reward all collected information from each
step as part of the info dictionary. This information should, however, mainly be used for debugging and logging.
framework we also support the contextual setting, i.e. we expose a subset of the observation space
as a single context at the beginning of the episode. This requires predicting a new action/MP parametrization for each
context.
Next to the cumulative episode reward, all environments provide all information collected at each
step as part of the info dictionary. This information is, however, mainly meant for debugging and logging
and not for training.

|Key| Description|
|---|---|
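To make the episode-based interface described above concrete, here is a minimal usage sketch. The environment id and the registering import are illustrative assumptions and are not taken from this commit.

```python
import alr_envs  # assumed to register both step-based and MP variants on import

# Illustrative id; any registered ProMP/DMP variant is used the same way.
env = alr_envs.make("ALRBeerPongProMP-v0", seed=1)

obs = env.reset()                    # observation/context, shown once per episode
action = env.action_space.sample()   # one action = one full MP parametrization
obs, episode_reward, done, info = env.step(action)  # executes the whole trajectory
# `info` aggregates per-step values; intended for logging/debugging, not training.
```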
@@ -4,10 +4,11 @@ import numpy as np
from gym.envs.mujoco import MujocoEnv


class ALRBallInACupEnv(MujocoEnv, utils.EzPickle):
    def __init__(self, n_substeps=4, apply_gravity_comp=True, simplified: bool = False,
                 reward_type: str = None, context: np.ndarray = None):
    def __init__(
            self, n_substeps=4, apply_gravity_comp=True, simplified: bool = False,
            reward_type: str = None, context: np.ndarray = None
    ):
        utils.EzPickle.__init__(**locals())
        self._steps = 0

@@ -23,9 +24,7 @@ class ALRBallInACupEnv(MujocoEnv, utils.EzPickle):

        self.context = context

        alr_mujoco_env.AlrMujocoEnv.__init__(self,
                                             self.xml_path,
                                             apply_gravity_comp=apply_gravity_comp,
        alr_mujoco_env.AlrMujocoEnv.__init__(self, self.xml_path, apply_gravity_comp=apply_gravity_comp,
                                             n_substeps=n_substeps)
        self._start_pos = np.array([0.0, 0.58760536, 0.0, 1.36004913, 0.0, -0.32072943, -1.57])
        self._start_vel = np.zeros(7)

@@ -67,7 +67,7 @@ class BallInACupReward(alr_reward_fct.AlrReward):
        action_cost = np.sum(np.square(action))
        self.action_costs.append(action_cost)

        self._is_collided = self.check_collision(self.env.sim) or self.env.check_traj_in_joint_limits()
        self._is_collided = self.check_collision(self.env.sim) or self.env._check_traj_in_joint_limits()

        if self.env._steps == self.env.ep_length - 1 or self._is_collided:
            t_min_dist = np.argmin(self.dists)
@@ -1,11 +1,11 @@
import mujoco_py.builder
import os

import mujoco_py.builder
import numpy as np
from gym import utils, spaces
from gym import utils
from gym.envs.mujoco import MujocoEnv
from alr_envs.alr.mujoco.beerpong.beerpong_reward_staged import BeerPongReward

from alr_envs.alr.mujoco.beerpong.beerpong_reward_staged import BeerPongReward

CUP_POS_MIN = np.array([-1.42, -4.05])
CUP_POS_MAX = np.array([1.42, -1.25])
@@ -18,12 +18,14 @@ CUP_POS_MAX = np.array([1.42, -1.25])
# CUP_POS_MIN = np.array([-0.16, -2.2])
# CUP_POS_MAX = np.array([0.16, -1.7])


class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
    def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False,
                 rndm_goal=False, cup_goal_pos=None):
    def __init__(
            self, frame_skip=1, apply_gravity_comp=True, noisy=False,
            rndm_goal=False, cup_goal_pos=None
    ):

        cup_goal_pos = np.array(cup_goal_pos if cup_goal_pos is not None else [-0.3, -1.2, 0.840])
        if cup_goal_pos.shape[0]==2:
        if cup_goal_pos.shape[0] == 2:
            cup_goal_pos = np.insert(cup_goal_pos, 2, 0.840)
        self.cup_goal_pos = np.array(cup_goal_pos)

@@ -50,7 +52,7 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
        # self._release_step = 130  # time step of ball release
        self.release_step = 100  # time step of ball release

        self.ep_length = 600//frame_skip
        self.ep_length = 600 // frame_skip
        self.cup_table_id = 10

        if noisy:
@@ -71,14 +73,6 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
    def start_vel(self):
        return self._start_vel

    @property
    def current_pos(self):
        return self.sim.data.qpos[0:7].copy()

    @property
    def current_vel(self):
        return self.sim.data.qvel[0:7].copy()

    def reset(self):
        self.reward_function.reset(self.add_noise)
        return super().reset()
@@ -147,7 +141,8 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
            ball_pos = np.zeros(3)
            ball_vel = np.zeros(3)

        infos = dict(reward_dist=reward_dist,
        infos = dict(
            reward_dist=reward_dist,
            reward=reward,
            velocity=angular_vel,
            # traj=self._q_pos,
@@ -158,18 +153,20 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
            ball_vel=ball_vel,
            success=success,
            is_collided=is_collided, sim_crash=crash,
            table_contact_first=int(not self.reward_function.ball_ground_contact_first))
            table_contact_first=int(not self.reward_function.ball_ground_contact_first)
        )
        infos.update(reward_infos)
        return ob, reward, done, infos

    def check_traj_in_joint_limits(self):
    def _check_traj_in_joint_limits(self):
        return any(self.current_pos > self.j_max) or any(self.current_pos < self.j_min)

    def _get_obs(self):
        theta = self.sim.data.qpos.flat[:7]
        theta_dot = self.sim.data.qvel.flat[:7]
        ball_pos = self.sim.data.body_xpos[self.sim.model._body_name2id["ball"]].copy()
        cup_goal_diff_final = ball_pos - self.sim.data.site_xpos[self.sim.model._site_name2id["cup_goal_final_table"]].copy()
        cup_goal_diff_final = ball_pos - self.sim.data.site_xpos[
            self.sim.model._site_name2id["cup_goal_final_table"]].copy()
        cup_goal_diff_top = ball_pos - self.sim.data.site_xpos[self.sim.model._site_name2id["cup_goal_table"]].copy()
        return np.concatenate([
            np.cos(theta),
@@ -181,15 +178,19 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
            [self._steps],
        ])

    def compute_reward(self):

    @property
    def dt(self):
        return super(ALRBeerBongEnv, self).dt*self.repeat_action
        return super(ALRBeerBongEnv, self).dt * self.repeat_action


class ALRBeerBongEnvFixedReleaseStep(ALRBeerBongEnv):
    def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None):
        super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos)
        self.release_step = 62  # empirically evaluated for frame_skip=2!


class ALRBeerBongEnvStepBasedEpisodicReward(ALRBeerBongEnv):
    def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None):
        super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos)
@@ -202,7 +203,8 @@ class ALRBeerBongEnvStepBasedEpisodicReward(ALRBeerBongEnv):
            reward = 0
            done = False
            while not done:
                sub_ob, sub_reward, done, sub_infos = super(ALRBeerBongEnvStepBasedEpisodicReward, self).step(np.zeros(a.shape))
                sub_ob, sub_reward, done, sub_infos = super(ALRBeerBongEnvStepBasedEpisodicReward, self).step(
                    np.zeros(a.shape))
                reward += sub_reward
            infos = sub_infos
            ob = sub_ob
@@ -211,45 +213,11 @@ class ALRBeerBongEnvStepBasedEpisodicReward(ALRBeerBongEnv):
            return ob, reward, done, infos

# class ALRBeerBongEnvStepBasedEpisodicReward(ALRBeerBongEnv):
#     def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None):
#         super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos)
#         self.release_step = 62  # empirically evaluated for frame_skip=2!
#
#     def step(self, a):
#         if self._steps < self.release_step:
#             return super(ALRBeerBongEnvStepBasedEpisodicReward, self).step(a)
#         else:
#             sub_ob, sub_reward, done, sub_infos = super(ALRBeerBongEnvStepBasedEpisodicReward, self).step(np.zeros(a.shape))
#             reward = sub_reward
#             infos = sub_infos
#             ob = sub_ob
#             ob[-1] = self.release_step + 1  # Since we simulate until the end of the episode, PPO does not see the
#             # internal steps and thus, the observation also needs to be set correctly
#             return ob, reward, done, infos


class ALRBeerBongEnvStepBased(ALRBeerBongEnv):
    def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None):
        super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos)
        self.release_step = 62  # empirically evaluated for frame_skip=2!

    # def _set_action_space(self):
    #     bounds = super(ALRBeerBongEnvStepBased, self)._set_action_space()
    #     min_bound = np.concatenate(([-1], bounds.low), dtype=bounds.dtype)
    #     max_bound = np.concatenate(([1], bounds.high), dtype=bounds.dtype)
    #     self.action_space = spaces.Box(low=min_bound, high=max_bound, dtype=bounds.dtype)
    #     return self.action_space

    # def step(self, a):
    #     self.release_step = self._steps if a[0]>=0 and self.release_step >= self._steps else self.release_step
    #     return super(ALRBeerBongEnvStepBased, self).step(a[1:])
    #
    # def reset(self):
    #     ob = super(ALRBeerBongEnvStepBased, self).reset()
    #     self.release_step = self.ep_length + 1
    #     return ob

    def step(self, a):
        if self._steps < self.release_step:
            return super(ALRBeerBongEnvStepBased, self).step(a)
@@ -267,9 +235,9 @@ class ALRBeerBongEnvStepBased(ALRBeerBongEnv):
                cup_goal_dist_top = np.linalg.norm(ball_pos - self.sim.data.site_xpos[
                    self.sim.model._site_name2id["cup_goal_table"]].copy())
                if sub_infos['success']:
                    dist_rew = -cup_goal_dist_final**2
                    dist_rew = -cup_goal_dist_final ** 2
                else:
                    dist_rew = -0.5*cup_goal_dist_final**2 - cup_goal_dist_top**2
                    dist_rew = -0.5 * cup_goal_dist_final ** 2 - cup_goal_dist_top ** 2
                reward = reward - sub_infos['action_cost'] + dist_rew
            infos = sub_infos
            ob = sub_ob
@@ -278,13 +246,13 @@ class ALRBeerBongEnvStepBased(ALRBeerBongEnv):
            return ob, reward, done, infos


if __name__ == "__main__":
    # env = ALRBeerBongEnv(rndm_goal=True)
    # env = ALRBeerBongEnvStepBased(frame_skip=2, rndm_goal=True)
    # env = ALRBeerBongEnvStepBasedEpisodicReward(frame_skip=2, rndm_goal=True)
    env = ALRBeerBongEnvFixedReleaseStep(frame_skip=2, rndm_goal=True)
    import time

    env.reset()
    env.render("human")
    for i in range(1500):
@@ -29,52 +29,38 @@ class BeerPongReward:
            # "cup_base_table"
        ]

        self.ball_traj = None
        self.dists = None
        self.dists_final = None
        self.costs = None
        self.action_costs = None
        self.angle_rewards = None
        self.cup_angles = None
        self.cup_z_axes = None
        self.collision_penalty = 500
        self.reset(None)
        self.reset()
        self.is_initialized = False

    def reset(self, noisy):
        self.ball_traj = []
    def reset(self):
        self.dists = []
        self.dists_final = []
        self.costs = []
        self.action_costs = []
        self.angle_rewards = []
        self.cup_angles = []
        self.cup_z_axes = []
        self.ball_ground_contact_first = False
        self.ball_table_contact = False
        self.ball_wall_contact = False
        self.ball_cup_contact = False
        self.ball_in_cup = False
        self.dist_ground_cup = -1  # distance floor to cup if first floor contact
        self.noisy_bp = noisy
        self._t_min_final_dist = -1

    def initialize(self, env):

        if not self.is_initialized:
            self.is_initialized = True
            self.ball_id = env.sim.model._body_name2id["ball"]
            self.ball_collision_id = env.sim.model._geom_name2id["ball_geom"]
            self.goal_id = env.sim.model._site_name2id["cup_goal_table"]
            self.goal_final_id = env.sim.model._site_name2id["cup_goal_final_table"]
            self.cup_collision_ids = [env.sim.model._geom_name2id[name] for name in self.cup_collision_objects]
            self.cup_table_id = env.sim.model._body_name2id["cup_table"]

            self.ball_collision_id = env.sim.model._geom_name2id["ball_geom"]
            self.table_collision_id = env.sim.model._geom_name2id["table_contact_geom"]
            self.wall_collision_id = env.sim.model._geom_name2id["wall"]
            self.cup_table_collision_id = env.sim.model._geom_name2id["cup_base_table_contact"]
            self.init_ball_pos_site_id = env.sim.model._site_name2id["init_ball_pos_site"]
            self.ground_collision_id = env.sim.model._geom_name2id["ground"]
            self.cup_collision_ids = [env.sim.model._geom_name2id[name] for name in self.cup_collision_objects]
            self.robot_collision_ids = [env.sim.model._geom_name2id[name] for name in self.robot_collision_objects]

    def compute_reward(self, env, action):
@@ -87,7 +73,7 @@ class BeerPongReward:
        self.check_contacts(env.sim)
        self.dists.append(np.linalg.norm(goal_pos - ball_pos))
        self.dists_final.append(np.linalg.norm(goal_final_pos - ball_pos))
        self.dist_ground_cup = np.linalg.norm(ball_pos-goal_pos) \
        self.dist_ground_cup = np.linalg.norm(ball_pos - goal_pos) \
            if self.ball_ground_contact_first and self.dist_ground_cup == -1 else self.dist_ground_cup
        action_cost = np.sum(np.square(action))
        self.action_costs.append(np.copy(action_cost))
@@ -108,18 +94,18 @@ class BeerPongReward:
            else:
                min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 1, 0.5, 0, -2
        else:
            min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 0, 1, 0 ,0
            min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 0, 1, 0, 0
            # dist_ground_cup = 1 * self.dist_ground_cup
        action_cost = 1e-4 * np.mean(action_cost)
        reward = rew_offset - min_dist_coeff * min_dist ** 2 - final_dist_coeff * final_dist ** 2 - \
            action_cost - ground_contact_dist_coeff*self.dist_ground_cup ** 2
            action_cost - ground_contact_dist_coeff * self.dist_ground_cup ** 2
        # 1e-7*np.mean(action_cost)
        # release step punishment
        min_time_bound = 0.1
        max_time_bound = 1.0
        release_time = env.release_step*env.dt
        release_time_rew = int(release_time<min_time_bound)*(-30-10*(release_time-min_time_bound)**2) \
            +int(release_time>max_time_bound)*(-30-10*(release_time-max_time_bound)**2)
        release_time = env.release_step * env.dt
        release_time_rew = int(release_time < min_time_bound) * (-30 - 10 * (release_time - min_time_bound) ** 2) \
            + int(release_time > max_time_bound) * (-30 - 10 * (release_time - max_time_bound) ** 2)
        reward += release_time_rew
        success = self.ball_in_cup
        # print('release time :', release_time)
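As a side note on the release-step punishment above: the term only penalizes release times outside a fixed window. A small, self-contained restatement of that term, using the same constants as the hunk above (everything else is illustrative):

```python
def release_time_penalty(release_time: float,
                         min_time_bound: float = 0.1,
                         max_time_bound: float = 1.0) -> float:
    """Quadratic penalty of -30 - 10 * delta**2 outside [min_time_bound, max_time_bound], else 0."""
    penalty = 0.0
    if release_time < min_time_bound:
        penalty += -30 - 10 * (release_time - min_time_bound) ** 2
    if release_time > max_time_bound:
        penalty += -30 - 10 * (release_time - max_time_bound) ** 2
    return penalty
```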
@@ -1,10 +1,15 @@
from alr_envs.mp.episodic_wrapper import EpisodicWrapper
from typing import Union, Tuple
from typing import Tuple, Union

import numpy as np
import gym

from alr_envs.mp.episodic_wrapper import EpisodicWrapper


class NewMPWrapper(EpisodicWrapper):

    # def __init__(self, replanning_model):
    #     self.replanning_model = replanning_model

    @property
    def current_pos(self) -> Union[float, int, np.ndarray, Tuple]:
        return self.env.sim.data.qpos[0:7].copy()
@@ -24,24 +29,14 @@ class NewMPWrapper(EpisodicWrapper):
            [False]  # env steps
        ])

    # def set_mp_action_space(self):
    #     min_action_bounds, max_action_bounds = self.mp.get_param_bounds()
    #     if self.mp.learn_tau:
    #         min_action_bounds[0] = 20*self.env.dt
    #         max_action_bounds[0] = 260*self.env.dt
    #     mp_action_space = gym.spaces.Box(low=min_action_bounds.numpy(), high=max_action_bounds.numpy(),
    #                                      dtype=np.float32)
    #     return mp_action_space

    # def _step_callback(self, t: int, env_spec_params: Union[np.ndarray, None], step_action: np.ndarray) -> Union[np.ndarray]:
    #     if self.mp.learn_tau:
    #         return np.concatenate((step_action, np.atleast_1d(env_spec_params)))
    #     else:
    #         return step_action
    def do_replanning(self, pos, vel, s, a, t, last_replan_step):
        return False
        # const = np.arange(0, 1000, 10)
        # return bool(self.replanning_model(s))

    def _episode_callback(self, action: np.ndarray) -> Tuple[np.ndarray, Union[np.ndarray, None]]:
        if self.mp.learn_tau:
            self.env.env.release_step = action[0]/self.env.dt  # Tau value
            self.env.env.release_step = action[0] / self.env.dt  # Tau value
            return action, None
        else:
            return action, None
@@ -52,12 +47,3 @@ class NewMPWrapper(EpisodicWrapper):
            xyz[-1] = 0.840
            self.env.env.model.body_pos[self.env.env.cup_table_id] = xyz
            return self.get_observation_from_step(self.env.env._get_obs())
    # def set_action_space(self):
    #     if self.mp.learn_tau:
    #         min_action_bounds, max_action_bounds = self.mp.get_param_bounds()
    #         min_action_bounds = np.concatenate((min_action_bounds.numpy(), [self.env.action_space.low[-1]]))
    #         max_action_bounds = np.concatenate((max_action_bounds.numpy(), [self.env.action_space.high[-1]]))
    #         self.action_space = gym.spaces.Box(low=min_action_bounds, high=max_action_bounds, dtype=np.float32)
    #         return self.action_space
    #     else:
    #         return super(NewMPWrapper, self).set_action_space()
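A note on the `_episode_callback` above: when the MP learns its duration (`learn_tau`), the first entry of the parameter vector is a time in seconds, and dividing it by the simulation `dt` turns it into the step index at which the ball is released. A minimal sketch of that conversion; the names follow the hunk above, the numbers are made up:

```python
# Illustrative values only.
dt = 0.02                  # simulation time step of the wrapped env (assumed)
tau = 0.84                 # learned duration in seconds, i.e. action[0]
release_step = tau / dt    # -> 42.0, the step at which the env releases the ball
```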
@@ -14,7 +14,8 @@ class ALRHopperJumpEnv(HopperEnv):
    - exclude current positions from observatiosn is set to False
    """

    def __init__(self,
    def __init__(
            self,
            xml_file='hopper_jump.xml',
            forward_reward_weight=1.0,
            ctrl_cost_weight=1e-3,
@@ -27,7 +28,9 @@ class ALRHopperJumpEnv(HopperEnv):
            healthy_angle_range=(-float('inf'), float('inf')),
            reset_noise_scale=5e-3,
            exclude_current_positions_from_observation=False,
            max_episode_steps=250):
            max_episode_steps=250
    ):

        self.current_step = 0
        self.max_height = 0
        self.max_episode_steps = max_episode_steps
@@ -59,22 +62,14 @@ class ALRHopperJumpEnv(HopperEnv):
        done = False
        rewards = 0
        if self.current_step >= self.max_episode_steps:
            hight_goal_distance = -10*np.linalg.norm(self.max_height - self.goal) if self.context else self.max_height
            hight_goal_distance = -10 * np.linalg.norm(self.max_height - self.goal) if self.context else self.max_height
            healthy_reward = 0 if self.context else self.healthy_reward * 2  # self.current_step
            height_reward = self._forward_reward_weight * hight_goal_distance  # maybe move reward calculation into if structure and define two different _forward_reward_weight variables for context and episodic seperatley
            rewards = height_reward + healthy_reward

        # else:
        #     # penalty for wrong start direction of first two joints; not needed, could be removed
        #     rewards = ((action[:2] > 0) * self.penalty).sum() if self.current_step < 10 else 0

        observation = self._get_obs()
        reward = rewards - costs
        # info = {
        #     'height' : height_after,
        #     'max_height': self.max_height,
        #     'goal' : self.goal
        # }

        info = {
            'height': height_after,
            'x_pos': site_pos_after,
@@ -157,9 +152,9 @@ class ALRHopperXYJumpEnv(ALRHopperJumpEnv):
        rewards = 0
        if self.current_step >= self.max_episode_steps:
            # healthy_reward = 0 if self.context else self.healthy_reward * self.current_step
            healthy_reward = self.healthy_reward * 2  #* self.current_step
            healthy_reward = self.healthy_reward * 2  # * self.current_step
            contact_dist = self.contact_dist if self.contact_dist is not None else 5
            dist_reward = self._forward_reward_weight * (-3*goal_dist + 10*self.max_height - 2*contact_dist)
            dist_reward = self._forward_reward_weight * (-3 * goal_dist + 10 * self.max_height - 2 * contact_dist)
            rewards = dist_reward + healthy_reward

        observation = self._get_obs()
@@ -225,9 +220,11 @@ class ALRHopperXYJumpEnv(ALRHopperJumpEnv):
        self.sim.model.body_pos[self.sim.model.body_name2id('goal_site_body')] = np.array([self.goal, 0, 0])
        return self._get_obs()


class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv):

    def __init__(self,
    def __init__(
            self,
            xml_file='hopper_jump.xml',
            forward_reward_weight=1.0,
            ctrl_cost_weight=1e-3,
@@ -241,9 +238,10 @@ class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv):
            reset_noise_scale=5e-3,
            exclude_current_positions_from_observation=False,
            max_episode_steps=250,
            height_scale = 10,
            dist_scale = 3,
            healthy_scale = 2):
            height_scale=10,
            dist_scale=3,
            healthy_scale=2
    ):
        self.height_scale = height_scale
        self.dist_scale = dist_scale
        self.healthy_scale = healthy_scale
@@ -263,15 +261,14 @@ class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv):
        ctrl_cost = self.control_cost(action)

        healthy_reward = self.healthy_reward * self.healthy_scale
        height_reward = self.height_scale*height_after
        height_reward = self.height_scale * height_after
        goal_dist = np.atleast_1d(np.linalg.norm(site_pos_after - np.array([self.goal, 0, 0], dtype=object)))[0]
        goal_dist_reward = -self.dist_scale*goal_dist
        goal_dist_reward = -self.dist_scale * goal_dist
        dist_reward = self._forward_reward_weight * (goal_dist_reward + height_reward)
        reward = -ctrl_cost + healthy_reward + dist_reward
        done = False
        observation = self._get_obs()

        ###########################################################
        # This is only for logging the distance to goal when first having the contact
        ##########################################################
@@ -300,6 +297,7 @@ class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv):
        }
        return observation, reward, done, info


class ALRHopperJumpRndmPosEnv(ALRHopperJumpEnv):
    def __init__(self, max_episode_steps=250):
        super(ALRHopperJumpRndmPosEnv, self).__init__(exclude_current_positions_from_observation=False,
@@ -309,14 +307,14 @@ class ALRHopperJumpRndmPosEnv(ALRHopperJumpEnv):
    def reset_model(self):
        self._floor_geom_id = self.model.geom_name2id('floor')
        self._foot_geom_id = self.model.geom_name2id('foot_geom')
        noise_low = -np.ones(self.model.nq)*self._reset_noise_scale
        noise_low = -np.ones(self.model.nq) * self._reset_noise_scale
        noise_low[1] = 0
        noise_low[2] = 0
        noise_low[3] = -0.2
        noise_low[4] = -0.2
        noise_low[5] = -0.1

        noise_high = np.ones(self.model.nq)*self._reset_noise_scale
        noise_high = np.ones(self.model.nq) * self._reset_noise_scale
        noise_high[1] = 0
        noise_high[2] = 0
        noise_high[3] = 0
@@ -370,7 +368,6 @@ class ALRHopperJumpRndmPosEnv(ALRHopperJumpEnv):
        return observation, reward, done, info


if __name__ == '__main__':
    render_mode = "human"  # "human" or "partial" or "final"
    # env = ALRHopperJumpEnv()
@@ -9,7 +9,7 @@ from mp_pytorch.mp.mp_interfaces import MPInterface
from alr_envs.mp.controllers.base_controller import BaseController


class EpisodicWrapper(gym.Env, ABC):
class EpisodicWrapper(gym.Env, gym.wrappers.TransformReward, ABC):
    """
    Base class for movement primitive based gym.Wrapper implementations.

@@ -22,14 +22,19 @@ class EpisodicWrapper(gym.Env, ABC):
        weight_scale: Scaling parameter for the actions given to this wrapper
        render_mode: Equivalent to gym render mode
    """
    def __init__(self,

    def __init__(
            self,
            env: gym.Env,
            mp: MPInterface,
            controller: BaseController,
            duration: float,
            render_mode: str = None,
            verbose: int = 1,
            weight_scale: float = 1):
            weight_scale: float = 1,
            sequencing=True,
            reward_aggregation=np.mean,
    ):
        super().__init__()

        self.env = env
@@ -40,11 +45,11 @@ class EpisodicWrapper(gym.Env, ABC):
        self.duration = duration
        self.traj_steps = int(duration / self.dt)
        self.post_traj_steps = self.env.spec.max_episode_steps - self.traj_steps
        # duration = self.env.max_episode_steps * self.dt

        self.controller = controller
        self.mp = mp
        self.env = env
        self.verbose = verbose
        self.controller = controller
        self.weight_scale = weight_scale

        # rendering
@@ -52,15 +57,18 @@ class EpisodicWrapper(gym.Env, ABC):
        self.render_kwargs = {}
        self.time_steps = np.linspace(0, self.duration, self.traj_steps)
        self.mp.set_mp_times(self.time_steps)
        # self.mp.set_mp_duration(self.time_steps, dt)
        # action_bounds = np.inf * np.ones((np.prod(self.mp.num_params)))
        self.mp_action_space = self.set_mp_action_space()
        self.mp_action_space = self.get_mp_action_space()

        self.action_space = self.set_action_space()
        self.action_space = self.get_action_space()
        self.active_obs = self.set_active_obs()
        self.observation_space = spaces.Box(low=self.env.observation_space.low[self.active_obs],
                                            high=self.env.observation_space.high[self.active_obs],
                                            dtype=self.env.observation_space.dtype)

        self.verbose = verbose

    def get_trajectory(self, action: np.ndarray) -> Tuple:
        # TODO: this follows the implementation of the mp_pytorch library which includes the parameters tau and delay at
        # the beginning of the array.
@@ -69,33 +77,37 @@ class EpisodicWrapper(gym.Env, ABC):
        scaled_mp_params[ignore_indices:] *= self.weight_scale
        self.mp.set_params(np.clip(scaled_mp_params, self.mp_action_space.low, self.mp_action_space.high))
        self.mp.set_boundary_conditions(bc_time=self.time_steps[:1], bc_pos=self.current_pos, bc_vel=self.current_vel)
        traj_dict = self.mp.get_mp_trajs(get_pos = True, get_vel = True)
        traj_dict = self.mp.get_mp_trajs(get_pos=True, get_vel=True)
        trajectory_tensor, velocity_tensor = traj_dict['pos'], traj_dict['vel']

        trajectory = trajectory_tensor.numpy()
        velocity = velocity_tensor.numpy()

        # TODO: Do we need this or does mp_pytorch have this?
        if self.post_traj_steps > 0:
            trajectory = np.vstack([trajectory, np.tile(trajectory[-1, :], [self.post_traj_steps, 1])])
            velocity = np.vstack([velocity, np.zeros(shape=(self.post_traj_steps, self.mp.num_dof))])

        return trajectory, velocity

    def set_mp_action_space(self):
    def get_mp_action_space(self):
        """This function can be used to set up an individual space for the parameters of the mp."""
        min_action_bounds, max_action_bounds = self.mp.get_param_bounds()
        mp_action_space = gym.spaces.Box(low=min_action_bounds.numpy(), high=max_action_bounds.numpy(),
                                         dtype=np.float32)
        return mp_action_space

    def set_action_space(self):
    def get_action_space(self):
        """
        This function can be used to modify the action space for considering actions which are not learned via motion
        primitives. E.g. ball releasing time for the beer pong task. By default, it is the parameter space of the
        motion primitive.
        Only needs to be overwritten if the action space needs to be modified.
        """
        try:
            return self.mp_action_space
        except AttributeError:
            return self.get_mp_action_space()
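To illustrate the `get_action_space` hook just described, here is a hedged sketch of a task wrapper that appends one extra, non-MP action dimension (e.g. a ball-release signal). It mirrors the commented-out `set_action_space` in the beer pong wrapper earlier in this commit; the subclass and its bounds are illustrative, not part of the commit.

```python
# Sketch only: a hypothetical subclass appending one extra (non-MP) action dimension.
class ReleaseTimeWrapper(EpisodicWrapper):
    def get_action_space(self):
        mp_space = self.get_mp_action_space()
        low = np.concatenate((mp_space.low, [-1.0]))    # extra dim, e.g. release signal
        high = np.concatenate((mp_space.high, [1.0]))
        self.action_space = gym.spaces.Box(low=low, high=high, dtype=np.float32)
        return self.action_space
```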
    def _episode_callback(self, action: np.ndarray) -> Tuple[np.ndarray, Union[np.ndarray, None]]:
        """
@@ -111,7 +123,8 @@ class EpisodicWrapper(gym.Env, ABC):
        """
        return action, None

    def _step_callback(self, t: int, env_spec_params: Union[np.ndarray, None], step_action: np.ndarray) -> Union[np.ndarray]:
    def _step_callback(self, t: int, env_spec_params: Union[np.ndarray, None], step_action: np.ndarray) -> Union[
        np.ndarray]:
        """
        This function can be used to modify the step_action with additional parameters e.g. releasing the ball in the
        Beerpong env. The parameters used should not be part of the motion primitive parameters.
@@ -169,8 +182,12 @@ class EpisodicWrapper(gym.Env, ABC):
        mp_params, env_spec_params = self._episode_callback(action)
        trajectory, velocity = self.get_trajectory(mp_params)

        # TODO
        # self.time_steps = np.linspace(0, learned_duration, self.traj_steps)
        # self.mp.set_mp_times(self.time_steps)

        trajectory_length = len(trajectory)
        if self.verbose >=2 :
        if self.verbose >= 2:
            actions = np.zeros(shape=(trajectory_length,) + self.env.action_space.shape)
            observations = np.zeros(shape=(trajectory_length,) + self.env.observation_space.shape,
                                    dtype=self.env.observation_space.dtype)
@@ -197,7 +214,7 @@ class EpisodicWrapper(gym.Env, ABC):
                # infos['step_infos'].append(info)
            if self.render_mode:
                self.render(mode=self.render_mode, **self.render_kwargs)
            if done:
            if done or do_replanning(kwargs):
                break
        infos.update({k: v[:t + 1] for k, v in infos.items()})
        if self.verbose >= 2:
@@ -235,10 +252,10 @@ class EpisodicWrapper(gym.Env, ABC):
        for i in range(des_trajs.shape[1]):
            plt.figure(pos_fig.number)
            plt.subplot(des_trajs.shape[1], 1, i + 1)
            plt.plot(np.ones(des_trajs.shape[0])*self.current_pos[i])
            plt.plot(np.ones(des_trajs.shape[0]) * self.current_pos[i])
            plt.plot(des_trajs[:, i])

            plt.figure(vel_fig.number)
            plt.subplot(des_vels.shape[1], 1, i + 1)
            plt.plot(np.ones(des_trajs.shape[0])*self.current_vel[i])
            plt.plot(np.ones(des_trajs.shape[0]) * self.current_vel[i])
            plt.plot(des_vels[:, i])
@@ -20,12 +20,13 @@ def make_dmc(
        environment_kwargs: dict = {},
        time_limit: Union[None, float] = None,
        channels_first: bool = True
        ):
):
    # Adopted from: https://github.com/denisyarats/dmc2gym/blob/master/dmc2gym/__init__.py
    # License: MIT
    # Copyright (c) 2020 Denis Yarats

    assert re.match(r"\w+-\w+", id), "env_id does not have the following structure: 'domain_name-task_name'"
    if not re.match(r"\w+-\w+", id):
        raise ValueError("env_id does not have the following structure: 'domain_name-task_name'")
    domain_name, task_name = id.split("-")

    env_id = f'dmc_{domain_name}_{task_name}_{seed}-v1'
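For reference, a minimal call that satisfies the 'domain_name-task_name' check above; the id is a standard DMC task and the keyword values are illustrative:

```python
# Sketch: the id must match 'domain_name-task_name', e.g. DMC's ball_in_cup/catch.
env = make_dmc("ball_in_cup-catch", seed=1, time_limit=None, channels_first=True)
obs = env.reset()
```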
@@ -55,7 +55,7 @@ def make(env_id: str, seed, **kwargs):
    Returns: Gym environment

    """
    if any([det_pmp in env_id for det_pmp in ["DetPMP", "detpmp"]]):
    if any(deprec in env_id for deprec in ["DetPMP", "detpmp"]):
        warnings.warn("DetPMP is deprecated and converted to ProMP")
        env_id = env_id.replace("DetPMP", "ProMP")
        env_id = env_id.replace("detpmp", "promp")
@@ -92,14 +92,17 @@ def make(env_id: str, seed, **kwargs):
        from alr_envs import make_dmc
        env = make_dmc(env_id, seed=seed, **kwargs)

        assert env.base_step_limit == env.spec.max_episode_steps, \
            f"The specified 'episode_length' of {env.spec.max_episode_steps} steps for gym is different from " \
            f"the DMC environment specification of {env.base_step_limit} steps."
        if not env.base_step_limit == env.spec.max_episode_steps:
            raise ValueError(f"The specified 'episode_length' of {env.spec.max_episode_steps} steps for gym "
                             f"is different from the DMC environment specification of {env.base_step_limit} steps.")

    return env


def _make_wrapped_env(env_id: str, wrappers: Iterable[Type[gym.Wrapper]], mp: MPInterface, controller: BaseController,
                      ep_wrapper_kwargs: Mapping, seed=1, **kwargs):

def _make_wrapped_env(
        env_id: str, wrappers: Iterable[Type[gym.Wrapper]], mp: MPInterface, controller: BaseController,
        ep_wrapper_kwargs: Mapping, seed=1, **kwargs
):
    """
    Helper function for creating a wrapped gym environment using MPs.
    It adds all provided wrappers to the specified environment and verifies at least one MPEnvWrapper is
@@ -123,10 +126,11 @@ def _make_wrapped_env(env_id: str, wrappers: Iterable[Type[gym.Wrapper]], mp: MP
        else:  # if EpisodicWrapper, use specific constructor
            has_episodic_wrapper = True
            _env = w(env=_env, mp=mp, controller=controller, **ep_wrapper_kwargs)
    assert has_episodic_wrapper, \
        "At least one MPEnvWrapper is required in order to leverage motion primitive environments."
    if not has_episodic_wrapper:
        raise ValueError("An EpisodicWrapper is required in order to leverage movement primitive environments.")
    return _env


def make_mp_from_kwargs(
        env_id: str, wrappers: Iterable, ep_wrapper_kwargs: MutableMapping, mp_kwargs: MutableMapping,
        controller_kwargs: MutableMapping, phase_kwargs: MutableMapping, basis_kwargs: MutableMapping, seed=1,
@@ -152,7 +156,7 @@ def make_mp_from_kwargs(
    _verify_time_limit(mp_kwargs.get("duration", None), kwargs.get("time_limit", None))
    dummy_env = make(env_id, seed)
    if ep_wrapper_kwargs.get('duration', None) is None:
        ep_wrapper_kwargs['duration'] = dummy_env.spec.max_episode_steps*dummy_env.dt
        ep_wrapper_kwargs['duration'] = dummy_env.spec.max_episode_steps * dummy_env.dt
    if phase_kwargs.get('tau', None) is None:
        phase_kwargs['tau'] = ep_wrapper_kwargs['duration']
    mp_kwargs['action_dim'] = mp_kwargs.get('action_dim', np.prod(dummy_env.action_space.shape).item())
@@ -211,6 +215,7 @@ def make_mp_env_helper(**kwargs):
                               mp_kwargs=mp_kwargs, controller_kwargs=contr_kwargs, phase_kwargs=phase_kwargs,
                               basis_kwargs=basis_kwargs, **kwargs, seed=seed)


def make_dmp_env(env_id: str, wrappers: Iterable, seed=1, mp_kwargs={}, **kwargs):
    """
    This can also be used standalone for manually building a custom DMP environment.
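To make that last docstring concrete, a hedged sketch of such a standalone call. The environment id and the `mp_kwargs` contents are illustrative assumptions; `NewMPWrapper` refers to the wrapper defined earlier in this commit.

```python
# Illustrative only: id and kwargs are assumptions, not values from this commit.
env = make_dmp_env("ALRBeerPong-v0", wrappers=[NewMPWrapper], seed=1,
                   mp_kwargs={"duration": 2.0})
obs = env.reset()
```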