From f5f12c846fed80634bfdf7aa5fbed720914a33a4 Mon Sep 17 00:00:00 2001
From: Maximilian Huettenrauch
Date: Fri, 21 May 2021 15:44:49 +0200
Subject: [PATCH] updates on mp wrappers and some bugfixes

---
 alr_envs/__init__.py                          | 54 ++++++++++++++++---
 alr_envs/classic_control/hole_reacher.py      |  6 +--
 alr_envs/classic_control/simple_reacher.py    |  4 +-
 alr_envs/classic_control/viapoint_reacher.py  |  4 +-
 alr_envs/mujoco/alr_mujoco_env.py             | 17 +++---
 .../mujoco/ball_in_a_cup/ball_in_a_cup.py     | 17 +++---
 alr_envs/mujoco/beerpong/beerpong.py          |  5 ++
 alr_envs/utils/mp_env_async_sampler.py        | 14 ++++-
 alr_envs/utils/mps/detpmp_wrapper.py          | 15 +++---
 alr_envs/utils/mps/dmp_wrapper.py             | 10 ++--
 alr_envs/utils/mps/mp_environments.py         |  2 +-
 alr_envs/utils/mps/mp_wrapper.py              | 11 ++--
 example.py                                    | 40 +++++++++++---
 13 files changed, 145 insertions(+), 54 deletions(-)

diff --git a/alr_envs/__init__.py b/alr_envs/__init__.py
index fa73b7a..2a8054c 100644
--- a/alr_envs/__init__.py
+++ b/alr_envs/__init__.py
@@ -272,12 +272,20 @@ for v in versions:
         }
     )
 
-# register(
-#     id='HoleReacherDetPMP-v0',
-#     entry_point='alr_envs.classic_control.hole_reacher:holereacher_detpmp',
-#     # max_episode_steps=1,
-#     # TODO: add mp kwargs
-# )
+    register(
+        id=f'HoleReacherDetPMP-{v}',
+        entry_point='alr_envs.utils.make_env_helpers:make_detpmp_env',
+        kwargs={
+            "name": f"alr_envs:HoleReacher-{v}",
+            "num_dof": 5,
+            "num_basis": 5,
+            "duration": 2,
+            "width": 0.01,
+            "policy_type": "velocity",
+            "weights_scale": 0.2,
+            "zero_start": True
+        }
+    )
 
 # TODO: properly add final_pos
 register(
@@ -335,6 +343,40 @@ register(
     }
 )
 
+register(
+    id='ALRBallInACupSimpleDetPMP-v0',
+    entry_point='alr_envs.utils.make_env_helpers:make_detpmp_env',
+    kwargs={
+        "name": "alr_envs:ALRBallInACupSimple-v0",
+        "num_dof": 3,
+        "num_basis": 5,
+        "duration": 3.5,
+        "post_traj_time": 4.5,
+        "width": 0.005,
+        "policy_type": "motor",
+        "weights_scale": 0.2,
+        "zero_start": True,
+        "zero_goal": True
+    }
+)
+
+register(
+    id='ALRBallInACupDetPMP-v0',
+    entry_point='alr_envs.utils.make_env_helpers:make_detpmp_env',
+    kwargs={
+        "name": "alr_envs:ALRBallInACupSimple-v0",
+        "num_dof": 7,
+        "num_basis": 5,
+        "duration": 3.5,
+        "post_traj_time": 4.5,
+        "width": 0.0035,
+        "policy_type": "motor",
+        "weights_scale": 0.2,
+        "zero_start": True,
+        "zero_goal": True
+    }
+)
+
 register(
     id='ALRBallInACupGoalDMP-v0',
     entry_point='alr_envs.utils.make_env_helpers:make_contextual_env',
diff --git a/alr_envs/classic_control/hole_reacher.py b/alr_envs/classic_control/hole_reacher.py
index a4f1f33..1880ce3 100644
--- a/alr_envs/classic_control/hole_reacher.py
+++ b/alr_envs/classic_control/hole_reacher.py
@@ -7,10 +7,10 @@ from gym.utils import seeding
 from matplotlib import patches
 
 from alr_envs.classic_control.utils import check_self_collision
-from alr_envs.utils.mps.mp_environments import MPEnv
+from alr_envs.utils.mps.mp_environments import AlrEnv
 
 
-class HoleReacherEnv(MPEnv):
+class HoleReacherEnv(AlrEnv):
 
     def __init__(self, n_links: int, hole_x: Union[None, float] = None, hole_depth: Union[None, float] = None,
                  hole_width: float = 1., random_start: bool = False, allow_self_collision: bool = False,
@@ -71,11 +71,11 @@ class HoleReacherEnv(MPEnv):
         """
         A single step with an action in joint velocity space
         """
+        acc = (action - self._angle_velocity) / self.dt
         self._angle_velocity = action
         self._joint_angles = self._joint_angles + self.dt * self._angle_velocity
         self._update_joints()
 
-        acc = (action - self._angle_velocity) / self.dt
         reward, info = self._get_reward(acc)
 
         info.update({"is_collided": self._is_collided})
diff --git a/alr_envs/classic_control/simple_reacher.py b/alr_envs/classic_control/simple_reacher.py
index 425134d..5b04aad 100644
--- a/alr_envs/classic_control/simple_reacher.py
+++ b/alr_envs/classic_control/simple_reacher.py
@@ -5,10 +5,10 @@ import numpy as np
 from gym import spaces
 from gym.utils import seeding
 
-from alr_envs.utils.mps.mp_environments import MPEnv
+from alr_envs.utils.mps.mp_environments import AlrEnv
 
 
-class SimpleReacherEnv(MPEnv):
+class SimpleReacherEnv(AlrEnv):
     """
     Simple Reaching Task without any physics simulation.
     Returns no reward until 150 time steps. This allows the agent to explore the space, but requires precise actions
diff --git a/alr_envs/classic_control/viapoint_reacher.py b/alr_envs/classic_control/viapoint_reacher.py
index 2897f31..8edcc69 100644
--- a/alr_envs/classic_control/viapoint_reacher.py
+++ b/alr_envs/classic_control/viapoint_reacher.py
@@ -6,10 +6,10 @@ import numpy as np
 from gym.utils import seeding
 
 from alr_envs.classic_control.utils import check_self_collision
-from alr_envs.utils.mps.mp_environments import MPEnv
+from alr_envs.utils.mps.mp_environments import AlrEnv
 
 
-class ViaPointReacher(MPEnv):
+class ViaPointReacher(AlrEnv):
 
     def __init__(self, n_links, random_start: bool = True, via_target: Union[None, Iterable] = None,
                  target: Union[None, Iterable] = None, allow_self_collision=False, collision_penalty=1000):
diff --git a/alr_envs/mujoco/alr_mujoco_env.py b/alr_envs/mujoco/alr_mujoco_env.py
index edc90b6..8dbdabb 100644
--- a/alr_envs/mujoco/alr_mujoco_env.py
+++ b/alr_envs/mujoco/alr_mujoco_env.py
@@ -1,5 +1,6 @@
 from collections import OrderedDict
 import os
+from abc import abstractmethod
 
 from gym import error, spaces
 
@@ -142,18 +143,20 @@ class AlrMujocoEnv(gym.Env):
     # methods to override:
     # ----------------------------
 
+    @property
+    @abstractmethod
+    def active_obs(self):
+        """Returns boolean mask for each observation entry
+        whether the observation is returned for the contextual case or not.
+        This effectively allows to filter unwanted or unnecessary observations from the full step-based case.
+        """
+        return np.ones(self.observation_space.shape, dtype=bool)
+
     def _get_obs(self):
         """Returns the observation.
         """
         raise NotImplementedError()
 
-    def configure(self, *args, **kwargs):
-        """
-        Helper method to set certain environment properties such as contexts in contextual environments since reset()
-        doesn't take arguments. Should be called before reset().
-        """
-        pass
-
     def reset_model(self):
         """
         Reset the robot degrees of freedom (qpos and qvel).
diff --git a/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup.py b/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup.py
index 66461c0..613b631 100644
--- a/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup.py
+++ b/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup.py
@@ -22,7 +22,7 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
         self.j_min = np.array([-2.6, -1.985, -2.8, -0.9, -4.55, -1.5707, -2.7])
         self.j_max = np.array([2.6, 1.985, 2.8, 3.14159, 1.25, 1.5707, 2.7])
 
-        self.context = None
+        self.context = context
 
         utils.EzPickle.__init__(self)
         alr_mujoco_env.AlrMujocoEnv.__init__(self,
@@ -45,7 +45,6 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
         else:
             raise ValueError("Unknown reward type")
         self.reward_function = reward_function(self.sim_steps)
-        self.configure(context)
 
     @property
     def start_pos(self):
@@ -69,10 +68,6 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
     def current_vel(self):
         return self.sim.data.qvel[0:7].copy()
 
-    def configure(self, context):
-        self.context = context
-        self.reward_function.reset(context)
-
     def reset_model(self):
         init_pos_all = self.init_qpos.copy()
         init_pos_robot = self._start_pos
@@ -129,6 +124,16 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
             [self._steps],
         ])
 
+    # TODO
+    @property
+    def active_obs(self):
+        return np.hstack([
+            [False] * 7,  # cos
+            [False] * 7,  # sin
+            # [True] * 2,  # x-y coordinates of target distance
+            [False]  # env steps
+        ])
+
     # These functions are for the task with 3 joint actuations
     def extend_des_pos(self, des_pos):
         des_pos_full = self._start_pos.copy()
diff --git a/alr_envs/mujoco/beerpong/beerpong.py b/alr_envs/mujoco/beerpong/beerpong.py
index dc6278d..5efc431 100644
--- a/alr_envs/mujoco/beerpong/beerpong.py
+++ b/alr_envs/mujoco/beerpong/beerpong.py
@@ -105,6 +105,7 @@ class ALRBeerpongEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
     def check_traj_in_joint_limits(self):
         return any(self.current_pos > self.j_max) or any(self.current_pos < self.j_min)
 
+    # TODO
     def _get_obs(self):
         theta = self.sim.data.qpos.flat[:7]
         return np.concatenate([
@@ -114,6 +115,10 @@ class ALRBeerpongEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
             [self._steps],
         ])
 
+    # TODO
+    def active_obs(self):
+        pass
+
 
 if __name__ == "__main__":
 
diff --git a/alr_envs/utils/mp_env_async_sampler.py b/alr_envs/utils/mp_env_async_sampler.py
index e935ba6..5cda41f 100644
--- a/alr_envs/utils/mp_env_async_sampler.py
+++ b/alr_envs/utils/mp_env_async_sampler.py
@@ -40,6 +40,16 @@ def _flatten_list(l):
     return [l__ for l_ in l for l__ in l_]
 
 
+class DummyDist:
+    def __init__(self, dim):
+        self.dim = dim
+
+    def sample(self, contexts):
+        contexts = np.atleast_2d(contexts)
+        n_samples = contexts.shape[0]
+        return np.random.normal(size=(n_samples, self.dim)), contexts
+
+
 class AlrMpEnvSampler:
     """
     An asynchronous sampler for non contextual MPWrapper environments.
     A sampler object can be called with a set of
@@ -100,9 +110,9 @@ class AlrContextualMpEnvSampler:
 
 
 if __name__ == "__main__":
-    env_name = "alr_envs:ALRBallInACupSimpleDMP-v0"
+    env_name = "alr_envs:HoleReacherDetPMP-v1"
     n_cpu = 8
-    dim = 15
+    dim = 25
     n_samples = 10
 
     sampler = AlrMpEnvSampler(env_name, num_envs=n_cpu)
diff --git a/alr_envs/utils/mps/detpmp_wrapper.py b/alr_envs/utils/mps/detpmp_wrapper.py
index 3f6d1ee..98060e2 100644
--- a/alr_envs/utils/mps/detpmp_wrapper.py
+++ b/alr_envs/utils/mps/detpmp_wrapper.py
@@ -2,12 +2,12 @@ import gym
 import numpy as np
 from mp_lib import det_promp
 
-from alr_envs.utils.mps.mp_environments import MPEnv
+from alr_envs.utils.mps.mp_environments import AlrEnv
 from alr_envs.utils.mps.mp_wrapper import MPWrapper
 
 
 class DetPMPWrapper(MPWrapper):
-    def __init__(self, env: MPEnv, num_dof: int, num_basis: int, width: int, duration: int = 1, dt: float = 0.01,
+    def __init__(self, env: AlrEnv, num_dof: int, num_basis: int, width: float, duration: float = 1, dt: float = 0.01,
                  post_traj_time: float = 0., policy_type: str = None, weights_scale: float = 1.,
                  zero_start: bool = False, zero_goal: bool = False, **mp_kwargs):
         self.duration = duration  # seconds
@@ -15,15 +15,16 @@ class DetPMPWrapper(MPWrapper):
         super().__init__(env, num_dof, dt, duration, post_traj_time, policy_type, weights_scale,
                          num_basis=num_basis, width=width, zero_start=zero_start, zero_goal=zero_goal, **mp_kwargs)
 
-        self.dt = dt
+        self.dt = env.dt if hasattr(env, "dt") else dt
+        assert self.dt is not None
 
         action_bounds = np.inf * np.ones((self.mp.n_basis * self.mp.n_dof))
         self.action_space = gym.spaces.Box(low=-action_bounds, high=action_bounds, dtype=np.float32)
 
     def initialize_mp(self, num_dof: int, duration: int, dt: float, num_basis: int = 5, width: float = None,
-                      zero_start: bool = False, zero_goal: bool = False):
-        pmp = det_promp.DeterministicProMP(n_basis=num_basis, n_dof=num_dof, width=width, off=0.01,
+                      off: float = 0.01, zero_start: bool = False, zero_goal: bool = False):
+        pmp = det_promp.DeterministicProMP(n_basis=num_basis, n_dof=num_dof, width=width, off=off,
                                            zero_start=zero_start, zero_goal=zero_goal)
 
         weights = np.zeros(shape=(num_basis, num_dof))
@@ -32,10 +33,10 @@ class DetPMPWrapper(MPWrapper):
         return pmp
 
     def mp_rollout(self, action):
-        params = np.reshape(action, (self.mp.n_basis, self.mp.n_dof)) * self.weights_scale
+        params = np.reshape(action, newshape=(self.mp.n_basis, self.mp.n_dof)) * self.weights_scale
         self.mp.set_weights(self.duration, params)
         _, des_pos, des_vel, _ = self.mp.compute_trajectory(1 / self.dt, 1.)
         if self.mp.zero_start:
-            des_pos += self.start_pos[None, :]
+            des_pos += self.env.start_pos[None, :]
 
         return des_pos, des_vel
diff --git a/alr_envs/utils/mps/dmp_wrapper.py b/alr_envs/utils/mps/dmp_wrapper.py
index 6bc5bc6..4958a98 100644
--- a/alr_envs/utils/mps/dmp_wrapper.py
+++ b/alr_envs/utils/mps/dmp_wrapper.py
@@ -4,13 +4,13 @@ from mp_lib import dmps
 from mp_lib.basis import DMPBasisGenerator
 from mp_lib.phase import ExpDecayPhaseGenerator
 
-from alr_envs.utils.mps.mp_environments import MPEnv
+from alr_envs.utils.mps.mp_environments import AlrEnv
 from alr_envs.utils.mps.mp_wrapper import MPWrapper
 
 
 class DmpWrapper(MPWrapper):
 
-    def __init__(self, env: MPEnv, num_dof: int, num_basis: int,
+    def __init__(self, env: AlrEnv, num_dof: int, num_basis: int,
                  duration: int = 1, alpha_phase: float = 2., dt: float = None,
                  learn_goal: bool = False, post_traj_time: float = 0.,
                  weights_scale: float = 1., goal_scale: float = 1., bandwidth_factor: float = 3.,
@@ -40,7 +40,7 @@ class DmpWrapper(MPWrapper):
         super().__init__(env, num_dof, dt, duration, post_traj_time, policy_type, weights_scale, render_mode,
                          num_basis=num_basis, alpha_phase=alpha_phase, bandwidth_factor=bandwidth_factor)
 
-        action_bounds = np.inf * np.ones((np.prod(self.mp.dmp_weights.shape) + (num_dof if learn_goal else 0)))
+        action_bounds = np.inf * np.ones((np.prod(self.mp.weights.shape) + (num_dof if learn_goal else 0)))
         self.action_space = gym.spaces.Box(low=-action_bounds, high=action_bounds, dtype=np.float32)
 
     def initialize_mp(self, num_dof: int, duration: int, dt: float, num_basis: int = 5, alpha_phase: float = 2.,
@@ -51,7 +51,7 @@ class DmpWrapper(MPWrapper):
                                            basis_bandwidth_factor=bandwidth_factor)
 
         dmp = dmps.DMP(num_dof=num_dof, basis_generator=basis_generator, phase_generator=phase_generator,
-                       num_time_steps=int(duration / dt), dt=dt)
+                       duration=duration, dt=dt)
 
         return dmp
 
@@ -66,7 +66,7 @@ class DmpWrapper(MPWrapper):
             goal_pos = self.env.goal_pos
             assert goal_pos is not None
 
-        weight_matrix = np.reshape(params, self.mp.dmp_weights.shape)  # [num_basis, num_dof]
+        weight_matrix = np.reshape(params, self.mp.weights.shape)  # [num_basis, num_dof]
         return goal_pos * self.goal_scale, weight_matrix * self.weights_scale
 
     def mp_rollout(self, action):
diff --git a/alr_envs/utils/mps/mp_environments.py b/alr_envs/utils/mps/mp_environments.py
index f397491..60db99b 100644
--- a/alr_envs/utils/mps/mp_environments.py
+++ b/alr_envs/utils/mps/mp_environments.py
@@ -5,7 +5,7 @@ import gym
 import numpy as np
 
 
-class MPEnv(gym.Env):
+class AlrEnv(gym.Env):
 
     @property
     @abstractmethod
diff --git a/alr_envs/utils/mps/mp_wrapper.py b/alr_envs/utils/mps/mp_wrapper.py
index 4c92806..c31072f 100644
--- a/alr_envs/utils/mps/mp_wrapper.py
+++ b/alr_envs/utils/mps/mp_wrapper.py
@@ -3,13 +3,13 @@ from abc import ABC, abstractmethod
 import gym
 import numpy as np
 
-from alr_envs.utils.mps.mp_environments import MPEnv
+from alr_envs.utils.mps.mp_environments import AlrEnv
 from alr_envs.utils.policies import get_policy_class
 
 
 class MPWrapper(gym.Wrapper, ABC):
 
-    def __init__(self, env: MPEnv, num_dof: int, dt: float, duration: int = 1, post_traj_time: float = 0.,
+    def __init__(self, env: AlrEnv, num_dof: int, dt: float, duration: float = 1, post_traj_time: float = 0.,
                  policy_type: str = None, weights_scale: float = 1., render_mode: str = None, **mp_kwargs):
         super().__init__(env)
 
@@ -53,9 +53,6 @@ class MPWrapper(gym.Wrapper, ABC):
 
         return obs, np.array(rewards), dones, infos
 
-    def configure(self, context):
-        self.env.configure(context)
-
     def reset(self):
         return self.env.reset()[self.env.active_obs]
 
@@ -65,7 +62,7 @@ class MPWrapper(gym.Wrapper, ABC):
 
         if self.post_traj_steps > 0:
             trajectory = np.vstack([trajectory, np.tile(trajectory[-1, :], [self.post_traj_steps, 1])])
-            velocity = np.vstack([velocity, np.zeros(shape=(self.post_traj_steps, self.mp.num_dimensions))])
+            velocity = np.vstack([velocity, np.zeros(shape=(self.post_traj_steps, self.mp.n_dof))])
 
         # self._trajectory = trajectory
         # self._velocity = velocity
@@ -105,7 +102,7 @@ class MPWrapper(gym.Wrapper, ABC):
         raise NotImplementedError()
 
     @abstractmethod
-    def initialize_mp(self, num_dof: int, duration: int, dt: float, **kwargs):
+    def initialize_mp(self, num_dof: int, duration: float, dt: float, **kwargs):
        """
         Create respective instance of MP
         Returns:
diff --git a/example.py b/example.py
index 166f38f..af30138 100644
--- a/example.py
+++ b/example.py
@@ -1,6 +1,7 @@
 from collections import defaultdict
 
 import gym
 import numpy as np
+from alr_envs.utils.mp_env_async_sampler import AlrMpEnvSampler, AlrContextualMpEnvSampler, DummyDist
 
 
 def example_mujoco():
@@ -22,9 +23,9 @@ def example_mujoco():
         obs = env.reset()
 
 
-def example_dmp():
+def example_mp(env_name="alr_envs:HoleReacherDMP-v0"):
     # env = gym.make("alr_envs:ViaPointReacherDMP-v0")
-    env = gym.make("alr_envs:HoleReacherDMP-v0")
+    env = gym.make(env_name)
     rewards = 0
     # env.render(mode=None)
     obs = env.reset()
@@ -79,9 +80,36 @@ def example_async(env_id="alr_envs:HoleReacherDMP-v0", n_cpu=4, seed=int('533D',
 
     print(sample(envs, 16))
 
 
+def example_async_sampler(env_name="alr_envs:HoleReacherDetPMP-v1", n_cpu=4):
+    n_samples = 10
+
+    sampler = AlrMpEnvSampler(env_name, num_envs=n_cpu)
+    dim = sampler.env.action_space.spaces[0].shape[0]
+
+    thetas = np.random.randn(n_samples, dim)  # usually form a search distribution
+
+    _, rewards, __, ___ = sampler(thetas)
+
+    print(rewards)
+
+
+def example_async_contextual_sampler(env_name="alr_envs:SimpleReacherDMP-v1", n_cpu=4):
+    sampler = AlrContextualMpEnvSampler(env_name, num_envs=n_cpu)
+    dim = sampler.env.action_space.spaces[0].shape[0]
+    dist = DummyDist(dim)  # needs a sample function
+
+    n_samples = 10
+    new_samples, new_contexts, obs, new_rewards, done, infos = sampler(dist, n_samples)
+
+    print(new_rewards)
+
+
 if __name__ == '__main__':
     # example_mujoco()
-    # example_dmp()
-    example_async("alr_envs:LongSimpleReacherDMP-v0", 4)
-    # env = gym.make("alr_envs:HoleReacherDMP-v0", context=0.1)
-    # env = gym.make("alr_envs:HoleReacherDMP-v1")
+    # example_dmp("alr_envs:SimpleReacherDMP-v1")
+    # example_async("alr_envs:LongSimpleReacherDMP-v0", 4)
+    # example_async_contextual_sampler()
+    # env = gym.make("alr_envs:HoleReacherDetPMP-v1")
+    env_name = "alr_envs:ALRBallInACupSimpleDetPMP-v0"
+    # example_async_sampler(env_name)
+    example_mp(env_name)
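
For reference, a minimal usage sketch (not part of the patch) of one of the DetPMP environment ids registered above, mirroring the updated `example_mp` helper in `example.py`. It assumes `alr_envs` is installed so the id resolves via `gym.make`; the reward returned by the wrapper may be an array rather than a scalar.

```python
import gym

# Sketch: roll out a newly registered DetPMP environment.
# The MP wrapper treats one action as a full set of ProMP weights
# (num_dof * num_basis values) and executes the resulting trajectory.
env = gym.make("alr_envs:HoleReacherDetPMP-v1")

obs = env.reset()
weights = env.action_space.sample()  # random weights; usually drawn from a search distribution
obs, reward, done, info = env.step(weights)

print(reward)
```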