Merge pull request #2 from 1nf0rmagician/positional_env

Separate required interface for the PD controller and update it
maxhuettenrauch 2021-06-22 10:26:38 +02:00 committed by GitHub
commit 362d2cf24f
12 changed files with 169 additions and 86 deletions

View File

@@ -1,3 +1,4 @@
+import numpy as np
 from gym.envs.registration import register
 from alr_envs.stochastic_search.functions.f_rosenbrock import Rosenbrock
@@ -321,7 +322,9 @@ register(
         "bandwidth_factor": 2.5,
         "policy_type": "motor",
         "weights_scale": 100,
-        "return_to_start": True
+        "return_to_start": True,
+        "p_gains": np.array([4. / 3., 2.4, 2.5, 5. / 3., 2., 2., 1.25]),
+        "d_gains": np.array([0.0466, 0.12, 0.125, 0.04166, 0.06, 0.06, 0.025])
     }
 )
@@ -339,7 +342,9 @@ register(
         "bandwidth_factor": 2.5,
         "policy_type": "motor",
         "weights_scale": 100,
-        "return_to_start": True
+        "return_to_start": True,
+        "p_gains": np.array([4. / 3., 2.4, 2.5, 5. / 3., 2., 2., 1.25]),
+        "d_gains": np.array([0.0466, 0.12, 0.125, 0.04166, 0.06, 0.06, 0.025])
     }
 )
@@ -357,7 +362,9 @@ register(
         "policy_type": "motor",
         "weights_scale": 0.2,
         "zero_start": True,
-        "zero_goal": True
+        "zero_goal": True,
+        "p_gains": np.array([4./3., 2.4, 2.5, 5./3., 2., 2., 1.25]),
+        "d_gains": np.array([0.0466, 0.12, 0.125, 0.04166, 0.06, 0.06, 0.025])
     }
 )
@@ -374,7 +381,9 @@ register(
         "policy_type": "motor",
         "weights_scale": 0.2,
         "zero_start": True,
-        "zero_goal": True
+        "zero_goal": True,
+        "p_gains": np.array([4./3., 2.4, 2.5, 5./3., 2., 2., 1.25]),
+        "d_gains": np.array([0.0466, 0.12, 0.125, 0.04166, 0.06, 0.06, 0.025])
     }
 )
@@ -392,7 +401,9 @@ register(
         "bandwidth_factor": 2.5,
         "policy_type": "motor",
         "weights_scale": 50,
-        "goal_scale": 0.1
+        "goal_scale": 0.1,
+        "p_gains": np.array([4. / 3., 2.4, 2.5, 5. / 3., 2., 2., 1.25]),
+        "d_gains": np.array([0.0466, 0.12, 0.125, 0.04166, 0.06, 0.06, 0.025])
     }
 )

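For orientation, here is a sketch of how one full registration looks once the PD gains travel through the kwargs dict instead of being hard-coded in the MuJoCo environment. The environment id and entry point below are placeholders for illustration, not values taken from this diff:

import numpy as np
from gym.envs.registration import register

register(
    id="ExamplePDControlledDMP-v0",                      # placeholder id
    entry_point="my_package.envs:make_example_dmp_env",  # placeholder entry point
    kwargs={
        "policy_type": "motor",   # selects the PD controller in alr_envs.utils.policies
        "weights_scale": 100,
        "p_gains": np.array([4. / 3., 2.4, 2.5, 5. / 3., 2., 2., 1.25]),
        "d_gains": np.array([0.0466, 0.12, 0.125, 0.04166, 0.06, 0.06, 0.025]),
    }
)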
View File

@@ -7,7 +7,7 @@ from gym.utils import seeding
 from matplotlib import patches
 from alr_envs.classic_control.utils import check_self_collision
-from alr_envs.utils.mps.mp_environments import AlrEnv
+from alr_envs.utils.mps.alr_env import AlrEnv


 class HoleReacherEnv(AlrEnv):

View File

@@ -5,7 +5,7 @@ import numpy as np
 from gym import spaces
 from gym.utils import seeding
-from alr_envs.utils.mps.mp_environments import AlrEnv
+from alr_envs.utils.mps.alr_env import AlrEnv


 class SimpleReacherEnv(AlrEnv):

View File

@@ -6,7 +6,7 @@ import numpy as np
 from gym.utils import seeding
 from alr_envs.classic_control.utils import check_self_collision
-from alr_envs.utils.mps.mp_environments import AlrEnv
+from alr_envs.utils.mps.alr_env import AlrEnv


 class ViaPointReacher(AlrEnv):

View File

@@ -7,7 +7,9 @@ from gym import error, spaces
 from gym.utils import seeding
 import numpy as np
 from os import path
+import gym
+from alr_envs.utils.mps.alr_env import AlrEnv
+from alr_envs.utils.positional_env import PositionalEnv

 try:
     import mujoco_py
@@ -33,7 +35,7 @@ def convert_observation_to_space(observation):
     return space


-class AlrMujocoEnv(gym.Env):
+class AlrMujocoEnv(PositionalEnv, AlrEnv):
     """
     Superclass for all MuJoCo environments.
     """
@@ -44,7 +46,7 @@ class AlrMujocoEnv(gym.Env):
         Args:
             model_path: path to xml file
             n_substeps: how many steps mujoco does per call to env.step
-            use_servo: use actuator defined in xml, use False for direct torque control
+            apply_gravity_comp: Whether gravity compensation should be active
         """
         if model_path.startswith("/"):
             fullpath = model_path
@@ -73,10 +75,6 @@ class AlrMujocoEnv(gym.Env):
         self._set_action_space()

-        # action = self.action_space.sample()
-        # observation, _reward, done, _info = self.step(action)
-        # assert not done
         observation = self._get_obs()  # TODO: is calling get_obs enough? should we call reset, or even step?
         self._set_observation_space(observation)
@@ -204,7 +202,7 @@ class AlrMujocoEnv(gym.Env):
         try:
             self.sim.step()
-        except mujoco_py.builder.MujocoException as e:
+        except mujoco_py.builder.MujocoException:
             error_in_sim = True

         return error_in_sim

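Since AlrMujocoEnv now mixes in PositionalEnv, concrete MuJoCo environments are expected to expose their joint state through properties. A rough sketch of what that can look like in a subclass; the class name and the qpos/qvel slicing are illustrative, the actual environments may select different indices:

import numpy as np
from alr_envs.mujoco.alr_mujoco_env import AlrMujocoEnv


class ExampleArmEnv(AlrMujocoEnv):
    @property
    def current_pos(self) -> np.ndarray:
        # joint positions read directly from the MuJoCo simulation state
        return self.sim.data.qpos[:7].copy()

    @property
    def current_vel(self) -> np.ndarray:
        # joint velocities read directly from the MuJoCo simulation state
        return self.sim.data.qvel[:7].copy()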
View File

@@ -16,8 +16,6 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
         self._q_vel = []
         # self.weight_matrix_scale = 50
         self.max_ctrl = np.array([150., 125., 40., 60., 5., 5., 2.])
-        self.p_gains = 1 / self.max_ctrl * np.array([200, 300, 100, 100, 10, 10, 2.5])
-        self.d_gains = 1 / self.max_ctrl * np.array([7, 15, 5, 2.5, 0.3, 0.3, 0.05])

         self.j_min = np.array([-2.6, -1.985, -2.8, -0.9, -4.55, -1.5707, -2.7])
         self.j_max = np.array([2.6, 1.985, 2.8, 3.14159, 1.25, 1.5707, 2.7])

View File

@@ -1,14 +1,13 @@
-from abc import abstractmethod
+from abc import abstractmethod, ABC
 from typing import Union

 import gym
 import numpy as np


-class AlrEnv(gym.Env):
+class AlrEnv(gym.Env, ABC):

     @property
-    @abstractmethod
     def active_obs(self):
         """Returns boolean mask for each observation entry
         whether the observation is returned for the contextual case or not.
@@ -31,3 +30,11 @@ class AlrEnv(gym.Env):
         By default this returns the starting position.
         """
         return self.start_pos
+
+    @property
+    @abstractmethod
+    def dt(self) -> Union[float, int]:
+        """
+        Returns the time between two simulated steps of the environment
+        """
+        raise NotImplementedError()

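With dt now part of the AlrEnv contract, every environment used with the MP wrappers has to report its control timestep. A minimal sketch of a concrete subclass; the dynamics are invented, and any further abstract members of AlrEnv not visible in this hunk would need to be implemented as well:

import gym
import numpy as np
from alr_envs.utils.mps.alr_env import AlrEnv  # module path introduced by this PR


class ToyPointEnv(AlrEnv):
    def __init__(self):
        self.observation_space = gym.spaces.Box(low=-1, high=1, shape=(2,), dtype=np.float64)
        self.action_space = gym.spaces.Box(low=-1, high=1, shape=(2,), dtype=np.float64)
        self._pos = np.zeros(2)

    @property
    def dt(self) -> float:
        # time between two simulated steps, required by the MP wrappers
        return 0.01

    @property
    def start_pos(self):
        return np.zeros(2)

    @property
    def active_obs(self):
        # expose every observation entry in the contextual case
        return np.ones(2, dtype=bool)

    def step(self, action):
        self._pos = np.clip(self._pos + self.dt * np.asarray(action), -1, 1)
        return self._pos.copy(), 0.0, False, {}

    def reset(self):
        self._pos = np.zeros(2)
        return self._pos.copy()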
View File

@@ -2,29 +2,36 @@ import gym
 import numpy as np
 from mp_lib import det_promp

-from alr_envs.utils.mps.mp_environments import AlrEnv
+from alr_envs.utils.mps.alr_env import AlrEnv
 from alr_envs.utils.mps.mp_wrapper import MPWrapper


 class DetPMPWrapper(MPWrapper):
-    def __init__(self, env: AlrEnv, num_dof: int, num_basis: int, width: float, duration: float = 1, dt: float = 0.01,
-                 post_traj_time: float = 0., policy_type: str = None, weights_scale: float = 1.,
-                 zero_start: bool = False, zero_goal: bool = False, **mp_kwargs):
+    def __init__(self, env: AlrEnv, num_dof, num_basis, width, start_pos=None, duration=1, post_traj_time=0.,
+                 policy_type=None, weights_scale=1, zero_start=False, zero_goal=False, learn_mp_length: bool = True,
+                 **mp_kwargs):
         self.duration = duration  # seconds

-        dt = env.dt if hasattr(env, "dt") else dt
-        assert dt is not None
-        self.dt = dt
-
-        super().__init__(env, num_dof, dt, duration, post_traj_time, policy_type, weights_scale, num_basis=num_basis,
-                         width=width, zero_start=zero_start, zero_goal=zero_goal, **mp_kwargs)
-
-        action_bounds = np.inf * np.ones((self.mp.n_basis * self.mp.n_dof))
-        self.action_space = gym.spaces.Box(low=-action_bounds, high=action_bounds, dtype=np.float32)
+        super().__init__(env=env, num_dof=num_dof, duration=duration, post_traj_time=post_traj_time,
+                         policy_type=policy_type, weights_scale=weights_scale, num_basis=num_basis,
+                         width=width, zero_start=zero_start, zero_goal=zero_goal,
+                         **mp_kwargs)
+
+        self.learn_mp_length = learn_mp_length
+        if self.learn_mp_length:
+            parameter_space_shape = (1 + num_basis * num_dof,)
+        else:
+            parameter_space_shape = (num_basis * num_dof,)
+        self.min_param = -np.inf
+        self.max_param = np.inf
+        self.parameterization_space = gym.spaces.Box(low=self.min_param, high=self.max_param,
+                                                     shape=parameter_space_shape, dtype=np.float32)
+
+        self.start_pos = start_pos

-    def initialize_mp(self, num_dof: int, duration: int, dt: float, num_basis: int = 5, width: float = None,
-                      off: float = 0.01, zero_start: bool = False, zero_goal: bool = False):
-        pmp = det_promp.DeterministicProMP(n_basis=num_basis, n_dof=num_dof, width=width, off=off,
+    def initialize_mp(self, num_dof: int, duration: int, num_basis: int = 5, width: float = None,
+                      zero_start: bool = False, zero_goal: bool = False, **kwargs):
+        pmp = det_promp.DeterministicProMP(n_basis=num_basis, n_dof=num_dof, width=width, off=0.01,
                                            zero_start=zero_start, zero_goal=zero_goal)

         weights = np.zeros(shape=(num_basis, num_dof))
@@ -33,10 +40,15 @@ class DetPMPWrapper(MPWrapper):
         return pmp

     def mp_rollout(self, action):
-        params = np.reshape(action, newshape=(self.mp.n_basis, self.mp.n_dof)) * self.weights_scale
-        self.mp.set_weights(self.duration, params)
-        _, des_pos, des_vel, _ = self.mp.compute_trajectory(1 / self.dt, 1.)
+        if self.learn_mp_length:
+            duration = max(1, self.duration * abs(action[0]))
+            params = np.reshape(action[1:], (self.mp.n_basis, -1)) * self.weights_scale  # TODO: Fix Bug when zero_start is true
+        else:
+            duration = self.duration
+            params = np.reshape(action, (self.mp.n_basis, -1)) * self.weights_scale  # TODO: Fix Bug when zero_start is true
+        self.mp.set_weights(1., params)
+        _, des_pos, des_vel, _ = self.mp.compute_trajectory(frequency=max(1, duration))

         if self.mp.zero_start:
-            des_pos += self.env.start_pos[None, :]
+            des_pos += self.start_pos

         return des_pos, des_vel

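The new learn_mp_length flag prepends a duration factor to the flat weight vector, which is what the (1 + num_basis * num_dof,) parameter space above encodes. A small numpy sketch of how mp_rollout splits such an action (shapes only, no mp_lib calls):

import numpy as np

num_dof, num_basis, nominal_duration = 7, 5, 3.5
action = np.random.randn(1 + num_basis * num_dof)

duration = max(1, nominal_duration * abs(action[0]))   # mirrors max(1, self.duration * abs(action[0]))
weights = np.reshape(action[1:], (num_basis, -1))      # one column per degree of freedom
print(duration, weights.shape)                         # e.g. 2.8 (5, 7)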
View File

@@ -4,7 +4,7 @@ from mp_lib import dmps
 from mp_lib.basis import DMPBasisGenerator
 from mp_lib.phase import ExpDecayPhaseGenerator

-from alr_envs.utils.mps.mp_environments import AlrEnv
+from alr_envs.utils.mps.alr_env import AlrEnv
 from alr_envs.utils.mps.mp_wrapper import MPWrapper
@@ -32,26 +32,26 @@ class DmpWrapper(MPWrapper):
             goal_scale:
         """
         self.learn_goal = learn_goal
-        dt = env.dt if hasattr(env, "dt") else dt
-        assert dt is not None
         self.t = np.linspace(0, duration, int(duration / dt))
         self.goal_scale = goal_scale

-        super().__init__(env, num_dof, dt, duration, post_traj_time, policy_type, weights_scale, render_mode,
+        super().__init__(env=env, num_dof=num_dof, duration=duration, post_traj_time=post_traj_time,
+                         policy_type=policy_type, weights_scale=weights_scale, render_mode=render_mode,
                          num_basis=num_basis, alpha_phase=alpha_phase, bandwidth_factor=bandwidth_factor)

         action_bounds = np.inf * np.ones((np.prod(self.mp.weights.shape) + (num_dof if learn_goal else 0)))
         self.action_space = gym.spaces.Box(low=-action_bounds, high=action_bounds, dtype=np.float32)

-    def initialize_mp(self, num_dof: int, duration: int, dt: float, num_basis: int = 5, alpha_phase: float = 2.,
-                      bandwidth_factor: int = 3):
+    def initialize_mp(self, num_dof: int, duration: int, num_basis: int, alpha_phase: float = 2.,
+                      bandwidth_factor: int = 3, **kwargs):
         phase_generator = ExpDecayPhaseGenerator(alpha_phase=alpha_phase, duration=duration)
         basis_generator = DMPBasisGenerator(phase_generator, duration=duration, num_basis=num_basis,
                                             basis_bandwidth_factor=bandwidth_factor)

         dmp = dmps.DMP(num_dof=num_dof, basis_generator=basis_generator, phase_generator=phase_generator,
-                       duration=duration, dt=dt)
+                       dt=self.dt)

         return dmp

View File

@@ -1,16 +1,36 @@
 from abc import ABC, abstractmethod
+from typing import Union

 import gym
 import numpy as np

-from alr_envs.utils.mps.mp_environments import AlrEnv
-from alr_envs.utils.policies import get_policy_class
+from alr_envs.utils.mps.alr_env import AlrEnv
+from alr_envs.utils.policies import get_policy_class, BaseController


 class MPWrapper(gym.Wrapper, ABC):
+    """
+    Base class for movement primitive based gym.Wrapper implementations.
+
+    :param env: The (wrapped) environment this wrapper is applied on
+    :param num_dof: Dimension of the action space of the wrapped env
+    :param duration: Number of timesteps in the trajectory of the movement primitive
+    :param post_traj_time: Time for which the last position of the trajectory is fed to the environment to continue
+                           simulation
+    :param policy_type: Type or object defining the policy that is used to generate action based on the trajectory
+    :param weights_scale: Scaling parameter for the actions given to this wrapper
+    :param render_mode: Equivalent to gym render mode
+    """

-    def __init__(self, env: AlrEnv, num_dof: int, dt: float, duration: float = 1, post_traj_time: float = 0.,
-                 policy_type: str = None, weights_scale: float = 1., render_mode: str = None, **mp_kwargs):
+    def __init__(self,
+                 env: AlrEnv,
+                 num_dof: int,
+                 duration: int = 1,
+                 post_traj_time: float = 0.,
+                 policy_type: Union[str, BaseController] = None,
+                 weights_scale: float = 1.,
+                 render_mode: str = None,
+                 **mp_kwargs
+                 ):
         super().__init__(env)

         # adjust observation space to reduce version
@@ -19,14 +39,15 @@ class MPWrapper(gym.Wrapper, ABC):
                                                 high=obs_sp.high[self.env.active_obs],
                                                 dtype=obs_sp.dtype)

-        assert dt is not None  # this should never happen as MPWrapper is a base class
-        self.post_traj_steps = int(post_traj_time / dt)
+        self.post_traj_steps = int(post_traj_time / env.dt)

-        self.mp = self.initialize_mp(num_dof, duration, dt, **mp_kwargs)
+        self.mp = self.initialize_mp(num_dof=num_dof, duration=duration, **mp_kwargs)
         self.weights_scale = weights_scale

-        policy_class = get_policy_class(policy_type)
-        self.policy = policy_class(env)
+        if type(policy_type) is str:
+            self.policy = get_policy_class(policy_type, env, mp_kwargs)
+        else:
+            self.policy = policy_type

         # rendering
         self.render_mode = render_mode
@@ -62,29 +83,30 @@ class MPWrapper(gym.Wrapper, ABC):
         if self.post_traj_steps > 0:
             trajectory = np.vstack([trajectory, np.tile(trajectory[-1, :], [self.post_traj_steps, 1])])
-            velocity = np.vstack([velocity, np.zeros(shape=(self.post_traj_steps, self.mp.n_dof))])
+            velocity = np.vstack([velocity, np.zeros(shape=(self.post_traj_steps, self.mp.num_dimensions))])

-        # self._trajectory = trajectory
-        # self._velocity = velocity
-
-        rewards = 0
-        info = {}
-        # create random obs as the reset function is called externally
-        obs = self.env.observation_space.sample()
+        trajectory_length = len(trajectory)
+        actions = np.zeros(shape=(trajectory_length, self.mp.num_dimensions))
+        observations = np.zeros(shape=(trajectory_length,) + self.env.observation_space.shape)
+        rewards = np.zeros(shape=(trajectory_length,))
+        trajectory_return = 0
+        infos = dict(step_infos=[])

         for t, pos_vel in enumerate(zip(trajectory, velocity)):
-            ac = self.policy.get_action(pos_vel[0], pos_vel[1])
-            obs, rew, done, info = self.env.step(ac)
-            rewards += rew
-            # TODO return all dicts?
-            # [infos[k].append(v) for k, v in info.items()]
+            actions[t, :] = self.policy.get_action(pos_vel[0], pos_vel[1])
+            observations[t, :], rewards[t], done, info = self.env.step(actions[t, :])
+            trajectory_return += rewards[t]
+            infos['step_infos'].append(info)
             if self.render_mode:
                 self.env.render(mode=self.render_mode, **self.render_kwargs)
             if done:
                 break

+        infos['step_actions'] = actions[:t + 1]
+        infos['step_observations'] = observations[:t + 1]
+        infos['step_rewards'] = rewards[:t + 1]
+        infos['trajectory_length'] = t + 1
         done = True
-        return obs[self.env.active_obs], rewards, done, info
+        return observations[t][self.env.active_obs], trajectory_return, done, infos
@@ -102,7 +124,7 @@ class MPWrapper(gym.Wrapper, ABC):
         raise NotImplementedError()

     @abstractmethod
-    def initialize_mp(self, num_dof: int, duration: float, dt: float, **kwargs):
+    def initialize_mp(self, num_dof: int, duration: float, **kwargs):
         """
         Create respective instance of MP
         Returns:

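With these changes a single call to step consumes one set of MP parameters, rolls out the full trajectory internally and reports the per-step data in the info dict. A usage sketch, assuming an already constructed MPWrapper subclass instance is passed in:

import numpy as np


def rollout_once(wrapped_env, mp_params):
    """Roll out one MP parameter vector and inspect the aggregated step information."""
    wrapped_env.reset()
    obs, trajectory_return, done, infos = wrapped_env.step(np.asarray(mp_params))
    print(trajectory_return)                       # summed reward over the executed trajectory
    print(infos['trajectory_length'])              # number of low-level env steps actually taken
    print(infos['step_rewards'].shape)             # per-step rewards, truncated on early termination
    print(infos['step_actions'].shape, infos['step_observations'].shape)
    return obs, infos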
View File

@@ -1,10 +1,12 @@
+from typing import Tuple, Union
+
 from gym import Env

-from alr_envs.mujoco.alr_mujoco_env import AlrMujocoEnv
+from alr_envs.utils.positional_env import PositionalEnv


 class BaseController:
-    def __init__(self, env: Env):
+    def __init__(self, env: Env, **kwargs):
         self.env = env

     def get_action(self, des_pos, des_vel):
@@ -22,27 +24,38 @@ class VelController(BaseController):

 class PDController(BaseController):
-    def __init__(self, env: AlrMujocoEnv):
-        self.p_gains = env.p_gains
-        self.d_gains = env.d_gains
-        super(PDController, self).__init__(env)
+    """
+    A PD-Controller. Using position and velocity information from a provided positional environment,
+    the controller calculates a response based on the desired position and velocity.
+
+    :param env: A positional environment
+    :param p_gains: Factors for the proportional gains
+    :param d_gains: Factors for the differential gains
+    """
+
+    def __init__(self,
+                 env: PositionalEnv,
+                 p_gains: Union[float, Tuple],
+                 d_gains: Union[float, Tuple]):
+        self.p_gains = p_gains
+        self.d_gains = d_gains
+        super(PDController, self).__init__(env)

     def get_action(self, des_pos, des_vel):
-        # TODO: make standardized ALRenv such that all of them have current_pos/vel attributes
         cur_pos = self.env.current_pos
         cur_vel = self.env.current_vel
-        if len(des_pos) != len(cur_pos):
-            des_pos = self.env.extend_des_pos(des_pos)
-        if len(des_vel) != len(cur_vel):
-            des_vel = self.env.extend_des_vel(des_vel)
+        assert des_pos.shape == cur_pos.shape, \
+            "Mismatch in dimension between desired position {} and current position {}".format(des_pos.shape, cur_pos.shape)
+        assert des_vel.shape == cur_vel.shape, \
+            "Mismatch in dimension between desired velocity {} and current velocity {}".format(des_vel.shape,
+                                                                                               cur_vel.shape)
         trq = self.p_gains * (des_pos - cur_pos) + self.d_gains * (des_vel - cur_vel)
         return trq


-def get_policy_class(policy_type):
+def get_policy_class(policy_type, env, mp_kwargs, **kwargs):
     if policy_type == "motor":
-        return PDController
+        return PDController(env, p_gains=mp_kwargs['p_gains'], d_gains=mp_kwargs['d_gains'])
     elif policy_type == "velocity":
-        return VelController
+        return VelController(env)
     elif policy_type == "position":
-        return PosController
+        return PosController(env)

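The control law itself is unchanged; only the source of the gains moved from the environment into the registration kwargs. Worked through once with the gains used above and made-up joint states:

import numpy as np

p_gains = np.array([4. / 3., 2.4, 2.5, 5. / 3., 2., 2., 1.25])
d_gains = np.array([0.0466, 0.12, 0.125, 0.04166, 0.06, 0.06, 0.025])

des_pos, cur_pos = np.zeros(7), np.full(7, 0.1)    # desired vs. measured joint positions
des_vel, cur_vel = np.zeros(7), np.full(7, -0.05)  # desired vs. measured joint velocities

trq = p_gains * (des_pos - cur_pos) + d_gains * (des_vel - cur_vel)
print(trq)  # element-wise PD response, the same formula as PDController.get_action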
View File

@@ -0,0 +1,22 @@
+from abc import abstractmethod
+from typing import Union, Tuple
+
+import numpy as np
+from gym import Env
+
+
+class PositionalEnv(Env):
+    """A position and velocity based environment. It functions just as any regular OpenAI Gym
+    environment but it provides position, velocity and acceleration information. This usually means that the
+    corresponding information from the agent is forwarded via the properties.
+
+    PD-Controller based policies require this environment to calculate the state dependent actions for example.
+    """
+
+    @property
+    @abstractmethod
+    def current_pos(self) -> Union[float, int, np.ndarray, Tuple]:
+        raise NotImplementedError
+
+    @property
+    @abstractmethod
+    def current_vel(self) -> Union[float, int, np.ndarray, Tuple]:
+        raise NotImplementedError
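
Putting the pieces together, any environment that satisfies this interface can be driven by the PDController directly. A sketch with a toy double integrator; the dynamics and gain values are invented for illustration:

import numpy as np
from gym import spaces
from alr_envs.utils.positional_env import PositionalEnv
from alr_envs.utils.policies import PDController


class ToyMassEnv(PositionalEnv):
    def __init__(self):
        self.action_space = spaces.Box(low=-10, high=10, shape=(2,), dtype=np.float64)
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(4,), dtype=np.float64)
        self._pos = np.zeros(2)
        self._vel = np.zeros(2)

    @property
    def current_pos(self) -> np.ndarray:
        return self._pos.copy()

    @property
    def current_vel(self) -> np.ndarray:
        return self._vel.copy()

    def step(self, action):
        # double integrator with a fixed 10 ms timestep, purely illustrative
        self._vel += 0.01 * np.asarray(action)
        self._pos += 0.01 * self._vel
        obs = np.concatenate([self._pos, self._vel])
        return obs, -float(np.linalg.norm(self._pos)), False, {}

    def reset(self):
        self._pos = np.zeros(2)
        self._vel = np.zeros(2)
        return np.concatenate([self._pos, self._vel])


env = ToyMassEnv()
controller = PDController(env, p_gains=np.array([2.0, 2.0]), d_gains=np.array([0.1, 0.1]))
torque = controller.get_action(np.ones(2), np.zeros(2))
env.step(torque)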