start integrating mp_pytorch lib

Onur 2022-04-28 09:05:28 +02:00
parent 7f64c975cd
commit cd33e82d3c
5 changed files with 350 additions and 0 deletions


@@ -825,3 +825,10 @@ for _v in _versions:
        }
    )
    ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append(_env_id)
# --------------------- Testing new mp wrapper -----------------------------------------------------
# register(
# id='ALRReacherProMP-v0'
# )


@@ -0,0 +1,20 @@
from typing import Union, Tuple

import numpy as np

from mp_wrapper import BaseMPWrapper


class MPWrapper(BaseMPWrapper):

    @property
    def current_pos(self) -> Union[float, int, np.ndarray, Tuple]:
        return self.env.sim.data.qpos[0:7].copy()

    @property
    def current_vel(self) -> Union[float, int, np.ndarray, Tuple]:
        return self.env.sim.data.qvel[0:7].copy()

    def set_active_obs(self):
        return np.hstack([
            [False] * 7,  # cos of joint angles
            [False] * 7,  # sin of joint angles
            [True] * 2,   # xy position of cup
            [False]       # env steps
        ])
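
For context, the boolean mask returned by set_active_obs is applied elementwise to the flat observation in BaseMPWrapper.get_observation_from_step. A minimal sketch of that filtering, using a made-up 17-dimensional observation laid out as above (7 cos, 7 sin, 2 cup xy, 1 step counter):

import numpy as np

obs = np.arange(17, dtype=np.float64)          # placeholder observation
active_obs = np.hstack([[False] * 7, [False] * 7, [True] * 2, [False]])

# Only the active entries survive, here the xy position of the cup (indices 14 and 15).
filtered = obs[active_obs]
print(filtered)  # [14. 15.]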


@@ -0,0 +1,24 @@
from typing import Union, Tuple

import numpy as np

from mp_wrapper import BaseMPWrapper


class MPWrapper(BaseMPWrapper):

    @property
    def current_pos(self) -> Union[float, int, np.ndarray, Tuple]:
        return self.env.sim.data.qpos.flat[:self.env.n_links]

    @property
    def current_vel(self) -> Union[float, int, np.ndarray, Tuple]:
        return self.env.sim.data.qvel.flat[:self.env.n_links]

    def set_active_obs(self):
        return np.concatenate([
            [False] * self.env.n_links,  # cos of joint angles
            [False] * self.env.n_links,  # sin of joint angles
            [True] * 2,                  # goal position
            [False] * self.env.n_links,  # angular velocities
            [False] * 3,                 # goal distance
            # self.get_body_com("target"),  # only return target to make problem harder
            [False],                     # step
        ])

mp_wrapper.py (new file, 170 lines)

@@ -0,0 +1,170 @@
from abc import ABC, abstractmethod
from typing import Union, Tuple

import gym
import numpy as np
from gym import spaces
from gym.envs.mujoco import MujocoEnv

from mp_pytorch.mp.mp_interfaces import MPInterface
from policies import get_policy_class, BaseController


class BaseMPWrapper(gym.Env, ABC):
    """
    Base class for movement primitive based gym environments (wraps a step-based env).

    Args:
        env: The (wrapped) step-based environment this wrapper is applied on
        mp: The movement primitive (mp_pytorch MPInterface) used to generate the trajectory
        duration: Length of the trajectory of the movement primitive in seconds
        policy_type: Type or object defining the policy that is used to generate actions based on the trajectory
        render_mode: Equivalent to gym render mode
        **mp_kwargs: Additional keyword arguments; 'policy_kwargs' is popped and forwarded to the policy constructor
    """

    def __init__(self,
                 env: MujocoEnv,
                 mp: MPInterface,
                 duration: float,
                 policy_type: Union[str, BaseController] = None,
                 render_mode: str = None,
                 **mp_kwargs
                 ):
        super().__init__()

        assert env.dt is not None
        self.env = env
        self.dt = env.dt
        self.duration = duration
        self.traj_steps = int(duration / self.dt)
        self.post_traj_steps = self.env.spec.max_episode_steps - self.traj_steps

        if isinstance(policy_type, str):
            # pop policy kwargs here such that they are not passed to the initialize_mp method
            self.policy = get_policy_class(policy_type, self, **mp_kwargs.pop('policy_kwargs', {}))
        else:
            self.policy = policy_type

        self.mp = mp

        # rendering
        self.render_mode = render_mode
        self.render_kwargs = {}

        self.time_steps = np.linspace(0, self.duration, self.traj_steps + 1)
        self.mp.set_mp_times(self.time_steps)

        # TODO: put action bounds in the mp wrapper (e.g. time bound for trajectory length ...), otherwise learning
        #  the durations might not work
        # action_bounds = np.inf * np.ones((np.prod(self.mp.num_params)))
        min_action_bounds, max_action_bounds = mp.get_param_bounds()
        self.action_space = gym.spaces.Box(low=min_action_bounds.numpy(), high=max_action_bounds.numpy(),
                                           dtype=np.float32)

        self.active_obs = self.set_active_obs()
        self.observation_space = spaces.Box(low=self.env.observation_space.low[self.active_obs],
                                            high=self.env.observation_space.high[self.active_obs],
                                            dtype=self.env.observation_space.dtype)

    def get_trajectory(self, action: np.ndarray) -> Tuple:
        self.mp.set_params(action)
        traj_dict = self.mp.get_mp_trajs(get_pos=True, get_vel=True)
        trajectory_tensor, velocity_tensor = traj_dict['pos'], traj_dict['vel']

        trajectory = trajectory_tensor.numpy()
        velocity = velocity_tensor.numpy()

        if self.post_traj_steps > 0:
            trajectory = np.vstack([trajectory, np.tile(trajectory[-1, :], [self.post_traj_steps, 1])])
            velocity = np.vstack([velocity, np.zeros(shape=(self.post_traj_steps, self.mp.num_dof))])

        return trajectory, velocity

    @abstractmethod
    def set_active_obs(self):
        pass

    @property
    @abstractmethod
    def current_pos(self) -> Union[float, int, np.ndarray, Tuple]:
        """
        Returns the current position of the action/control dimension.
        The dimensionality has to match the action/control dimension.
        This is not required when exclusively using velocity control;
        it should, however, be implemented regardless.
        E.g. the joint positions that are directly or indirectly controlled by the action.
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def current_vel(self) -> Union[float, int, np.ndarray, Tuple]:
        """
        Returns the current velocity of the action/control dimension.
        The dimensionality has to match the action/control dimension.
        This is not required when exclusively using position control;
        it should, however, be implemented regardless.
        E.g. the joint velocities that are directly or indirectly controlled by the action.
        """
        raise NotImplementedError()

    def step(self, action: np.ndarray):
        """Generates a trajectory from the MP parameters and then steps the wrapped environment along it."""
        trajectory, velocity = self.get_trajectory(action)

        trajectory_length = len(trajectory)
        actions = np.zeros(shape=(trajectory_length,) + self.env.action_space.shape)
        if isinstance(self.env.observation_space, spaces.Dict):  # For goal environments
            observations = np.zeros(shape=(trajectory_length,) + self.env.observation_space["observation"].shape,
                                    dtype=self.env.observation_space.dtype)
        else:
            observations = np.zeros(shape=(trajectory_length,) + self.env.observation_space.shape,
                                    dtype=self.env.observation_space.dtype)
        rewards = np.zeros(shape=(trajectory_length,))
        trajectory_return = 0
        infos = dict()

        for t, pos_vel in enumerate(zip(trajectory, velocity)):
            ac = self.policy.get_action(pos_vel[0], pos_vel[1])
            actions[t, :] = np.clip(ac, self.env.action_space.low, self.env.action_space.high)
            obs, rewards[t], done, info = self.env.step(actions[t, :])
            observations[t, :] = obs["observation"] if isinstance(self.env.observation_space, spaces.Dict) else obs
            trajectory_return += rewards[t]
            for k, v in info.items():
                elems = infos.get(k, [None] * trajectory_length)
                elems[t] = v
                infos[k] = elems
            # infos['step_infos'].append(info)
            if self.render_mode:
                self.render(mode=self.render_mode, **self.render_kwargs)
            if done:
                break

        infos.update({k: v[:t + 1] for k, v in infos.items()})
        infos['trajectory'] = trajectory
        infos['step_actions'] = actions[:t + 1]
        infos['step_observations'] = observations[:t + 1]
        infos['step_rewards'] = rewards[:t + 1]
        infos['trajectory_length'] = t + 1
        done = True
        return self.get_observation_from_step(observations[t]), trajectory_return, done, infos

    def reset(self):
        return self.get_observation_from_step(self.env.reset())

    def render(self, mode='human', **kwargs):
        """Only set render options here, such that they can be used during the rollout.
        This only needs to be called once."""
        self.render_mode = mode
        self.render_kwargs = kwargs
        # self.env.render(mode=self.render_mode, **self.render_kwargs)
        self.env.render(mode=self.render_mode)

    def get_observation_from_step(self, observation: np.ndarray) -> np.ndarray:
        return observation[self.active_obs]
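
As a rough usage sketch (not part of this commit): a concrete environment subclasses BaseMPWrapper, provides the three abstract members, and is constructed with an mp_pytorch MPInterface instance plus a policy type. The names my_env and my_mp, the 7-DoF slices, and the gain values below are placeholders:

import numpy as np

from mp_wrapper import BaseMPWrapper


class MyMPWrapper(BaseMPWrapper):

    @property
    def current_pos(self):
        # Joint positions controlled by the action (placeholder slice).
        return self.env.sim.data.qpos[:7].copy()

    @property
    def current_vel(self):
        # Joint velocities controlled by the action (placeholder slice).
        return self.env.sim.data.qvel[:7].copy()

    def set_active_obs(self):
        # Expose every observation entry; real wrappers mask out unused parts.
        return np.ones(self.env.observation_space.shape[0], dtype=bool)


# my_env is an existing MujocoEnv and my_mp an mp_pytorch MPInterface instance (both assumed here).
# wrapped = MyMPWrapper(my_env, my_mp, duration=2.0, policy_type="motor",
#                       policy_kwargs={"p_gains": 1.0, "d_gains": 0.1})
# obs = wrapped.reset()
# obs, traj_return, done, info = wrapped.step(wrapped.action_space.sample())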

policies.py (new file, 129 lines)

@@ -0,0 +1,129 @@
from typing import Tuple, Union

import numpy as np


class BaseController:
    def __init__(self, env, **kwargs):
        self.env = env

    def get_action(self, des_pos, des_vel):
        raise NotImplementedError


class PosController(BaseController):
    """
    A position controller. The controller calculates a response based only on the desired position.
    """
    def get_action(self, des_pos, des_vel):
        return des_pos


class VelController(BaseController):
    """
    A velocity controller. The controller calculates a response based only on the desired velocity.
    """
    def get_action(self, des_pos, des_vel):
        return des_vel


class PDController(BaseController):
    """
    A PD controller. Using position and velocity information from a provided environment,
    the controller calculates a response based on the desired position and velocity.

    :param env: A position environment
    :param p_gains: Factors for the proportional gains
    :param d_gains: Factors for the derivative gains
    """

    def __init__(self,
                 env,
                 p_gains: Union[float, Tuple],
                 d_gains: Union[float, Tuple]):
        self.p_gains = p_gains
        self.d_gains = d_gains
        super(PDController, self).__init__(env)

    def get_action(self, des_pos, des_vel):
        cur_pos = self.env.current_pos
        cur_vel = self.env.current_vel
        assert des_pos.shape == cur_pos.shape, \
            f"Mismatch in dimension between desired position {des_pos.shape} and current position {cur_pos.shape}"
        assert des_vel.shape == cur_vel.shape, \
            f"Mismatch in dimension between desired velocity {des_vel.shape} and current velocity {cur_vel.shape}"
        trq = self.p_gains * (des_pos - cur_pos) + self.d_gains * (des_vel - cur_vel)
        return trq
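
To make the PD law concrete, a small hand-computed example with scalar gains; the gain and state values are chosen here purely for illustration:

import numpy as np

p_gains, d_gains = 1.0, 0.1
des_pos, cur_pos = np.array([0.5, -0.2]), np.array([0.4, -0.1])
des_vel, cur_vel = np.array([0.0, 0.0]), np.array([0.2, -0.3])

# trq = p * (des_pos - cur_pos) + d * (des_vel - cur_vel)
trq = p_gains * (des_pos - cur_pos) + d_gains * (des_vel - cur_vel)
print(trq)  # approximately [ 0.08 -0.07]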
class MetaWorldController(BaseController):
    """
    A MetaWorld controller. Using position information from a provided environment,
    the controller calculates a response based on the desired position.
    Unlike the other controllers, this is a special controller for MetaWorld environments.
    They use a position delta for the xyz coordinates and a raw position for the gripper opening.

    :param env: A position environment
    """

    def __init__(self,
                 env
                 ):
        super(MetaWorldController, self).__init__(env)

    def get_action(self, des_pos, des_vel):
        gripper_pos = des_pos[-1]
        cur_pos = self.env.current_pos[:-1]
        xyz_pos = des_pos[:-1]
        assert xyz_pos.shape == cur_pos.shape, \
            f"Mismatch in dimension between desired position {xyz_pos.shape} and current position {cur_pos.shape}"
        trq = np.hstack([(xyz_pos - cur_pos), gripper_pos])
        return trq
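
A quick illustration of the action layout this controller assumes (all numbers are made up): the first three entries of the desired position become a delta relative to the current end-effector position, while the last entry is forwarded unchanged as the gripper opening:

import numpy as np

des_pos = np.array([0.1, 0.2, 0.3, 0.8])  # xyz target + gripper opening
cur_pos = np.array([0.05, 0.2, 0.25])     # current xyz (gripper excluded)

action = np.hstack([des_pos[:-1] - cur_pos, des_pos[-1]])
print(action)  # approximately [0.05 0.  0.05 0.8]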
# TODO: Do we need this class?
class PDControllerExtend(BaseController):
    """
    A PD controller. Using position and velocity information from a provided positional environment,
    the controller calculates a response based on the desired position and velocity.

    :param env: A position environment
    :param p_gains: Factors for the proportional gains
    :param d_gains: Factors for the derivative gains
    """

    def __init__(self,
                 env,
                 p_gains: Union[float, Tuple],
                 d_gains: Union[float, Tuple]):
        self.p_gains = p_gains
        self.d_gains = d_gains
        super(PDControllerExtend, self).__init__(env)

    def get_action(self, des_pos, des_vel):
        cur_pos = self.env.current_pos
        cur_vel = self.env.current_vel
        if len(des_pos) != len(cur_pos):
            des_pos = self.env.extend_des_pos(des_pos)
        if len(des_vel) != len(cur_vel):
            des_vel = self.env.extend_des_vel(des_vel)
        trq = self.p_gains * (des_pos - cur_pos) + self.d_gains * (des_vel - cur_vel)
        return trq
def get_policy_class(policy_type, env, **kwargs):
    if policy_type == "motor":
        return PDController(env, **kwargs)
    elif policy_type == "velocity":
        return VelController(env)
    elif policy_type == "position":
        return PosController(env)
    elif policy_type == "metaworld":
        return MetaWorldController(env)
    else:
        raise ValueError(f"Invalid controller type {policy_type} provided. Only 'motor', 'velocity', 'position' "
                         f"and 'metaworld' are currently supported controllers.")