fancy_gym/alr_envs/utils/mps/mp_wrapper.py

from abc import ABC, abstractmethod
from typing import Union

import gym
import numpy as np

from alr_envs.utils.mps.alr_env import AlrEnv
from alr_envs.utils.policies import get_policy_class, BaseController


class MPWrapper(gym.Wrapper, ABC):
"""
Base class for movement primitive based gym.Wrapper implementations.
:param env: The (wrapped) environment this wrapper is applied on
:param num_dof: Dimension of the action space of the wrapped env
:param duration: Number of timesteps in the trajectory of the movement primitive
:param post_traj_time: Time for which the last position of the trajectory is fed to the environment to continue
simulation
:param policy_type: Type or object defining the policy that is used to generate action based on the trajectory
:param weight_scale: Scaling parameter for the actions given to this wrapper
:param render_mode: Equivalent to gym render mode
"""

    def __init__(self,
                 env: AlrEnv,
                 num_dof: int,
                 duration: int = 1,
                 post_traj_time: float = 0.,
                 policy_type: Union[str, BaseController] = None,
                 weights_scale: float = 1.,
                 render_mode: str = None,
                 **mp_kwargs
                 ):
        super().__init__(env)

        # adjust observation space to the reduced version
        obs_sp = self.env.observation_space
        self.observation_space = gym.spaces.Box(low=obs_sp.low[self.env.active_obs],
                                                high=obs_sp.high[self.env.active_obs],
                                                dtype=obs_sp.dtype)

        self.post_traj_steps = int(post_traj_time / env.dt)

        self.mp = self.initialize_mp(num_dof=num_dof, duration=duration, **mp_kwargs)
        self.weights_scale = weights_scale

        if isinstance(policy_type, str):
            self.policy = get_policy_class(policy_type, env, mp_kwargs)
        else:
            self.policy = policy_type

        # rendering
        self.render_mode = render_mode
        self.render_kwargs = {}
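
    # Construction sketch (the subclass name, "motor" policy type and num_basis kwarg are
    # hypothetical; in practice the registered gym environments build these wrappers):
    #   wrapper = SomeConcreteMPWrapper(env, num_dof=7, duration=2., post_traj_time=0.5,
    #                                   policy_type="motor", weights_scale=1., num_basis=5)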

    # TODO: @Max I think this should not be in this class, this functionality should be part of your sampler.
    def __call__(self, params, contexts=None):
        """
        Can be used to provide a batch of parameter sets.
        """
        params = np.atleast_2d(params)
        obs = []
        rewards = []
        dones = []
        infos = []
        # for p, c in zip(params, contexts):
        for p in params:
            # self.configure(c)
            ob, reward, done, info = self.step(p)
            obs.append(ob)
            rewards.append(reward)
            dones.append(done)
            infos.append(info)

        return obs, np.array(rewards), dones, infos
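
    # Batch usage sketch (hypothetical shapes, assuming an MP parameterised by
    # num_dof * num_basis weights):
    #   params = np.random.randn(10, num_dof * num_basis)
    #   obs, rewards, dones, infos = wrapper(params)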

    def reset(self):
        return self.env.reset()[self.env.active_obs]

    def step(self, action: np.ndarray):
        """Generate a trajectory from the movement primitive and execute it step by step on the wrapped env."""
        trajectory, velocity = self.mp_rollout(action)

        if self.post_traj_steps > 0:
            trajectory = np.vstack([trajectory, np.tile(trajectory[-1, :], [self.post_traj_steps, 1])])
            velocity = np.vstack([velocity, np.zeros(shape=(self.post_traj_steps, self.mp.num_dimensions))])

        trajectory_length = len(trajectory)
        actions = np.zeros(shape=(trajectory_length, self.mp.num_dimensions))
        observations = np.zeros(shape=(trajectory_length,) + self.env.observation_space.shape)
        rewards = np.zeros(shape=(trajectory_length,))
        trajectory_return = 0
        infos = dict(step_infos=[])

        for t, pos_vel in enumerate(zip(trajectory, velocity)):
            actions[t, :] = self.policy.get_action(pos_vel[0], pos_vel[1])
            observations[t, :], rewards[t], done, info = self.env.step(actions[t, :])
            trajectory_return += rewards[t]
            infos['step_infos'].append(info)
            if self.render_mode:
                self.env.render(mode=self.render_mode, **self.render_kwargs)
            if done:
                break

        infos['step_actions'] = actions[:t + 1]
        infos['step_observations'] = observations[:t + 1]
        infos['step_rewards'] = rewards[:t + 1]
        infos['trajectory_length'] = t + 1
        done = True
        return observations[t][self.env.active_obs], trajectory_return, done, infos

    def render(self, mode='human', **kwargs):
        """Only set render options here, such that they can be used during the rollout.
        This only needs to be called once."""
        self.render_mode = mode
        self.render_kwargs = kwargs
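
    # Typical use (assumption: called once before executing a rollout):
    #   wrapper.render(mode="human")
    #   wrapper.step(params)  # frames are then rendered inside the trajectory loop above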

    @abstractmethod
    def mp_rollout(self, action):
        """
        Generate trajectory and velocity based on the MP.

        Returns:
            trajectory/positions, velocity
        """
        raise NotImplementedError()

    @abstractmethod
    def initialize_mp(self, num_dof: int, duration: float, **kwargs):
        """
        Create the respective instance of the MP.

        Returns:
            MP instance
        """
        raise NotImplementedError()
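

# ---------------------------------------------------------------------------
# Minimal illustrative sketch (not part of the library): a concrete subclass
# filling in the two abstract hooks. The "MP" below is only a straight-line
# interpolation towards a goal given by the (scaled) weights; the real
# wrappers plug in proper DMP/ProMP implementations instead. All names in
# this sketch are hypothetical.
# ---------------------------------------------------------------------------
class _LinearToyMP:
    """Toy movement primitive: linear interpolation from a start to a goal position."""

    def __init__(self, num_dof: int, duration: float, dt: float):
        self.num_dimensions = num_dof
        self.num_steps = max(int(duration / dt), 2)

    def rollout(self, start: np.ndarray, goal: np.ndarray):
        phase = np.linspace(0., 1., self.num_steps)[:, None]
        trajectory = (1. - phase) * start[None, :] + phase * goal[None, :]
        velocity = np.gradient(trajectory, axis=0)
        return trajectory, velocity


class LinearToyMPWrapper(MPWrapper):
    """Example wrapper that interprets the action as the goal position of the toy MP."""

    def initialize_mp(self, num_dof: int, duration: float, **kwargs):
        # self.env is already available here, since super().__init__(env) runs before this hook.
        return _LinearToyMP(num_dof=num_dof, duration=duration, dt=self.env.dt)

    def mp_rollout(self, action):
        goal = np.atleast_1d(action).flatten() * self.weights_scale
        start = np.zeros_like(goal)  # assumption: the environment starts at the origin
        return self.mp.rollout(start, goal)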