fancy_gym/alr_envs/utils/mps/mp_wrapper.py

from abc import ABC, abstractmethod
from typing import Union

import gym
import numpy as np

from alr_envs.utils.mps.alr_env import AlrEnv
from alr_envs.utils.policies import get_policy_class, BaseController


class MPWrapper(gym.Wrapper, ABC):
"""
Base class for movement primitive based gym.Wrapper implementations.
:param env: The (wrapped) environment this wrapper is applied on
:param num_dof: Dimension of the action space of the wrapped env
:param duration: Number of timesteps in the trajectory of the movement primitive
:param post_traj_time: Time for which the last position of the trajectory is fed to the environment to continue
simulation
:param policy_type: Type or object defining the policy that is used to generate action based on the trajectory
:param weight_scale: Scaling parameter for the actions given to this wrapper
:param render_mode: Equivalent to gym render mode
"""

    def __init__(self,
                 env: AlrEnv,
                 num_dof: int,
                 duration: int = 1,
                 post_traj_time: float = 0.,
                 policy_type: Union[str, BaseController] = None,
                 weights_scale: float = 1.,
                 render_mode: str = None,
                 **mp_kwargs
                 ):
        super().__init__(env)

        # adjust observation space to the reduced version
        obs_sp = self.env.observation_space
        self.observation_space = gym.spaces.Box(low=obs_sp.low[self.env.active_obs],
                                                high=obs_sp.high[self.env.active_obs],
                                                dtype=obs_sp.dtype)

        self.post_traj_steps = int(post_traj_time / env.dt)

        self.mp = self.initialize_mp(num_dof=num_dof, duration=duration, **mp_kwargs)
        self.weights_scale = weights_scale

        if isinstance(policy_type, str):
            self.policy = get_policy_class(policy_type, env, mp_kwargs)
        else:
            self.policy = policy_type

        # rendering
        self.render_mode = render_mode
        self.render_kwargs = {}
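
    # Construction sketch (the subclass name, "motor" policy type and num_basis kwarg are
    # hypothetical; in practice the registered gym environments build these wrappers):
    #   wrapper = SomeConcreteMPWrapper(env, num_dof=7, duration=2., post_traj_time=0.5,
    #                                   policy_type="motor", weights_scale=1., num_basis=5)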

    # TODO: @Max I think this should not be in this class, this functionality should be part of your sampler.
    def __call__(self, params, contexts=None):
        """
        Can be used to provide a batch of parameter sets.
        """
        params = np.atleast_2d(params)
        obs = []
        rewards = []
        dones = []
        infos = []
        # for p, c in zip(params, contexts):
        for p in params:
            # self.configure(c)
            ob, reward, done, info = self.step(p)
            obs.append(ob)
            rewards.append(reward)
            dones.append(done)
            infos.append(info)

        return obs, np.array(rewards), dones, infos
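
    # Batch usage sketch (hypothetical shapes, assuming an MP parameterised by
    # num_dof * num_basis weights):
    #   params = np.random.randn(10, num_dof * num_basis)
    #   obs, rewards, dones, infos = wrapper(params)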

    def reset(self):
        return self.env.reset()[self.env.active_obs]

    def step(self, action: np.ndarray):
        """Generate a trajectory from the movement primitive and execute it step by step on the wrapped env."""
        trajectory, velocity = self.mp_rollout(action)

        if self.post_traj_steps > 0:
            trajectory = np.vstack([trajectory, np.tile(trajectory[-1, :], [self.post_traj_steps, 1])])
            velocity = np.vstack([velocity, np.zeros(shape=(self.post_traj_steps, self.mp.num_dimensions))])

        trajectory_length = len(trajectory)
        actions = np.zeros(shape=(trajectory_length, self.mp.num_dimensions))
        observations = np.zeros(shape=(trajectory_length,) + self.env.observation_space.shape)
        rewards = np.zeros(shape=(trajectory_length,))
        trajectory_return = 0
        infos = dict(step_infos=[])

        for t, pos_vel in enumerate(zip(trajectory, velocity)):
            actions[t, :] = self.policy.get_action(pos_vel[0], pos_vel[1])
            observations[t, :], rewards[t], done, info = self.env.step(actions[t, :])
            trajectory_return += rewards[t]
            infos['step_infos'].append(info)
            if self.render_mode:
                self.env.render(mode=self.render_mode, **self.render_kwargs)
            if done:
                break

        infos['step_actions'] = actions[:t + 1]
        infos['step_observations'] = observations[:t + 1]
        infos['step_rewards'] = rewards[:t + 1]
        infos['trajectory_length'] = t + 1
        done = True
        return observations[t][self.env.active_obs], trajectory_return, done, infos

    def render(self, mode='human', **kwargs):
        """Only set render options here, such that they can be used during the rollout.
        This only needs to be called once."""
        self.render_mode = mode
        self.render_kwargs = kwargs
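
    # Typical use (assumption: called once before executing a rollout):
    #   wrapper.render(mode="human")
    #   wrapper.step(params)  # frames are then rendered inside the trajectory loop above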

    @abstractmethod
    def mp_rollout(self, action):
        """
        Generate trajectory and velocity based on the MP.

        Returns:
            trajectory/positions, velocity
        """
        raise NotImplementedError()

    @abstractmethod
    def initialize_mp(self, num_dof: int, duration: float, **kwargs):
        """
        Create the respective instance of the MP.

        Returns:
            MP instance
        """
        raise NotImplementedError()
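

# ---------------------------------------------------------------------------
# Minimal illustrative sketch (not part of the library): a concrete subclass
# filling in the two abstract hooks. The "MP" below is only a straight-line
# interpolation towards a goal given by the (scaled) weights; the real
# wrappers plug in proper DMP/ProMP implementations instead. All names in
# this sketch are hypothetical.
# ---------------------------------------------------------------------------
class _LinearToyMP:
    """Toy movement primitive: linear interpolation from a start to a goal position."""

    def __init__(self, num_dof: int, duration: float, dt: float):
        self.num_dimensions = num_dof
        self.num_steps = max(int(duration / dt), 2)

    def rollout(self, start: np.ndarray, goal: np.ndarray):
        phase = np.linspace(0., 1., self.num_steps)[:, None]
        trajectory = (1. - phase) * start[None, :] + phase * goal[None, :]
        velocity = np.gradient(trajectory, axis=0)
        return trajectory, velocity


class LinearToyMPWrapper(MPWrapper):
    """Example wrapper that interprets the action as the goal position of the toy MP."""

    def initialize_mp(self, num_dof: int, duration: float, **kwargs):
        # self.env is already available here, since super().__init__(env) runs before this hook.
        return _LinearToyMP(num_dof=num_dof, duration=duration, dt=self.env.dt)

    def mp_rollout(self, action):
        goal = np.atleast_1d(action).flatten() * self.weights_scale
        start = np.zeros_like(goal)  # assumption: the environment starts at the origin
        return self.mp.rollout(start, goal)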