Add interface for envs controlable by a PD Controller and add more infos to mp_wrapper info return value

This commit is contained in:
Marcel 2021-05-11 06:19:30 +02:00
parent 746d408a76
commit 4279414656
12 changed files with 169 additions and 86 deletions

View File

@ -1,3 +1,4 @@
import numpy as np
from gym.envs.registration import register from gym.envs.registration import register
from alr_envs.stochastic_search.functions.f_rosenbrock import Rosenbrock from alr_envs.stochastic_search.functions.f_rosenbrock import Rosenbrock
@ -321,7 +322,9 @@ register(
"bandwidth_factor": 2.5, "bandwidth_factor": 2.5,
"policy_type": "motor", "policy_type": "motor",
"weights_scale": 100, "weights_scale": 100,
"return_to_start": True "return_to_start": True,
"p_gains": np.array([4. / 3., 2.4, 2.5, 5. / 3., 2., 2., 1.25]),
"d_gains": np.array([0.0466, 0.12, 0.125, 0.04166, 0.06, 0.06, 0.025])
} }
) )
@ -339,7 +342,9 @@ register(
"bandwidth_factor": 2.5, "bandwidth_factor": 2.5,
"policy_type": "motor", "policy_type": "motor",
"weights_scale": 100, "weights_scale": 100,
"return_to_start": True "return_to_start": True,
"p_gains": np.array([4. / 3., 2.4, 2.5, 5. / 3., 2., 2., 1.25]),
"d_gains": np.array([0.0466, 0.12, 0.125, 0.04166, 0.06, 0.06, 0.025])
} }
) )
@ -357,7 +362,9 @@ register(
"policy_type": "motor", "policy_type": "motor",
"weights_scale": 0.2, "weights_scale": 0.2,
"zero_start": True, "zero_start": True,
"zero_goal": True "zero_goal": True,
"p_gains": np.array([4./3., 2.4, 2.5, 5./3., 2., 2., 1.25]),
"d_gains": np.array([0.0466, 0.12, 0.125, 0.04166, 0.06, 0.06, 0.025])
} }
) )
@ -374,7 +381,9 @@ register(
"policy_type": "motor", "policy_type": "motor",
"weights_scale": 0.2, "weights_scale": 0.2,
"zero_start": True, "zero_start": True,
"zero_goal": True "zero_goal": True,
"p_gains": np.array([4./3., 2.4, 2.5, 5./3., 2., 2., 1.25]),
"d_gains": np.array([0.0466, 0.12, 0.125, 0.04166, 0.06, 0.06, 0.025])
} }
) )
@ -392,7 +401,9 @@ register(
"bandwidth_factor": 2.5, "bandwidth_factor": 2.5,
"policy_type": "motor", "policy_type": "motor",
"weights_scale": 50, "weights_scale": 50,
"goal_scale": 0.1 "goal_scale": 0.1,
"p_gains": np.array([4. / 3., 2.4, 2.5, 5. / 3., 2., 2., 1.25]),
"d_gains": np.array([0.0466, 0.12, 0.125, 0.04166, 0.06, 0.06, 0.025])
} }
) )

View File

@ -7,7 +7,7 @@ from gym.utils import seeding
from matplotlib import patches from matplotlib import patches
from alr_envs.classic_control.utils import check_self_collision from alr_envs.classic_control.utils import check_self_collision
from alr_envs.utils.mps.mp_environments import AlrEnv from alr_envs.utils.mps.alr_env import AlrEnv
class HoleReacherEnv(AlrEnv): class HoleReacherEnv(AlrEnv):

View File

@ -5,7 +5,7 @@ import numpy as np
from gym import spaces from gym import spaces
from gym.utils import seeding from gym.utils import seeding
from alr_envs.utils.mps.mp_environments import AlrEnv from alr_envs.utils.mps.alr_env import AlrEnv
class SimpleReacherEnv(AlrEnv): class SimpleReacherEnv(AlrEnv):

View File

@ -6,7 +6,7 @@ import numpy as np
from gym.utils import seeding from gym.utils import seeding
from alr_envs.classic_control.utils import check_self_collision from alr_envs.classic_control.utils import check_self_collision
from alr_envs.utils.mps.mp_environments import AlrEnv from alr_envs.utils.mps.alr_env import AlrEnv
class ViaPointReacher(AlrEnv): class ViaPointReacher(AlrEnv):

View File

@ -7,7 +7,9 @@ from gym import error, spaces
from gym.utils import seeding from gym.utils import seeding
import numpy as np import numpy as np
from os import path from os import path
import gym
from alr_envs.utils.mps.alr_env import AlrEnv
from alr_envs.utils.positional_env import PositionalEnv
try: try:
import mujoco_py import mujoco_py
@ -33,7 +35,7 @@ def convert_observation_to_space(observation):
return space return space
class AlrMujocoEnv(gym.Env): class AlrMujocoEnv(PositionalEnv, AlrEnv):
""" """
Superclass for all MuJoCo environments. Superclass for all MuJoCo environments.
""" """
@ -44,7 +46,7 @@ class AlrMujocoEnv(gym.Env):
Args: Args:
model_path: path to xml file model_path: path to xml file
n_substeps: how many steps mujoco does per call to env.step n_substeps: how many steps mujoco does per call to env.step
use_servo: use actuator defined in xml, use False for direct torque control apply_gravity_comp: Whether gravity compensation should be active
""" """
if model_path.startswith("/"): if model_path.startswith("/"):
fullpath = model_path fullpath = model_path
@ -73,10 +75,6 @@ class AlrMujocoEnv(gym.Env):
self._set_action_space() self._set_action_space()
# action = self.action_space.sample()
# observation, _reward, done, _info = self.step(action)
# assert not done
observation = self._get_obs() # TODO: is calling get_obs enough? should we call reset, or even step? observation = self._get_obs() # TODO: is calling get_obs enough? should we call reset, or even step?
self._set_observation_space(observation) self._set_observation_space(observation)
@ -204,7 +202,7 @@ class AlrMujocoEnv(gym.Env):
try: try:
self.sim.step() self.sim.step()
except mujoco_py.builder.MujocoException as e: except mujoco_py.builder.MujocoException:
error_in_sim = True error_in_sim = True
return error_in_sim return error_in_sim

View File

@ -16,8 +16,6 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
self._q_vel = [] self._q_vel = []
# self.weight_matrix_scale = 50 # self.weight_matrix_scale = 50
self.max_ctrl = np.array([150., 125., 40., 60., 5., 5., 2.]) self.max_ctrl = np.array([150., 125., 40., 60., 5., 5., 2.])
self.p_gains = 1 / self.max_ctrl * np.array([200, 300, 100, 100, 10, 10, 2.5])
self.d_gains = 1 / self.max_ctrl * np.array([7, 15, 5, 2.5, 0.3, 0.3, 0.05])
self.j_min = np.array([-2.6, -1.985, -2.8, -0.9, -4.55, -1.5707, -2.7]) self.j_min = np.array([-2.6, -1.985, -2.8, -0.9, -4.55, -1.5707, -2.7])
self.j_max = np.array([2.6, 1.985, 2.8, 3.14159, 1.25, 1.5707, 2.7]) self.j_max = np.array([2.6, 1.985, 2.8, 3.14159, 1.25, 1.5707, 2.7])

View File

@ -1,14 +1,13 @@
from abc import abstractmethod from abc import abstractmethod, ABC
from typing import Union from typing import Union
import gym import gym
import numpy as np import numpy as np
class AlrEnv(gym.Env): class AlrEnv(gym.Env, ABC):
@property @property
@abstractmethod
def active_obs(self): def active_obs(self):
"""Returns boolean mask for each observation entry """Returns boolean mask for each observation entry
whether the observation is returned for the contextual case or not. whether the observation is returned for the contextual case or not.
@ -31,3 +30,11 @@ class AlrEnv(gym.Env):
By default this returns the starting position. By default this returns the starting position.
""" """
return self.start_pos return self.start_pos
@property
@abstractmethod
def dt(self) -> Union[float, int]:
"""
Returns the time between two simulated steps of the environment
"""
raise NotImplementedError()

View File

@ -2,29 +2,36 @@ import gym
import numpy as np import numpy as np
from mp_lib import det_promp from mp_lib import det_promp
from alr_envs.utils.mps.mp_environments import AlrEnv from alr_envs.utils.mps.alr_env import AlrEnv
from alr_envs.utils.mps.mp_wrapper import MPWrapper from alr_envs.utils.mps.mp_wrapper import MPWrapper
class DetPMPWrapper(MPWrapper): class DetPMPWrapper(MPWrapper):
def __init__(self, env: AlrEnv, num_dof: int, num_basis: int, width: float, duration: float = 1, dt: float = 0.01, def __init__(self, env: AlrEnv, num_dof, num_basis, width, start_pos=None, duration=1, post_traj_time=0.,
post_traj_time: float = 0., policy_type: str = None, weights_scale: float = 1., policy_type=None, weights_scale=1, zero_start=False, zero_goal=False, learn_mp_length: bool =True,
zero_start: bool = False, zero_goal: bool = False, **mp_kwargs): **mp_kwargs):
self.duration = duration # seconds self.duration = duration # seconds
dt = env.dt if hasattr(env, "dt") else dt super().__init__(env=env, num_dof=num_dof, duration=duration, post_traj_time=post_traj_time,
assert dt is not None policy_type=policy_type, weights_scale=weights_scale, num_basis=num_basis,
self.dt = dt width=width, zero_start=zero_start, zero_goal=zero_goal,
**mp_kwargs)
super().__init__(env, num_dof, dt, duration, post_traj_time, policy_type, weights_scale, num_basis=num_basis, self.learn_mp_length = learn_mp_length
width=width, zero_start=zero_start, zero_goal=zero_goal, **mp_kwargs) if self.learn_mp_length:
parameter_space_shape = (1+num_basis*num_dof,)
else:
parameter_space_shape = (num_basis * num_dof,)
self.min_param = -np.inf
self.max_param = np.inf
self.parameterization_space = gym.spaces.Box(low=self.min_param, high=self.max_param,
shape=parameter_space_shape, dtype=np.float32)
action_bounds = np.inf * np.ones((self.mp.n_basis * self.mp.n_dof)) self.start_pos = start_pos
self.action_space = gym.spaces.Box(low=-action_bounds, high=action_bounds, dtype=np.float32)
def initialize_mp(self, num_dof: int, duration: int, dt: float, num_basis: int = 5, width: float = None, def initialize_mp(self, num_dof: int, duration: int, num_basis: int = 5, width: float = None,
off: float = 0.01, zero_start: bool = False, zero_goal: bool = False): zero_start: bool = False, zero_goal: bool = False, **kwargs):
pmp = det_promp.DeterministicProMP(n_basis=num_basis, n_dof=num_dof, width=width, off=off, pmp = det_promp.DeterministicProMP(n_basis=num_basis, n_dof=num_dof, width=width, off=0.01,
zero_start=zero_start, zero_goal=zero_goal) zero_start=zero_start, zero_goal=zero_goal)
weights = np.zeros(shape=(num_basis, num_dof)) weights = np.zeros(shape=(num_basis, num_dof))
@ -33,10 +40,15 @@ class DetPMPWrapper(MPWrapper):
return pmp return pmp
def mp_rollout(self, action): def mp_rollout(self, action):
params = np.reshape(action, newshape=(self.mp.n_basis, self.mp.n_dof)) * self.weights_scale if self.learn_mp_length:
self.mp.set_weights(self.duration, params) duration = max(1, self.duration*abs(action[0]))
_, des_pos, des_vel, _ = self.mp.compute_trajectory(1 / self.dt, 1.) params = np.reshape(action[1:], (self.mp.n_basis, -1)) * self.weights_scale # TODO: Fix Bug when zero_start is true
else:
duration = self.duration
params = np.reshape(action, (self.mp.n_basis, -1)) * self.weights_scale # TODO: Fix Bug when zero_start is true
self.mp.set_weights(1., params)
_, des_pos, des_vel, _ = self.mp.compute_trajectory(frequency=max(1, duration))
if self.mp.zero_start: if self.mp.zero_start:
des_pos += self.env.start_pos[None, :] des_pos += self.start_pos
return des_pos, des_vel return des_pos, des_vel

View File

@ -4,7 +4,7 @@ from mp_lib import dmps
from mp_lib.basis import DMPBasisGenerator from mp_lib.basis import DMPBasisGenerator
from mp_lib.phase import ExpDecayPhaseGenerator from mp_lib.phase import ExpDecayPhaseGenerator
from alr_envs.utils.mps.mp_environments import AlrEnv from alr_envs.utils.mps.alr_env import AlrEnv
from alr_envs.utils.mps.mp_wrapper import MPWrapper from alr_envs.utils.mps.mp_wrapper import MPWrapper
@ -32,26 +32,26 @@ class DmpWrapper(MPWrapper):
goal_scale: goal_scale:
""" """
self.learn_goal = learn_goal self.learn_goal = learn_goal
dt = env.dt if hasattr(env, "dt") else dt
assert dt is not None
self.t = np.linspace(0, duration, int(duration / dt)) self.t = np.linspace(0, duration, int(duration / dt))
self.goal_scale = goal_scale self.goal_scale = goal_scale
super().__init__(env, num_dof, dt, duration, post_traj_time, policy_type, weights_scale, render_mode, super().__init__(env=env, num_dof=num_dof, duration=duration, post_traj_time=post_traj_time,
policy_type=policy_type, weights_scale=weights_scale, render_mode=render_mode,
num_basis=num_basis, alpha_phase=alpha_phase, bandwidth_factor=bandwidth_factor) num_basis=num_basis, alpha_phase=alpha_phase, bandwidth_factor=bandwidth_factor)
action_bounds = np.inf * np.ones((np.prod(self.mp.weights.shape) + (num_dof if learn_goal else 0))) action_bounds = np.inf * np.ones((np.prod(self.mp.weights.shape) + (num_dof if learn_goal else 0)))
self.action_space = gym.spaces.Box(low=-action_bounds, high=action_bounds, dtype=np.float32) self.action_space = gym.spaces.Box(low=-action_bounds, high=action_bounds, dtype=np.float32)
def initialize_mp(self, num_dof: int, duration: int, dt: float, num_basis: int = 5, alpha_phase: float = 2., def initialize_mp(self, num_dof: int, duration: int, num_basis: int, alpha_phase: float = 2.,
bandwidth_factor: int = 3): bandwidth_factor: int = 3, **kwargs):
phase_generator = ExpDecayPhaseGenerator(alpha_phase=alpha_phase, duration=duration) phase_generator = ExpDecayPhaseGenerator(alpha_phase=alpha_phase, duration=duration)
basis_generator = DMPBasisGenerator(phase_generator, duration=duration, num_basis=num_basis, basis_generator = DMPBasisGenerator(phase_generator, duration=duration, num_basis=num_basis,
basis_bandwidth_factor=bandwidth_factor) basis_bandwidth_factor=bandwidth_factor)
dmp = dmps.DMP(num_dof=num_dof, basis_generator=basis_generator, phase_generator=phase_generator, dmp = dmps.DMP(num_dof=num_dof, basis_generator=basis_generator, phase_generator=phase_generator,
duration=duration, dt=dt) dt=self.dt)
return dmp return dmp

View File

@ -1,16 +1,36 @@
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import Union
import gym import gym
import numpy as np import numpy as np
from alr_envs.utils.mps.mp_environments import AlrEnv from alr_envs.utils.mps.alr_env import AlrEnv
from alr_envs.utils.policies import get_policy_class from alr_envs.utils.policies import get_policy_class, BaseController
class MPWrapper(gym.Wrapper, ABC): class MPWrapper(gym.Wrapper, ABC):
"""
Base class for movement primitive based gym.Wrapper implementations.
def __init__(self, env: AlrEnv, num_dof: int, dt: float, duration: float = 1, post_traj_time: float = 0., :param env: The (wrapped) environment this wrapper is applied on
policy_type: str = None, weights_scale: float = 1., render_mode: str = None, **mp_kwargs): :param num_dof: Dimension of the action space of the wrapped env
:param duration: Number of timesteps in the trajectory of the movement primitive
:param post_traj_time: Time for which the last position of the trajectory is fed to the environment to continue
simulation
:param policy_type: Type or object defining the policy that is used to generate action based on the trajectory
:param weight_scale: Scaling parameter for the actions given to this wrapper
:param render_mode: Equivalent to gym render mode
"""
def __init__(self,
env: AlrEnv,
num_dof: int,
duration: int = 1,
post_traj_time: float = 0.,
policy_type: Union[str, BaseController] = None,
weights_scale: float = 1.,
render_mode: str = None,
**mp_kwargs
):
super().__init__(env) super().__init__(env)
# adjust observation space to reduce version # adjust observation space to reduce version
@ -19,14 +39,15 @@ class MPWrapper(gym.Wrapper, ABC):
high=obs_sp.high[self.env.active_obs], high=obs_sp.high[self.env.active_obs],
dtype=obs_sp.dtype) dtype=obs_sp.dtype)
assert dt is not None # this should never happen as MPWrapper is a base class self.post_traj_steps = int(post_traj_time / env.dt)
self.post_traj_steps = int(post_traj_time / dt)
self.mp = self.initialize_mp(num_dof, duration, dt, **mp_kwargs) self.mp = self.initialize_mp(num_dof=num_dof, duration=duration, **mp_kwargs)
self.weights_scale = weights_scale self.weights_scale = weights_scale
policy_class = get_policy_class(policy_type) if type(policy_type) is str:
self.policy = policy_class(env) self.policy = get_policy_class(policy_type, env, mp_kwargs)
else:
self.policy = policy_type
# rendering # rendering
self.render_mode = render_mode self.render_mode = render_mode
@ -62,29 +83,30 @@ class MPWrapper(gym.Wrapper, ABC):
if self.post_traj_steps > 0: if self.post_traj_steps > 0:
trajectory = np.vstack([trajectory, np.tile(trajectory[-1, :], [self.post_traj_steps, 1])]) trajectory = np.vstack([trajectory, np.tile(trajectory[-1, :], [self.post_traj_steps, 1])])
velocity = np.vstack([velocity, np.zeros(shape=(self.post_traj_steps, self.mp.n_dof))]) velocity = np.vstack([velocity, np.zeros(shape=(self.post_traj_steps, self.mp.num_dimensions))])
# self._trajectory = trajectory
# self._velocity = velocity
rewards = 0
info = {}
# create random obs as the reset function is called externally
obs = self.env.observation_space.sample()
trajectory_length = len(trajectory)
actions = np.zeros(shape=(trajectory_length, self.mp.num_dimensions))
observations= np.zeros(shape=(trajectory_length,) + self.env.observation_space.shape)
rewards = np.zeros(shape=(trajectory_length,))
trajectory_return = 0
infos = dict(step_infos =[])
for t, pos_vel in enumerate(zip(trajectory, velocity)): for t, pos_vel in enumerate(zip(trajectory, velocity)):
ac = self.policy.get_action(pos_vel[0], pos_vel[1]) actions[t,:] = self.policy.get_action(pos_vel[0], pos_vel[1])
obs, rew, done, info = self.env.step(ac) observations[t,:], rewards[t], done, info = self.env.step(actions[t,:])
rewards += rew trajectory_return += rewards[t]
# TODO return all dicts? infos['step_infos'].append(info)
# [infos[k].append(v) for k, v in info.items()]
if self.render_mode: if self.render_mode:
self.env.render(mode=self.render_mode, **self.render_kwargs) self.env.render(mode=self.render_mode, **self.render_kwargs)
if done: if done:
break break
infos['step_actions'] = actions[:t+1]
infos['step_observations'] = observations[:t+1]
infos['step_rewards'] = rewards[:t+1]
infos['trajectory_length'] = t+1
done = True done = True
return obs[self.env.active_obs], rewards, done, info return observations[t][self.env.active_obs], trajectory_return, done, infos
def render(self, mode='human', **kwargs): def render(self, mode='human', **kwargs):
"""Only set render options here, such that they can be used during the rollout. """Only set render options here, such that they can be used during the rollout.
@ -102,7 +124,7 @@ class MPWrapper(gym.Wrapper, ABC):
raise NotImplementedError() raise NotImplementedError()
@abstractmethod @abstractmethod
def initialize_mp(self, num_dof: int, duration: float, dt: float, **kwargs): def initialize_mp(self, num_dof: int, duration: float, **kwargs):
""" """
Create respective instance of MP Create respective instance of MP
Returns: Returns:

View File

@ -1,10 +1,12 @@
from typing import Tuple, Union
from gym import Env from gym import Env
from alr_envs.mujoco.alr_mujoco_env import AlrMujocoEnv from alr_envs.utils.positional_env import PositionalEnv
class BaseController: class BaseController:
def __init__(self, env: Env): def __init__(self, env: Env, **kwargs):
self.env = env self.env = env
def get_action(self, des_pos, des_vel): def get_action(self, des_pos, des_vel):
@ -22,27 +24,38 @@ class VelController(BaseController):
class PDController(BaseController): class PDController(BaseController):
def __init__(self, env: AlrMujocoEnv): """
self.p_gains = env.p_gains A PD-Controller. Using position and velocity information from a provided positional environment,
self.d_gains = env.d_gains the controller calculates a response based on the desired position and velocity
super(PDController, self).__init__(env)
:param env: A position environment
:param p_gains: Factors for the proportional gains
:param d_gains: Factors for the differential gains
"""
def __init__(self,
env: PositionalEnv,
p_gains: Union[float, Tuple],
d_gains: Union[float, Tuple]):
self.p_gains = p_gains
self.d_gains = d_gains
super(PDController, self).__init__(env, )
def get_action(self, des_pos, des_vel): def get_action(self, des_pos, des_vel):
# TODO: make standardized ALRenv such that all of them have current_pos/vel attributes
cur_pos = self.env.current_pos cur_pos = self.env.current_pos
cur_vel = self.env.current_vel cur_vel = self.env.current_vel
if len(des_pos) != len(cur_pos): assert des_pos.shape != cur_pos.shape, \
des_pos = self.env.extend_des_pos(des_pos) "Mismatch in dimension between desired position {} and current position {}".format(des_pos.shape, cur_pos.shape)
if len(des_vel) != len(cur_vel): assert des_vel.shape != cur_vel.shape, \
des_vel = self.env.extend_des_vel(des_vel) "Mismatch in dimension between desired velocity {} and current velocity {}".format(des_vel.shape,
cur_vel.shape)
trq = self.p_gains * (des_pos - cur_pos) + self.d_gains * (des_vel - cur_vel) trq = self.p_gains * (des_pos - cur_pos) + self.d_gains * (des_vel - cur_vel)
return trq return trq
def get_policy_class(policy_type): def get_policy_class(policy_type, env, mp_kwargs, **kwargs):
if policy_type == "motor": if policy_type == "motor":
return PDController return PDController(env, p_gains=mp_kwargs['p_gains'], d_gains=mp_kwargs['d_gains'])
elif policy_type == "velocity": elif policy_type == "velocity":
return VelController return VelController(env)
elif policy_type == "position": elif policy_type == "position":
return PosController return PosController(env)

View File

@ -0,0 +1,22 @@
from abc import abstractmethod
from typing import Union, Tuple
import numpy as np
from gym import Env
class PositionalEnv(Env):
"""A position and velocity based environment. It functions just as any regular OpenAI Gym
environment but it provides position, velocity and acceleration information. This usually means that the
corresponding information from the agent is forwarded via the properties.
PD-Controller based policies require this environment to calculate the state dependent actions for example.
"""
@property
@abstractmethod
def current_pos(self) -> Union[float, int, np.ndarray, Tuple]:
raise NotImplementedError
@property
@abstractmethod
def current_vel(self) -> Union[float, int, np.ndarray, Tuple]:
raise NotImplementedError