First cleanup and some non-working ideas sketched

This commit is contained in:
Fabian Otto 2022-06-28 16:05:09 +02:00
parent 7bd9848c31
commit 9ad6fbe712
10 changed files with 225 additions and 260 deletions

View File

@ -1,26 +1,32 @@
## ALR Robotics Control Environments
This project offers a large variety of reinforcement learning environments under the unifying interface of [OpenAI gym](https://gym.openai.com/).
Besides, we also provide support (under the OpenAI interface) for the benchmark suites
We provide support (under the OpenAI interface) for the benchmark suites
[DeepMind Control](https://deepmind.com/research/publications/2020/dm-control-Software-and-Tasks-for-Continuous-Control)
(DMC) and [Metaworld](https://meta-world.github.io/). Custom (Mujoco) gym environments can be created according
to [this guide](https://github.com/openai/gym/blob/master/docs/creating-environments.md). Unlike existing libraries, we
additionally support to control agents with Dynamic Movement Primitives (DMPs) and Probabilistic Movement Primitives (ProMP,
we only consider the mean usually).
(DMC) and [Metaworld](https://meta-world.github.io/).
Custom (Mujoco) gym environments can be created according
to [this guide](https://www.gymlibrary.ml/content/environment_creation/).
Unlike existing libraries, we additionally support controlling agents with movement primitives, such as
Dynamic Movement Primitives (DMPs) and Probabilistic Movement Primitives (ProMPs; we usually only consider the mean).
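For a quick impression of the unified interface, a minimal step-based usage sketch is shown below; the environment id is a placeholder, and any id registered by this package (including the DMC and Metaworld wrappers) is used the same way.

```python
# Minimal step-based usage sketch. "ALRReacher-v0" is a placeholder id;
# substitute any environment id registered by this package.
import alr_envs

env = alr_envs.make("ALRReacher-v0", seed=1)
obs = env.reset()
for _ in range(200):
    obs, reward, done, info = env.step(env.action_space.sample())
    if done:
        obs = env.reset()
env.close()
```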
## Motion Primitive Environments (Episodic environments)
## Movement Primitive Environments (Episode-Based/Black-Box Environments)
Unlike step-based environments, motion primitive (MP) environments are closer related to stochastic search, black box
optimization, and methods that are often used in robotics. MP environments are trajectory-based and always execute a full
trajectory, which is generated by a Dynamic Motion Primitive (DMP) or a Probabilistic Motion Primitive (ProMP). The
generated trajectory is translated into individual step-wise actions by a controller. The exact choice of controller is,
however, dependent on the type of environment. We currently support position, velocity, and PD-Controllers for position,
velocity, and torque control, respectively. The goal of all MP environments is still to learn a policy. Yet, an action
Unlike step-based environments, movement primitive (MP) environments are more closely related to stochastic search,
black-box optimization, and methods that are often used in traditional robotics and control.
MP environments are episode-based and always execute a full trajectory, which is generated by a trajectory generator,
such as a Dynamic Movement Primitive (DMP) or a Probabilistic Movement Primitive (ProMP).
The generated trajectory is translated into individual step-wise actions by a trajectory tracking controller.
The exact choice of controller, however, depends on the type of environment.
We currently support position, velocity, and PD controllers for position, velocity, and torque control, respectively,
as well as a special controller for the MetaWorld control suite.
The goal of all MP environments is still to learn an optimal policy. Yet, an action
represents the parametrization of the motion primitives to generate a suitable trajectory. Additionally, in this
framework we support all of this also for the contextual setting, for which we expose all changing substates of the
task as a single observation in the beginning. This requires to predict a new action/MP parametrization for each
trajectory. All environments provide next to the cumulative episode reward all collected information from each
step as part of the info dictionary. This information should, however, mainly be used for debugging and logging.
framework we also support all of this for the contextual setting, i.e. we expose a subset of the observation space
as a single context at the beginning of the episode. This requires predicting a new action/MP parametrization for each
context.
In addition to the cumulative episode reward, all environments provide the information collected at each
step as part of the info dictionary. This information is, however, mainly meant for debugging and logging,
not for training (see the sketch after the table below).
|Key| Description|
|---|---|
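To illustrate the episode-based interface described above, here is a minimal sketch; the environment id is a placeholder, and a single `step()` call consumes one MP parametrization and executes the whole trajectory.

```python
# Episode-based MP usage sketch. "ALRBeerPongProMP-v0" is a placeholder id;
# substitute any registered movement primitive environment.
import alr_envs

env = alr_envs.make("ALRBeerPongProMP-v0", seed=1)
context = env.reset()                  # context exposed at the start of the episode
mp_params = env.action_space.sample()  # one MP parametrization per episode
obs, episode_return, done, info = env.step(mp_params)  # executes the full trajectory
print(episode_return, sorted(info.keys()))  # cumulative reward and per-step logging info
env.close()
```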

View File

@ -4,10 +4,11 @@ import numpy as np
from gym.envs.mujoco import MujocoEnv
class ALRBallInACupEnv(MujocoEnv, utils.EzPickle):
def __init__(self, n_substeps=4, apply_gravity_comp=True, simplified: bool = False,
reward_type: str = None, context: np.ndarray = None):
def __init__(
self, n_substeps=4, apply_gravity_comp=True, simplified: bool = False,
reward_type: str = None, context: np.ndarray = None
):
utils.EzPickle.__init__(**locals())
self._steps = 0
@ -23,9 +24,7 @@ class ALRBallInACupEnv(MujocoEnv, utils.EzPickle):
self.context = context
alr_mujoco_env.AlrMujocoEnv.__init__(self,
self.xml_path,
apply_gravity_comp=apply_gravity_comp,
alr_mujoco_env.AlrMujocoEnv.__init__(self, self.xml_path, apply_gravity_comp=apply_gravity_comp,
n_substeps=n_substeps)
self._start_pos = np.array([0.0, 0.58760536, 0.0, 1.36004913, 0.0, -0.32072943, -1.57])
self._start_vel = np.zeros(7)
@ -129,7 +128,7 @@ class ALRBallInACupEnv(MujocoEnv, utils.EzPickle):
np.sin(theta),
# self.get_body_com("target"), # only return target to make problem harder
[self._steps],
])
])
# TODO
@property
@ -139,7 +138,7 @@ class ALRBallInACupEnv(MujocoEnv, utils.EzPickle):
[False] * 7, # sin
# [True] * 2, # x-y coordinates of target distance
[False] # env steps
])
])
# These functions are for the task with 3 joint actuations
def extend_des_pos(self, des_pos):

View File

@ -67,7 +67,7 @@ class BallInACupReward(alr_reward_fct.AlrReward):
action_cost = np.sum(np.square(action))
self.action_costs.append(action_cost)
self._is_collided = self.check_collision(self.env.sim) or self.env.check_traj_in_joint_limits()
self._is_collided = self.check_collision(self.env.sim) or self.env._check_traj_in_joint_limits()
if self.env._steps == self.env.ep_length - 1 or self._is_collided:
t_min_dist = np.argmin(self.dists)

View File

@ -1,11 +1,11 @@
import mujoco_py.builder
import os
import mujoco_py.builder
import numpy as np
from gym import utils, spaces
from gym import utils
from gym.envs.mujoco import MujocoEnv
from alr_envs.alr.mujoco.beerpong.beerpong_reward_staged import BeerPongReward
from alr_envs.alr.mujoco.beerpong.beerpong_reward_staged import BeerPongReward
CUP_POS_MIN = np.array([-1.42, -4.05])
CUP_POS_MAX = np.array([1.42, -1.25])
@ -18,12 +18,14 @@ CUP_POS_MAX = np.array([1.42, -1.25])
# CUP_POS_MIN = np.array([-0.16, -2.2])
# CUP_POS_MAX = np.array([0.16, -1.7])
class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False,
rndm_goal=False, cup_goal_pos=None):
def __init__(
self, frame_skip=1, apply_gravity_comp=True, noisy=False,
rndm_goal=False, cup_goal_pos=None
):
cup_goal_pos = np.array(cup_goal_pos if cup_goal_pos is not None else [-0.3, -1.2, 0.840])
if cup_goal_pos.shape[0]==2:
if cup_goal_pos.shape[0] == 2:
cup_goal_pos = np.insert(cup_goal_pos, 2, 0.840)
self.cup_goal_pos = np.array(cup_goal_pos)
@ -50,7 +52,7 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
# self._release_step = 130 # time step of ball release
self.release_step = 100 # time step of ball release
self.ep_length = 600//frame_skip
self.ep_length = 600 // frame_skip
self.cup_table_id = 10
if noisy:
@ -71,14 +73,6 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
def start_vel(self):
return self._start_vel
@property
def current_pos(self):
return self.sim.data.qpos[0:7].copy()
@property
def current_vel(self):
return self.sim.data.qvel[0:7].copy()
def reset(self):
self.reward_function.reset(self.add_noise)
return super().reset()
@ -122,7 +116,7 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
self.sim.data.qpos[7::] = self.sim.data.site_xpos[self.ball_site_id, :].copy()
self.sim.data.qvel[7::] = self.sim.data.site_xvelp[self.ball_site_id, :].copy()
elif self._steps == self.release_step and self.add_noise:
self.sim.data.qvel[7::] += self.noise_std * np.random.randn(3)
self.sim.data.qvel[7::] += self.noise_std * np.random.randn(3)
crash = False
except mujoco_py.builder.MujocoException:
crash = True
@ -147,29 +141,32 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
ball_pos = np.zeros(3)
ball_vel = np.zeros(3)
infos = dict(reward_dist=reward_dist,
reward=reward,
velocity=angular_vel,
# traj=self._q_pos,
action=a,
q_pos=self.sim.data.qpos[0:7].ravel().copy(),
q_vel=self.sim.data.qvel[0:7].ravel().copy(),
ball_pos=ball_pos,
ball_vel=ball_vel,
success=success,
is_collided=is_collided, sim_crash=crash,
table_contact_first=int(not self.reward_function.ball_ground_contact_first))
infos = dict(
reward_dist=reward_dist,
reward=reward,
velocity=angular_vel,
# traj=self._q_pos,
action=a,
q_pos=self.sim.data.qpos[0:7].ravel().copy(),
q_vel=self.sim.data.qvel[0:7].ravel().copy(),
ball_pos=ball_pos,
ball_vel=ball_vel,
success=success,
is_collided=is_collided, sim_crash=crash,
table_contact_first=int(not self.reward_function.ball_ground_contact_first)
)
infos.update(reward_infos)
return ob, reward, done, infos
def check_traj_in_joint_limits(self):
def _check_traj_in_joint_limits(self):
return any(self.current_pos > self.j_max) or any(self.current_pos < self.j_min)
def _get_obs(self):
theta = self.sim.data.qpos.flat[:7]
theta_dot = self.sim.data.qvel.flat[:7]
ball_pos = self.sim.data.body_xpos[self.sim.model._body_name2id["ball"]].copy()
cup_goal_diff_final = ball_pos - self.sim.data.site_xpos[self.sim.model._site_name2id["cup_goal_final_table"]].copy()
cup_goal_diff_final = ball_pos - self.sim.data.site_xpos[
self.sim.model._site_name2id["cup_goal_final_table"]].copy()
cup_goal_diff_top = ball_pos - self.sim.data.site_xpos[self.sim.model._site_name2id["cup_goal_table"]].copy()
return np.concatenate([
np.cos(theta),
@ -179,17 +176,21 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
cup_goal_diff_top,
self.sim.model.body_pos[self.cup_table_id][:2].copy(),
[self._steps],
])
])
def compute_reward(self):
@property
def dt(self):
return super(ALRBeerBongEnv, self).dt*self.repeat_action
return super(ALRBeerBongEnv, self).dt * self.repeat_action
class ALRBeerBongEnvFixedReleaseStep(ALRBeerBongEnv):
def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None):
super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos)
self.release_step = 62 # empirically evaluated for frame_skip=2!
class ALRBeerBongEnvStepBasedEpisodicReward(ALRBeerBongEnv):
def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None):
super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos)
@ -202,54 +203,21 @@ class ALRBeerBongEnvStepBasedEpisodicReward(ALRBeerBongEnv):
reward = 0
done = False
while not done:
sub_ob, sub_reward, done, sub_infos = super(ALRBeerBongEnvStepBasedEpisodicReward, self).step(np.zeros(a.shape))
sub_ob, sub_reward, done, sub_infos = super(ALRBeerBongEnvStepBasedEpisodicReward, self).step(
np.zeros(a.shape))
reward += sub_reward
infos = sub_infos
ob = sub_ob
ob[-1] = self.release_step + 1 # Since we simulate until the end of the episode, PPO does not see the
# internal steps and thus, the observation also needs to be set correctly
ob[-1] = self.release_step + 1 # Since we simulate until the end of the episode, PPO does not see the
# internal steps and thus, the observation also needs to be set correctly
return ob, reward, done, infos
# class ALRBeerBongEnvStepBasedEpisodicReward(ALRBeerBongEnv):
# def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None):
# super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos)
# self.release_step = 62 # empirically evaluated for frame_skip=2!
#
# def step(self, a):
# if self._steps < self.release_step:
# return super(ALRBeerBongEnvStepBasedEpisodicReward, self).step(a)
# else:
# sub_ob, sub_reward, done, sub_infos = super(ALRBeerBongEnvStepBasedEpisodicReward, self).step(np.zeros(a.shape))
# reward = sub_reward
# infos = sub_infos
# ob = sub_ob
# ob[-1] = self.release_step + 1 # Since we simulate until the end of the episode, PPO does not see the
# # internal steps and thus, the observation also needs to be set correctly
# return ob, reward, done, infos
class ALRBeerBongEnvStepBased(ALRBeerBongEnv):
def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None):
super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos)
self.release_step = 62 # empirically evaluated for frame_skip=2!
# def _set_action_space(self):
# bounds = super(ALRBeerBongEnvStepBased, self)._set_action_space()
# min_bound = np.concatenate(([-1], bounds.low), dtype=bounds.dtype)
# max_bound = np.concatenate(([1], bounds.high), dtype=bounds.dtype)
# self.action_space = spaces.Box(low=min_bound, high=max_bound, dtype=bounds.dtype)
# return self.action_space
# def step(self, a):
# self.release_step = self._steps if a[0]>=0 and self.release_step >= self._steps else self.release_step
# return super(ALRBeerBongEnvStepBased, self).step(a[1:])
#
# def reset(self):
# ob = super(ALRBeerBongEnvStepBased, self).reset()
# self.release_step = self.ep_length + 1
# return ob
def step(self, a):
if self._steps < self.release_step:
return super(ALRBeerBongEnvStepBased, self).step(a)
@ -267,9 +235,9 @@ class ALRBeerBongEnvStepBased(ALRBeerBongEnv):
cup_goal_dist_top = np.linalg.norm(ball_pos - self.sim.data.site_xpos[
self.sim.model._site_name2id["cup_goal_table"]].copy())
if sub_infos['success']:
dist_rew = -cup_goal_dist_final**2
dist_rew = -cup_goal_dist_final ** 2
else:
dist_rew = -0.5*cup_goal_dist_final**2 - cup_goal_dist_top**2
dist_rew = -0.5 * cup_goal_dist_final ** 2 - cup_goal_dist_top ** 2
reward = reward - sub_infos['action_cost'] + dist_rew
infos = sub_infos
ob = sub_ob
@ -278,13 +246,13 @@ class ALRBeerBongEnvStepBased(ALRBeerBongEnv):
return ob, reward, done, infos
if __name__ == "__main__":
# env = ALRBeerBongEnv(rndm_goal=True)
# env = ALRBeerBongEnvStepBased(frame_skip=2, rndm_goal=True)
# env = ALRBeerBongEnvStepBasedEpisodicReward(frame_skip=2, rndm_goal=True)
env = ALRBeerBongEnvFixedReleaseStep(frame_skip=2, rndm_goal=True)
import time
env.reset()
env.render("human")
for i in range(1500):

View File

@ -29,52 +29,38 @@ class BeerPongReward:
# "cup_base_table"
]
self.ball_traj = None
self.dists = None
self.dists_final = None
self.costs = None
self.action_costs = None
self.angle_rewards = None
self.cup_angles = None
self.cup_z_axes = None
self.collision_penalty = 500
self.reset(None)
self.reset()
self.is_initialized = False
def reset(self, noisy):
self.ball_traj = []
def reset(self):
self.dists = []
self.dists_final = []
self.costs = []
self.action_costs = []
self.angle_rewards = []
self.cup_angles = []
self.cup_z_axes = []
self.ball_ground_contact_first = False
self.ball_table_contact = False
self.ball_wall_contact = False
self.ball_cup_contact = False
self.ball_in_cup = False
self.dist_ground_cup = -1 # distance floor to cup if first floor contact
self.noisy_bp = noisy
self._t_min_final_dist = -1
self.dist_ground_cup = -1 # distance floor to cup if first floor contact
def initialize(self, env):
if not self.is_initialized:
self.is_initialized = True
self.ball_id = env.sim.model._body_name2id["ball"]
self.ball_collision_id = env.sim.model._geom_name2id["ball_geom"]
self.goal_id = env.sim.model._site_name2id["cup_goal_table"]
self.goal_final_id = env.sim.model._site_name2id["cup_goal_final_table"]
self.cup_collision_ids = [env.sim.model._geom_name2id[name] for name in self.cup_collision_objects]
self.cup_table_id = env.sim.model._body_name2id["cup_table"]
self.ball_collision_id = env.sim.model._geom_name2id["ball_geom"]
self.table_collision_id = env.sim.model._geom_name2id["table_contact_geom"]
self.wall_collision_id = env.sim.model._geom_name2id["wall"]
self.cup_table_collision_id = env.sim.model._geom_name2id["cup_base_table_contact"]
self.init_ball_pos_site_id = env.sim.model._site_name2id["init_ball_pos_site"]
self.ground_collision_id = env.sim.model._geom_name2id["ground"]
self.cup_collision_ids = [env.sim.model._geom_name2id[name] for name in self.cup_collision_objects]
self.robot_collision_ids = [env.sim.model._geom_name2id[name] for name in self.robot_collision_objects]
def compute_reward(self, env, action):
@ -87,7 +73,7 @@ class BeerPongReward:
self.check_contacts(env.sim)
self.dists.append(np.linalg.norm(goal_pos - ball_pos))
self.dists_final.append(np.linalg.norm(goal_final_pos - ball_pos))
self.dist_ground_cup = np.linalg.norm(ball_pos-goal_pos) \
self.dist_ground_cup = np.linalg.norm(ball_pos - goal_pos) \
if self.ball_ground_contact_first and self.dist_ground_cup == -1 else self.dist_ground_cup
action_cost = np.sum(np.square(action))
self.action_costs.append(np.copy(action_cost))
@ -99,7 +85,7 @@ class BeerPongReward:
min_dist = np.min(self.dists)
final_dist = self.dists_final[-1]
if self.ball_ground_contact_first:
min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 1, 0.5, 2, -4 # relative rew offset when first bouncing on ground
min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 1, 0.5, 2, -4  # relative rew offset when first bouncing on ground
# min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 1, 0.5, 0, -6 # absolute rew offset when first bouncing on ground
else:
if not self.ball_in_cup:
@ -108,18 +94,18 @@ class BeerPongReward:
else:
min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 1, 0.5, 0, -2
else:
min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 0, 1, 0 ,0
min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 0, 1, 0, 0
# dist_ground_cup = 1 * self.dist_ground_cup
action_cost = 1e-4 * np.mean(action_cost)
reward = rew_offset - min_dist_coeff * min_dist ** 2 - final_dist_coeff * final_dist ** 2 - \
action_cost - ground_contact_dist_coeff*self.dist_ground_cup ** 2
action_cost - ground_contact_dist_coeff * self.dist_ground_cup ** 2
# 1e-7*np.mean(action_cost)
# release step punishment
min_time_bound = 0.1
max_time_bound = 1.0
release_time = env.release_step*env.dt
release_time_rew = int(release_time<min_time_bound)*(-30-10*(release_time-min_time_bound)**2) \
+int(release_time>max_time_bound)*(-30-10*(release_time-max_time_bound)**2)
release_time = env.release_step * env.dt
release_time_rew = int(release_time < min_time_bound) * (-30 - 10 * (release_time - min_time_bound) ** 2) \
+ int(release_time > max_time_bound) * (-30 - 10 * (release_time - max_time_bound) ** 2)
reward += release_time_rew
success = self.ball_in_cup
# print('release time :', release_time)
@ -147,17 +133,17 @@ class BeerPongReward:
self.table_collision_id)
if not self.ball_cup_contact:
self.ball_cup_contact = self._check_collision_with_set_of_objects(sim, self.ball_collision_id,
self.cup_collision_ids)
self.cup_collision_ids)
if not self.ball_wall_contact:
self.ball_wall_contact = self._check_collision_single_objects(sim, self.ball_collision_id,
self.wall_collision_id)
self.wall_collision_id)
if not self.ball_in_cup:
self.ball_in_cup = self._check_collision_single_objects(sim, self.ball_collision_id,
self.cup_table_collision_id)
if not self.ball_ground_contact_first:
if not self.ball_table_contact and not self.ball_cup_contact and not self.ball_wall_contact and not self.ball_in_cup:
self.ball_ground_contact_first = self._check_collision_single_objects(sim, self.ball_collision_id,
self.ground_collision_id)
self.ground_collision_id)
def _check_collision_single_objects(self, sim, id_1, id_2):
for coni in range(0, sim.data.ncon):
@ -190,4 +176,4 @@ class BeerPongReward:
if collision or collision_trans:
return True
return False
return False

View File

@ -1,10 +1,15 @@
from alr_envs.mp.episodic_wrapper import EpisodicWrapper
from typing import Union, Tuple
from typing import Tuple, Union
import numpy as np
import gym
from alr_envs.mp.episodic_wrapper import EpisodicWrapper
class NewMPWrapper(EpisodicWrapper):
# def __init__(self, replanning_model):
# self.replanning_model = replanning_model
@property
def current_pos(self) -> Union[float, int, np.ndarray, Tuple]:
return self.env.sim.data.qpos[0:7].copy()
@ -22,26 +27,16 @@ class NewMPWrapper(EpisodicWrapper):
[False] * 3, # cup_goal_diff_top
[True] * 2, # xy position of cup
[False] # env steps
])
])
# def set_mp_action_space(self):
# min_action_bounds, max_action_bounds = self.mp.get_param_bounds()
# if self.mp.learn_tau:
# min_action_bounds[0] = 20*self.env.dt
# max_action_bounds[0] = 260*self.env.dt
# mp_action_space = gym.spaces.Box(low=min_action_bounds.numpy(), high=max_action_bounds.numpy(),
# dtype=np.float32)
# return mp_action_space
# def _step_callback(self, t: int, env_spec_params: Union[np.ndarray, None], step_action: np.ndarray) -> Union[np.ndarray]:
# if self.mp.learn_tau:
# return np.concatenate((step_action, np.atleast_1d(env_spec_params)))
# else:
# return step_action
def do_replanning(self, pos, vel, s, a, t, last_replan_step):
return False
# const = np.arange(0, 1000, 10)
# return bool(self.replanning_model(s))
def _episode_callback(self, action: np.ndarray) -> Tuple[np.ndarray, Union[np.ndarray, None]]:
if self.mp.learn_tau:
self.env.env.release_step = action[0]/self.env.dt # Tau value
self.env.env.release_step = action[0] / self.env.dt # Tau value
return action, None
else:
return action, None
@ -52,12 +47,3 @@ class NewMPWrapper(EpisodicWrapper):
xyz[-1] = 0.840
self.env.env.model.body_pos[self.env.env.cup_table_id] = xyz
return self.get_observation_from_step(self.env.env._get_obs())
# def set_action_space(self):
# if self.mp.learn_tau:
# min_action_bounds, max_action_bounds = self.mp.get_param_bounds()
# min_action_bounds = np.concatenate((min_action_bounds.numpy(), [self.env.action_space.low[-1]]))
# max_action_bounds = np.concatenate((max_action_bounds.numpy(), [self.env.action_space.high[-1]]))
# self.action_space = gym.spaces.Box(low=min_action_bounds, high=max_action_bounds, dtype=np.float32)
# return self.action_space
# else:
# return super(NewMPWrapper, self).set_action_space()

View File

@ -14,20 +14,23 @@ class ALRHopperJumpEnv(HopperEnv):
- exclude current positions from observations is set to False
"""
def __init__(self,
xml_file='hopper_jump.xml',
forward_reward_weight=1.0,
ctrl_cost_weight=1e-3,
healthy_reward=0.0,
penalty=0.0,
context=True,
terminate_when_unhealthy=False,
healthy_state_range=(-100.0, 100.0),
healthy_z_range=(0.5, float('inf')),
healthy_angle_range=(-float('inf'), float('inf')),
reset_noise_scale=5e-3,
exclude_current_positions_from_observation=False,
max_episode_steps=250):
def __init__(
self,
xml_file='hopper_jump.xml',
forward_reward_weight=1.0,
ctrl_cost_weight=1e-3,
healthy_reward=0.0,
penalty=0.0,
context=True,
terminate_when_unhealthy=False,
healthy_state_range=(-100.0, 100.0),
healthy_z_range=(0.5, float('inf')),
healthy_angle_range=(-float('inf'), float('inf')),
reset_noise_scale=5e-3,
exclude_current_positions_from_observation=False,
max_episode_steps=250
):
self.current_step = 0
self.max_height = 0
self.max_episode_steps = max_episode_steps
@ -47,7 +50,7 @@ class ALRHopperJumpEnv(HopperEnv):
exclude_current_positions_from_observation)
def step(self, action):
self.current_step += 1
self.do_simulation(action, self.frame_skip)
height_after = self.get_body_com("torso")[2]
@ -59,22 +62,14 @@ class ALRHopperJumpEnv(HopperEnv):
done = False
rewards = 0
if self.current_step >= self.max_episode_steps:
hight_goal_distance = -10*np.linalg.norm(self.max_height - self.goal) if self.context else self.max_height
healthy_reward = 0 if self.context else self.healthy_reward * 2 # self.current_step
height_reward = self._forward_reward_weight * hight_goal_distance # maybe move reward calculation into if structure and define two different _forward_reward_weight variables for context and episodic separately
hight_goal_distance = -10 * np.linalg.norm(self.max_height - self.goal) if self.context else self.max_height
healthy_reward = 0 if self.context else self.healthy_reward * 2 # self.current_step
height_reward = self._forward_reward_weight * hight_goal_distance  # maybe move reward calculation into if structure and define two different _forward_reward_weight variables for context and episodic separately
rewards = height_reward + healthy_reward
# else:
# # penalty for wrong start direction of first two joints; not needed, could be removed
# rewards = ((action[:2] > 0) * self.penalty).sum() if self.current_step < 10 else 0
observation = self._get_obs()
reward = rewards - costs
# info = {
# 'height' : height_after,
# 'max_height': self.max_height,
# 'goal' : self.goal
# }
info = {
'height': height_after,
'x_pos': site_pos_after,
@ -82,7 +77,7 @@ class ALRHopperJumpEnv(HopperEnv):
'height_rew': self.max_height,
'healthy_reward': self.healthy_reward * 2,
'healthy': self.is_healthy
}
}
return observation, reward, done, info
@ -90,7 +85,7 @@ class ALRHopperJumpEnv(HopperEnv):
return np.append(super()._get_obs(), self.goal)
def reset(self):
self.goal = self.np_random.uniform(1.4, 2.16, 1)[0] # 1.3 2.3
self.goal = self.np_random.uniform(1.4, 2.16, 1)[0] # 1.3 2.3
self.max_height = 0
self.current_step = 0
return super().reset()
@ -98,8 +93,8 @@ class ALRHopperJumpEnv(HopperEnv):
# overwrite reset_model to make it deterministic
def reset_model(self):
qpos = self.init_qpos # + self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nq)
qvel = self.init_qvel # + self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nv)
qpos = self.init_qpos # + self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nq)
qvel = self.init_qvel # + self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nv)
self.set_state(qpos, qvel)
@ -147,7 +142,7 @@ class ALRHopperXYJumpEnv(ALRHopperJumpEnv):
self.contact_with_floor = floor_contact
if self.contact_dist is None and self.contact_with_floor:
self.contact_dist = np.linalg.norm(self.sim.data.site_xpos[self.model.site_name2id('foot_site')]
self.contact_dist = np.linalg.norm(self.sim.data.site_xpos[self.model.site_name2id('foot_site')]
- np.array([self.goal, 0, 0]))
ctrl_cost = self.control_cost(action)
@ -157,9 +152,9 @@ class ALRHopperXYJumpEnv(ALRHopperJumpEnv):
rewards = 0
if self.current_step >= self.max_episode_steps:
# healthy_reward = 0 if self.context else self.healthy_reward * self.current_step
healthy_reward = self.healthy_reward * 2 #* self.current_step
healthy_reward = self.healthy_reward * 2 # * self.current_step
contact_dist = self.contact_dist if self.contact_dist is not None else 5
dist_reward = self._forward_reward_weight * (-3*goal_dist + 10*self.max_height - 2*contact_dist)
dist_reward = self._forward_reward_weight * (-3 * goal_dist + 10 * self.max_height - 2 * contact_dist)
rewards = dist_reward + healthy_reward
observation = self._get_obs()
@ -174,7 +169,7 @@ class ALRHopperXYJumpEnv(ALRHopperJumpEnv):
'healthy_reward': self.healthy_reward * 2,
'healthy': self.is_healthy,
'contact_dist': self.contact_dist if self.contact_dist is not None else 0
}
}
return observation, reward, done, info
def reset_model(self):
@ -225,25 +220,28 @@ class ALRHopperXYJumpEnv(ALRHopperJumpEnv):
self.sim.model.body_pos[self.sim.model.body_name2id('goal_site_body')] = np.array([self.goal, 0, 0])
return self._get_obs()
class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv):
def __init__(self,
xml_file='hopper_jump.xml',
forward_reward_weight=1.0,
ctrl_cost_weight=1e-3,
healthy_reward=0.0,
penalty=0.0,
context=True,
terminate_when_unhealthy=False,
healthy_state_range=(-100.0, 100.0),
healthy_z_range=(0.5, float('inf')),
healthy_angle_range=(-float('inf'), float('inf')),
reset_noise_scale=5e-3,
exclude_current_positions_from_observation=False,
max_episode_steps=250,
height_scale = 10,
dist_scale = 3,
healthy_scale = 2):
def __init__(
self,
xml_file='hopper_jump.xml',
forward_reward_weight=1.0,
ctrl_cost_weight=1e-3,
healthy_reward=0.0,
penalty=0.0,
context=True,
terminate_when_unhealthy=False,
healthy_state_range=(-100.0, 100.0),
healthy_z_range=(0.5, float('inf')),
healthy_angle_range=(-float('inf'), float('inf')),
reset_noise_scale=5e-3,
exclude_current_positions_from_observation=False,
max_episode_steps=250,
height_scale=10,
dist_scale=3,
healthy_scale=2
):
self.height_scale = height_scale
self.dist_scale = dist_scale
self.healthy_scale = healthy_scale
@ -263,15 +261,14 @@ class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv):
ctrl_cost = self.control_cost(action)
healthy_reward = self.healthy_reward * self.healthy_scale
height_reward = self.height_scale*height_after
height_reward = self.height_scale * height_after
goal_dist = np.atleast_1d(np.linalg.norm(site_pos_after - np.array([self.goal, 0, 0], dtype=object)))[0]
goal_dist_reward = -self.dist_scale*goal_dist
goal_dist_reward = -self.dist_scale * goal_dist
dist_reward = self._forward_reward_weight * (goal_dist_reward + height_reward)
reward = -ctrl_cost + healthy_reward + dist_reward
done = False
observation = self._get_obs()
###########################################################
# This is only for logging the distance to goal when first having the contact
##########################################################
@ -297,9 +294,10 @@ class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv):
'healthy_reward': self.healthy_reward * self.healthy_reward,
'healthy': self.is_healthy,
'contact_dist': self.contact_dist if self.contact_dist is not None else 0
}
}
return observation, reward, done, info
class ALRHopperJumpRndmPosEnv(ALRHopperJumpEnv):
def __init__(self, max_episode_steps=250):
super(ALRHopperJumpRndmPosEnv, self).__init__(exclude_current_positions_from_observation=False,
@ -309,14 +307,14 @@ class ALRHopperJumpRndmPosEnv(ALRHopperJumpEnv):
def reset_model(self):
self._floor_geom_id = self.model.geom_name2id('floor')
self._foot_geom_id = self.model.geom_name2id('foot_geom')
noise_low = -np.ones(self.model.nq)*self._reset_noise_scale
noise_low = -np.ones(self.model.nq) * self._reset_noise_scale
noise_low[1] = 0
noise_low[2] = 0
noise_low[3] = -0.2
noise_low[4] = -0.2
noise_low[5] = -0.1
noise_high = np.ones(self.model.nq)*self._reset_noise_scale
noise_high = np.ones(self.model.nq) * self._reset_noise_scale
noise_high[1] = 0
noise_high[2] = 0
noise_high[3] = 0
@ -325,7 +323,7 @@ class ALRHopperJumpRndmPosEnv(ALRHopperJumpEnv):
rnd_vec = self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nq)
# rnd_vec[2] *= 0.05 # the angle around the y axis shouldn't be too high as the agent then falls down quickly and
# can not recover
# can not recover
# rnd_vec[1] = np.clip(rnd_vec[1], 0, 0.3)
qpos = self.init_qpos + rnd_vec
qvel = self.init_qvel
@ -341,7 +339,7 @@ class ALRHopperJumpRndmPosEnv(ALRHopperJumpEnv):
self.do_simulation(action, self.frame_skip)
self.contact_with_floor = self._contact_checker(self._floor_geom_id, self._foot_geom_id) if not \
self.contact_with_floor else True
self.contact_with_floor else True
height_after = self.get_body_com("torso")[2]
self.max_height = max(height_after, self.max_height) if self.contact_with_floor else 0
@ -365,12 +363,11 @@ class ALRHopperJumpRndmPosEnv(ALRHopperJumpEnv):
'height': height_after,
'max_height': self.max_height,
'goal': self.goal
}
}
return observation, reward, done, info
if __name__ == '__main__':
render_mode = "human" # "human" or "partial" or "final"
# env = ALRHopperJumpEnv()
@ -396,4 +393,4 @@ if __name__ == '__main__':
print('After ', i, ' steps, done: ', d)
env.reset()
env.close()
env.close()

View File

@ -9,7 +9,7 @@ from mp_pytorch.mp.mp_interfaces import MPInterface
from alr_envs.mp.controllers.base_controller import BaseController
class EpisodicWrapper(gym.Env, ABC):
class EpisodicWrapper(gym.Env, gym.wrappers.TransformReward, ABC):
"""
Base class for movement primitive based gym.Wrapper implementations.
@ -22,14 +22,19 @@ class EpisodicWrapper(gym.Env, ABC):
weight_scale: Scaling parameter for the actions given to this wrapper
render_mode: Equivalent to gym render mode
"""
def __init__(self,
env: gym.Env,
mp: MPInterface,
controller: BaseController,
duration: float,
render_mode: str = None,
verbose: int = 1,
weight_scale: float = 1):
def __init__(
self,
env: gym.Env,
mp: MPInterface,
controller: BaseController,
duration: float,
render_mode: str = None,
verbose: int = 1,
weight_scale: float = 1,
sequencing=True,
reward_aggregation=np.mean,
):
super().__init__()
self.env = env
@ -40,11 +45,11 @@ class EpisodicWrapper(gym.Env, ABC):
self.duration = duration
self.traj_steps = int(duration / self.dt)
self.post_traj_steps = self.env.spec.max_episode_steps - self.traj_steps
# duration = self.env.max_episode_steps * self.dt
self.controller = controller
self.mp = mp
self.env = env
self.verbose = verbose
self.controller = controller
self.weight_scale = weight_scale
# rendering
@ -52,15 +57,18 @@ class EpisodicWrapper(gym.Env, ABC):
self.render_kwargs = {}
self.time_steps = np.linspace(0, self.duration, self.traj_steps)
self.mp.set_mp_times(self.time_steps)
# self.mp.set_mp_duration(self.time_steps, dt)
# action_bounds = np.inf * np.ones((np.prod(self.mp.num_params)))
self.mp_action_space = self.set_mp_action_space()
self.mp_action_space = self.get_mp_action_space()
self.action_space = self.set_action_space()
self.action_space = self.get_action_space()
self.active_obs = self.set_active_obs()
self.observation_space = spaces.Box(low=self.env.observation_space.low[self.active_obs],
high=self.env.observation_space.high[self.active_obs],
dtype=self.env.observation_space.dtype)
self.verbose = verbose
def get_trajectory(self, action: np.ndarray) -> Tuple:
# TODO: this follows the implementation of the mp_pytorch library which includes the parameters tau and delay at
# the beginning of the array.
@ -69,33 +77,37 @@ class EpisodicWrapper(gym.Env, ABC):
scaled_mp_params[ignore_indices:] *= self.weight_scale
self.mp.set_params(np.clip(scaled_mp_params, self.mp_action_space.low, self.mp_action_space.high))
self.mp.set_boundary_conditions(bc_time=self.time_steps[:1], bc_pos=self.current_pos, bc_vel=self.current_vel)
traj_dict = self.mp.get_mp_trajs(get_pos = True, get_vel = True)
traj_dict = self.mp.get_mp_trajs(get_pos=True, get_vel=True)
trajectory_tensor, velocity_tensor = traj_dict['pos'], traj_dict['vel']
trajectory = trajectory_tensor.numpy()
velocity = velocity_tensor.numpy()
# TODO: Do we need this or does mp_pytorch have this?
if self.post_traj_steps > 0:
trajectory = np.vstack([trajectory, np.tile(trajectory[-1, :], [self.post_traj_steps, 1])])
velocity = np.vstack([velocity, np.zeros(shape=(self.post_traj_steps, self.mp.num_dof))])
return trajectory, velocity
def set_mp_action_space(self):
def get_mp_action_space(self):
"""This function can be used to set up an individual space for the parameters of the mp."""
min_action_bounds, max_action_bounds = self.mp.get_param_bounds()
mp_action_space = gym.spaces.Box(low=min_action_bounds.numpy(), high=max_action_bounds.numpy(),
dtype=np.float32)
dtype=np.float32)
return mp_action_space
def set_action_space(self):
def get_action_space(self):
"""
This function can be used to modify the action space for considering actions which are not learned via motion
primitives. E.g. ball releasing time for the beer pong task. By default, it is the parameter space of the
motion primitive.
Only needs to be overwritten if the action space needs to be modified.
"""
return self.mp_action_space
try:
return self.mp_action_space
except AttributeError:
return self.get_mp_action_space()
def _episode_callback(self, action: np.ndarray) -> Tuple[np.ndarray, Union[np.ndarray, None]]:
"""
@ -111,7 +123,8 @@ class EpisodicWrapper(gym.Env, ABC):
"""
return action, None
def _step_callback(self, t: int, env_spec_params: Union[np.ndarray, None], step_action: np.ndarray) -> Union[np.ndarray]:
def _step_callback(self, t: int, env_spec_params: Union[np.ndarray, None], step_action: np.ndarray) -> Union[
np.ndarray]:
"""
This function can be used to modify the step_action with additional parameters e.g. releasing the ball in the
Beerpong env. The parameters used should not be part of the motion primitive parameters.
@ -169,11 +182,15 @@ class EpisodicWrapper(gym.Env, ABC):
mp_params, env_spec_params = self._episode_callback(action)
trajectory, velocity = self.get_trajectory(mp_params)
# TODO
# self.time_steps = np.linspace(0, learned_duration, self.traj_steps)
# self.mp.set_mp_times(self.time_steps)
trajectory_length = len(trajectory)
if self.verbose >=2 :
if self.verbose >= 2:
actions = np.zeros(shape=(trajectory_length,) + self.env.action_space.shape)
observations = np.zeros(shape=(trajectory_length,) + self.env.observation_space.shape,
dtype=self.env.observation_space.dtype)
dtype=self.env.observation_space.dtype)
rewards = np.zeros(shape=(trajectory_length,))
trajectory_return = 0
@ -181,7 +198,7 @@ class EpisodicWrapper(gym.Env, ABC):
for t, pos_vel in enumerate(zip(trajectory, velocity)):
step_action = self.controller.get_action(pos_vel[0], pos_vel[1], self.current_pos, self.current_vel)
step_action = self._step_callback(t, env_spec_params, step_action) # include possible callback info
step_action = self._step_callback(t, env_spec_params, step_action) # include possible callback info
c_action = np.clip(step_action, self.env.action_space.low, self.env.action_space.high)
# print('step/clipped action ratio: ', step_action/c_action)
obs, c_reward, done, info = self.env.step(c_action)
@ -197,7 +214,7 @@ class EpisodicWrapper(gym.Env, ABC):
# infos['step_infos'].append(info)
if self.render_mode:
self.render(mode=self.render_mode, **self.render_kwargs)
if done:
if done or do_replanning(kwargs):
break
infos.update({k: v[:t + 1] for k, v in infos.items()})
if self.verbose >= 2:
@ -235,10 +252,10 @@ class EpisodicWrapper(gym.Env, ABC):
for i in range(des_trajs.shape[1]):
plt.figure(pos_fig.number)
plt.subplot(des_trajs.shape[1], 1, i + 1)
plt.plot(np.ones(des_trajs.shape[0])*self.current_pos[i])
plt.plot(np.ones(des_trajs.shape[0]) * self.current_pos[i])
plt.plot(des_trajs[:, i])
plt.figure(vel_fig.number)
plt.subplot(des_vels.shape[1], 1, i + 1)
plt.plot(np.ones(des_trajs.shape[0])*self.current_vel[i])
plt.plot(np.ones(des_trajs.shape[0]) * self.current_vel[i])
plt.plot(des_vels[:, i])

View File

@ -20,12 +20,13 @@ def make_dmc(
environment_kwargs: dict = {},
time_limit: Union[None, float] = None,
channels_first: bool = True
):
):
# Adopted from: https://github.com/denisyarats/dmc2gym/blob/master/dmc2gym/__init__.py
# License: MIT
# Copyright (c) 2020 Denis Yarats
assert re.match(r"\w+-\w+", id), "env_id does not have the following structure: 'domain_name-task_name'"
if not re.match(r"\w+-\w+", id):
raise ValueError("env_id does not have the following structure: 'domain_name-task_name'")
domain_name, task_name = id.split("-")
env_id = f'dmc_{domain_name}_{task_name}_{seed}-v1'
@ -60,7 +61,7 @@ def make_dmc(
camera_id=camera_id,
frame_skip=frame_skip,
channels_first=channels_first,
),
),
max_episode_steps=max_episode_steps,
)
)
return gym.make(env_id)

View File

@ -55,7 +55,7 @@ def make(env_id: str, seed, **kwargs):
Returns: Gym environment
"""
if any([det_pmp in env_id for det_pmp in ["DetPMP", "detpmp"]]):
if any(deprec in env_id for deprec in ["DetPMP", "detpmp"]):
warnings.warn("DetPMP is deprecated and converted to ProMP")
env_id = env_id.replace("DetPMP", "ProMP")
env_id = env_id.replace("detpmp", "promp")
@ -92,14 +92,17 @@ def make(env_id: str, seed, **kwargs):
from alr_envs import make_dmc
env = make_dmc(env_id, seed=seed, **kwargs)
assert env.base_step_limit == env.spec.max_episode_steps, \
f"The specified 'episode_length' of {env.spec.max_episode_steps} steps for gym is different from " \
f"the DMC environment specification of {env.base_step_limit} steps."
if not env.base_step_limit == env.spec.max_episode_steps:
raise ValueError(f"The specified 'episode_length' of {env.spec.max_episode_steps} steps for gym "
f"is different from the DMC environment specification of {env.base_step_limit} steps.")
return env
def _make_wrapped_env(env_id: str, wrappers: Iterable[Type[gym.Wrapper]], mp: MPInterface, controller: BaseController,
ep_wrapper_kwargs: Mapping, seed=1, **kwargs):
def _make_wrapped_env(
env_id: str, wrappers: Iterable[Type[gym.Wrapper]], mp: MPInterface, controller: BaseController,
ep_wrapper_kwargs: Mapping, seed=1, **kwargs
):
"""
Helper function for creating a wrapped gym environment using MPs.
It adds all provided wrappers to the specified environment and verifies at least one MPEnvWrapper is
@ -120,13 +123,14 @@ def _make_wrapped_env(env_id: str, wrappers: Iterable[Type[gym.Wrapper]], mp: MP
# only wrap the environment if not EpisodicWrapper, e.g. for vision
if not issubclass(w, EpisodicWrapper):
_env = w(_env)
else: # if EpisodicWrapper, use specific constructor
else: # if EpisodicWrapper, use specific constructor
has_episodic_wrapper = True
_env = w(env=_env, mp=mp, controller=controller, **ep_wrapper_kwargs)
assert has_episodic_wrapper, \
"At least one MPEnvWrapper is required in order to leverage motion primitive environments."
if not has_episodic_wrapper:
raise ValueError("An EpisodicWrapper is required in order to leverage movement primitive environments.")
return _env
def make_mp_from_kwargs(
env_id: str, wrappers: Iterable, ep_wrapper_kwargs: MutableMapping, mp_kwargs: MutableMapping,
controller_kwargs: MutableMapping, phase_kwargs: MutableMapping, basis_kwargs: MutableMapping, seed=1,
@ -152,7 +156,7 @@ def make_mp_from_kwargs(
_verify_time_limit(mp_kwargs.get("duration", None), kwargs.get("time_limit", None))
dummy_env = make(env_id, seed)
if ep_wrapper_kwargs.get('duration', None) is None:
ep_wrapper_kwargs['duration'] = dummy_env.spec.max_episode_steps*dummy_env.dt
ep_wrapper_kwargs['duration'] = dummy_env.spec.max_episode_steps * dummy_env.dt
if phase_kwargs.get('tau', None) is None:
phase_kwargs['tau'] = ep_wrapper_kwargs['duration']
mp_kwargs['action_dim'] = mp_kwargs.get('action_dim', np.prod(dummy_env.action_space.shape).item())
@ -207,10 +211,11 @@ def make_mp_env_helper(**kwargs):
phase_kwargs = kwargs.pop("phase_generator_kwargs")
basis_kwargs = kwargs.pop("basis_generator_kwargs")
return make_mp_from_kwargs(env_id=kwargs.pop("name"), wrappers=wrappers, ep_wrapper_kwargs=ep_wrapper_kwargs,
return make_mp_from_kwargs(env_id=kwargs.pop("name"), wrappers=wrappers, ep_wrapper_kwargs=ep_wrapper_kwargs,
mp_kwargs=mp_kwargs, controller_kwargs=contr_kwargs, phase_kwargs=phase_kwargs,
basis_kwargs=basis_kwargs, **kwargs, seed=seed)
def make_dmp_env(env_id: str, wrappers: Iterable, seed=1, mp_kwargs={}, **kwargs):
"""
This can also be used standalone for manually building a custom DMP environment.