first clean up and some non working ideas sketched

This commit is contained in:
Fabian Otto 2022-06-28 16:05:09 +02:00
parent 7bd9848c31
commit 9ad6fbe712
10 changed files with 225 additions and 260 deletions

View File

@ -1,26 +1,32 @@
## ALR Robotics Control Environments ## ALR Robotics Control Environments
This project offers a large variety of reinforcement learning environments under the unifying interface of [OpenAI gym](https://gym.openai.com/). This project offers a large variety of reinforcement learning environments under the unifying interface of [OpenAI gym](https://gym.openai.com/).
Besides, we also provide support (under the OpenAI interface) for the benchmark suites We provide support (under the OpenAI interface) for the benchmark suites
[DeepMind Control](https://deepmind.com/research/publications/2020/dm-control-Software-and-Tasks-for-Continuous-Control) [DeepMind Control](https://deepmind.com/research/publications/2020/dm-control-Software-and-Tasks-for-Continuous-Control)
(DMC) and [Metaworld](https://meta-world.github.io/). Custom (Mujoco) gym environments can be created according (DMC) and [Metaworld](https://meta-world.github.io/).
to [this guide](https://github.com/openai/gym/blob/master/docs/creating-environments.md). Unlike existing libraries, we Custom (Mujoco) gym environments can be created according
additionally support to control agents with Dynamic Movement Primitives (DMPs) and Probabilistic Movement Primitives (ProMP, to [this guide](https://www.gymlibrary.ml/content/environment_creation/).
we only consider the mean usually). Unlike existing libraries, we additionally support to control agents with movement primitives, such as
Dynamic Movement Primitives (DMPs) and Probabilistic Movement Primitives (ProMP, we only consider the mean usually).
## Motion Primitive Environments (Episodic environments) ## Movement Primitive Environments (Episode-Based/Black-Box Environments)
Unlike step-based environments, motion primitive (MP) environments are closer related to stochastic search, black box Unlike step-based environments, movement primitive (MP) environments are closer related to stochastic search, black-box
optimization, and methods that are often used in robotics. MP environments are trajectory-based and always execute a full optimization, and methods that are often used in traditional robotics and control.
trajectory, which is generated by a Dynamic Motion Primitive (DMP) or a Probabilistic Motion Primitive (ProMP). The MP environments are episode-based and always execute a full trajectory, which is generated by a trajectory generator,
generated trajectory is translated into individual step-wise actions by a controller. The exact choice of controller is, such as a Dynamic Movement Primitive (DMP) or a Probabilistic Movement Primitive (ProMP).
however, dependent on the type of environment. We currently support position, velocity, and PD-Controllers for position, The generated trajectory is translated into individual step-wise actions by a trajectory tracking controller.
velocity, and torque control, respectively. The goal of all MP environments is still to learn a policy. Yet, an action The exact choice of controller is, however, dependent on the type of environment.
We currently support position, velocity, and PD-Controllers for position, velocity, and torque control, respectively
as well as a special controller for the MetaWorld control suite.
The goal of all MP environments is still to learn a optimal policy. Yet, an action
represents the parametrization of the motion primitives to generate a suitable trajectory. Additionally, in this represents the parametrization of the motion primitives to generate a suitable trajectory. Additionally, in this
framework we support all of this also for the contextual setting, for which we expose all changing substates of the framework we support all of this also for the contextual setting, i.e. we expose a subset of the observation space
task as a single observation in the beginning. This requires to predict a new action/MP parametrization for each as a single context in the beginning of the episode. This requires to predict a new action/MP parametrization for each
trajectory. All environments provide next to the cumulative episode reward all collected information from each context.
step as part of the info dictionary. This information should, however, mainly be used for debugging and logging. All environments provide next to the cumulative episode reward all collected information from each
step as part of the info dictionary. This information is, however, mainly meant for debugging as well as logging
and not for training.
|Key| Description| |Key| Description|
|---|---| |---|---|

View File

@ -4,10 +4,11 @@ import numpy as np
from gym.envs.mujoco import MujocoEnv from gym.envs.mujoco import MujocoEnv
class ALRBallInACupEnv(MujocoEnv, utils.EzPickle): class ALRBallInACupEnv(MujocoEnv, utils.EzPickle):
def __init__(self, n_substeps=4, apply_gravity_comp=True, simplified: bool = False, def __init__(
reward_type: str = None, context: np.ndarray = None): self, n_substeps=4, apply_gravity_comp=True, simplified: bool = False,
reward_type: str = None, context: np.ndarray = None
):
utils.EzPickle.__init__(**locals()) utils.EzPickle.__init__(**locals())
self._steps = 0 self._steps = 0
@ -23,9 +24,7 @@ class ALRBallInACupEnv(MujocoEnv, utils.EzPickle):
self.context = context self.context = context
alr_mujoco_env.AlrMujocoEnv.__init__(self, alr_mujoco_env.AlrMujocoEnv.__init__(self, self.xml_path, apply_gravity_comp=apply_gravity_comp,
self.xml_path,
apply_gravity_comp=apply_gravity_comp,
n_substeps=n_substeps) n_substeps=n_substeps)
self._start_pos = np.array([0.0, 0.58760536, 0.0, 1.36004913, 0.0, -0.32072943, -1.57]) self._start_pos = np.array([0.0, 0.58760536, 0.0, 1.36004913, 0.0, -0.32072943, -1.57])
self._start_vel = np.zeros(7) self._start_vel = np.zeros(7)
@ -129,7 +128,7 @@ class ALRBallInACupEnv(MujocoEnv, utils.EzPickle):
np.sin(theta), np.sin(theta),
# self.get_body_com("target"), # only return target to make problem harder # self.get_body_com("target"), # only return target to make problem harder
[self._steps], [self._steps],
]) ])
# TODO # TODO
@property @property
@ -139,7 +138,7 @@ class ALRBallInACupEnv(MujocoEnv, utils.EzPickle):
[False] * 7, # sin [False] * 7, # sin
# [True] * 2, # x-y coordinates of target distance # [True] * 2, # x-y coordinates of target distance
[False] # env steps [False] # env steps
]) ])
# These functions are for the task with 3 joint actuations # These functions are for the task with 3 joint actuations
def extend_des_pos(self, des_pos): def extend_des_pos(self, des_pos):

View File

@ -67,7 +67,7 @@ class BallInACupReward(alr_reward_fct.AlrReward):
action_cost = np.sum(np.square(action)) action_cost = np.sum(np.square(action))
self.action_costs.append(action_cost) self.action_costs.append(action_cost)
self._is_collided = self.check_collision(self.env.sim) or self.env.check_traj_in_joint_limits() self._is_collided = self.check_collision(self.env.sim) or self.env._check_traj_in_joint_limits()
if self.env._steps == self.env.ep_length - 1 or self._is_collided: if self.env._steps == self.env.ep_length - 1 or self._is_collided:
t_min_dist = np.argmin(self.dists) t_min_dist = np.argmin(self.dists)

View File

@ -1,11 +1,11 @@
import mujoco_py.builder
import os import os
import mujoco_py.builder
import numpy as np import numpy as np
from gym import utils, spaces from gym import utils
from gym.envs.mujoco import MujocoEnv from gym.envs.mujoco import MujocoEnv
from alr_envs.alr.mujoco.beerpong.beerpong_reward_staged import BeerPongReward
from alr_envs.alr.mujoco.beerpong.beerpong_reward_staged import BeerPongReward
CUP_POS_MIN = np.array([-1.42, -4.05]) CUP_POS_MIN = np.array([-1.42, -4.05])
CUP_POS_MAX = np.array([1.42, -1.25]) CUP_POS_MAX = np.array([1.42, -1.25])
@ -18,12 +18,14 @@ CUP_POS_MAX = np.array([1.42, -1.25])
# CUP_POS_MIN = np.array([-0.16, -2.2]) # CUP_POS_MIN = np.array([-0.16, -2.2])
# CUP_POS_MAX = np.array([0.16, -1.7]) # CUP_POS_MAX = np.array([0.16, -1.7])
class ALRBeerBongEnv(MujocoEnv, utils.EzPickle): class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, def __init__(
rndm_goal=False, cup_goal_pos=None): self, frame_skip=1, apply_gravity_comp=True, noisy=False,
rndm_goal=False, cup_goal_pos=None
):
cup_goal_pos = np.array(cup_goal_pos if cup_goal_pos is not None else [-0.3, -1.2, 0.840]) cup_goal_pos = np.array(cup_goal_pos if cup_goal_pos is not None else [-0.3, -1.2, 0.840])
if cup_goal_pos.shape[0]==2: if cup_goal_pos.shape[0] == 2:
cup_goal_pos = np.insert(cup_goal_pos, 2, 0.840) cup_goal_pos = np.insert(cup_goal_pos, 2, 0.840)
self.cup_goal_pos = np.array(cup_goal_pos) self.cup_goal_pos = np.array(cup_goal_pos)
@ -50,7 +52,7 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
# self._release_step = 130 # time step of ball release # self._release_step = 130 # time step of ball release
self.release_step = 100 # time step of ball release self.release_step = 100 # time step of ball release
self.ep_length = 600//frame_skip self.ep_length = 600 // frame_skip
self.cup_table_id = 10 self.cup_table_id = 10
if noisy: if noisy:
@ -71,14 +73,6 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
def start_vel(self): def start_vel(self):
return self._start_vel return self._start_vel
@property
def current_pos(self):
return self.sim.data.qpos[0:7].copy()
@property
def current_vel(self):
return self.sim.data.qvel[0:7].copy()
def reset(self): def reset(self):
self.reward_function.reset(self.add_noise) self.reward_function.reset(self.add_noise)
return super().reset() return super().reset()
@ -122,7 +116,7 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
self.sim.data.qpos[7::] = self.sim.data.site_xpos[self.ball_site_id, :].copy() self.sim.data.qpos[7::] = self.sim.data.site_xpos[self.ball_site_id, :].copy()
self.sim.data.qvel[7::] = self.sim.data.site_xvelp[self.ball_site_id, :].copy() self.sim.data.qvel[7::] = self.sim.data.site_xvelp[self.ball_site_id, :].copy()
elif self._steps == self.release_step and self.add_noise: elif self._steps == self.release_step and self.add_noise:
self.sim.data.qvel[7::] += self.noise_std * np.random.randn(3) self.sim.data.qvel[7::] += self.noise_std * np.random.randn(3)
crash = False crash = False
except mujoco_py.builder.MujocoException: except mujoco_py.builder.MujocoException:
crash = True crash = True
@ -147,29 +141,32 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
ball_pos = np.zeros(3) ball_pos = np.zeros(3)
ball_vel = np.zeros(3) ball_vel = np.zeros(3)
infos = dict(reward_dist=reward_dist, infos = dict(
reward=reward, reward_dist=reward_dist,
velocity=angular_vel, reward=reward,
# traj=self._q_pos, velocity=angular_vel,
action=a, # traj=self._q_pos,
q_pos=self.sim.data.qpos[0:7].ravel().copy(), action=a,
q_vel=self.sim.data.qvel[0:7].ravel().copy(), q_pos=self.sim.data.qpos[0:7].ravel().copy(),
ball_pos=ball_pos, q_vel=self.sim.data.qvel[0:7].ravel().copy(),
ball_vel=ball_vel, ball_pos=ball_pos,
success=success, ball_vel=ball_vel,
is_collided=is_collided, sim_crash=crash, success=success,
table_contact_first=int(not self.reward_function.ball_ground_contact_first)) is_collided=is_collided, sim_crash=crash,
table_contact_first=int(not self.reward_function.ball_ground_contact_first)
)
infos.update(reward_infos) infos.update(reward_infos)
return ob, reward, done, infos return ob, reward, done, infos
def check_traj_in_joint_limits(self): def _check_traj_in_joint_limits(self):
return any(self.current_pos > self.j_max) or any(self.current_pos < self.j_min) return any(self.current_pos > self.j_max) or any(self.current_pos < self.j_min)
def _get_obs(self): def _get_obs(self):
theta = self.sim.data.qpos.flat[:7] theta = self.sim.data.qpos.flat[:7]
theta_dot = self.sim.data.qvel.flat[:7] theta_dot = self.sim.data.qvel.flat[:7]
ball_pos = self.sim.data.body_xpos[self.sim.model._body_name2id["ball"]].copy() ball_pos = self.sim.data.body_xpos[self.sim.model._body_name2id["ball"]].copy()
cup_goal_diff_final = ball_pos - self.sim.data.site_xpos[self.sim.model._site_name2id["cup_goal_final_table"]].copy() cup_goal_diff_final = ball_pos - self.sim.data.site_xpos[
self.sim.model._site_name2id["cup_goal_final_table"]].copy()
cup_goal_diff_top = ball_pos - self.sim.data.site_xpos[self.sim.model._site_name2id["cup_goal_table"]].copy() cup_goal_diff_top = ball_pos - self.sim.data.site_xpos[self.sim.model._site_name2id["cup_goal_table"]].copy()
return np.concatenate([ return np.concatenate([
np.cos(theta), np.cos(theta),
@ -179,17 +176,21 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
cup_goal_diff_top, cup_goal_diff_top,
self.sim.model.body_pos[self.cup_table_id][:2].copy(), self.sim.model.body_pos[self.cup_table_id][:2].copy(),
[self._steps], [self._steps],
]) ])
def compute_reward(self):
@property @property
def dt(self): def dt(self):
return super(ALRBeerBongEnv, self).dt*self.repeat_action return super(ALRBeerBongEnv, self).dt * self.repeat_action
class ALRBeerBongEnvFixedReleaseStep(ALRBeerBongEnv): class ALRBeerBongEnvFixedReleaseStep(ALRBeerBongEnv):
def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None): def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None):
super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos) super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos)
self.release_step = 62 # empirically evaluated for frame_skip=2! self.release_step = 62 # empirically evaluated for frame_skip=2!
class ALRBeerBongEnvStepBasedEpisodicReward(ALRBeerBongEnv): class ALRBeerBongEnvStepBasedEpisodicReward(ALRBeerBongEnv):
def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None): def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None):
super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos) super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos)
@ -202,54 +203,21 @@ class ALRBeerBongEnvStepBasedEpisodicReward(ALRBeerBongEnv):
reward = 0 reward = 0
done = False done = False
while not done: while not done:
sub_ob, sub_reward, done, sub_infos = super(ALRBeerBongEnvStepBasedEpisodicReward, self).step(np.zeros(a.shape)) sub_ob, sub_reward, done, sub_infos = super(ALRBeerBongEnvStepBasedEpisodicReward, self).step(
np.zeros(a.shape))
reward += sub_reward reward += sub_reward
infos = sub_infos infos = sub_infos
ob = sub_ob ob = sub_ob
ob[-1] = self.release_step + 1 # Since we simulate until the end of the episode, PPO does not see the ob[-1] = self.release_step + 1 # Since we simulate until the end of the episode, PPO does not see the
# internal steps and thus, the observation also needs to be set correctly # internal steps and thus, the observation also needs to be set correctly
return ob, reward, done, infos return ob, reward, done, infos
# class ALRBeerBongEnvStepBasedEpisodicReward(ALRBeerBongEnv):
# def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None):
# super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos)
# self.release_step = 62 # empirically evaluated for frame_skip=2!
#
# def step(self, a):
# if self._steps < self.release_step:
# return super(ALRBeerBongEnvStepBasedEpisodicReward, self).step(a)
# else:
# sub_ob, sub_reward, done, sub_infos = super(ALRBeerBongEnvStepBasedEpisodicReward, self).step(np.zeros(a.shape))
# reward = sub_reward
# infos = sub_infos
# ob = sub_ob
# ob[-1] = self.release_step + 1 # Since we simulate until the end of the episode, PPO does not see the
# # internal steps and thus, the observation also needs to be set correctly
# return ob, reward, done, infos
class ALRBeerBongEnvStepBased(ALRBeerBongEnv): class ALRBeerBongEnvStepBased(ALRBeerBongEnv):
def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None): def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None):
super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos) super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos)
self.release_step = 62 # empirically evaluated for frame_skip=2! self.release_step = 62 # empirically evaluated for frame_skip=2!
# def _set_action_space(self):
# bounds = super(ALRBeerBongEnvStepBased, self)._set_action_space()
# min_bound = np.concatenate(([-1], bounds.low), dtype=bounds.dtype)
# max_bound = np.concatenate(([1], bounds.high), dtype=bounds.dtype)
# self.action_space = spaces.Box(low=min_bound, high=max_bound, dtype=bounds.dtype)
# return self.action_space
# def step(self, a):
# self.release_step = self._steps if a[0]>=0 and self.release_step >= self._steps else self.release_step
# return super(ALRBeerBongEnvStepBased, self).step(a[1:])
#
# def reset(self):
# ob = super(ALRBeerBongEnvStepBased, self).reset()
# self.release_step = self.ep_length + 1
# return ob
def step(self, a): def step(self, a):
if self._steps < self.release_step: if self._steps < self.release_step:
return super(ALRBeerBongEnvStepBased, self).step(a) return super(ALRBeerBongEnvStepBased, self).step(a)
@ -267,9 +235,9 @@ class ALRBeerBongEnvStepBased(ALRBeerBongEnv):
cup_goal_dist_top = np.linalg.norm(ball_pos - self.sim.data.site_xpos[ cup_goal_dist_top = np.linalg.norm(ball_pos - self.sim.data.site_xpos[
self.sim.model._site_name2id["cup_goal_table"]].copy()) self.sim.model._site_name2id["cup_goal_table"]].copy())
if sub_infos['success']: if sub_infos['success']:
dist_rew = -cup_goal_dist_final**2 dist_rew = -cup_goal_dist_final ** 2
else: else:
dist_rew = -0.5*cup_goal_dist_final**2 - cup_goal_dist_top**2 dist_rew = -0.5 * cup_goal_dist_final ** 2 - cup_goal_dist_top ** 2
reward = reward - sub_infos['action_cost'] + dist_rew reward = reward - sub_infos['action_cost'] + dist_rew
infos = sub_infos infos = sub_infos
ob = sub_ob ob = sub_ob
@ -278,13 +246,13 @@ class ALRBeerBongEnvStepBased(ALRBeerBongEnv):
return ob, reward, done, infos return ob, reward, done, infos
if __name__ == "__main__": if __name__ == "__main__":
# env = ALRBeerBongEnv(rndm_goal=True) # env = ALRBeerBongEnv(rndm_goal=True)
# env = ALRBeerBongEnvStepBased(frame_skip=2, rndm_goal=True) # env = ALRBeerBongEnvStepBased(frame_skip=2, rndm_goal=True)
# env = ALRBeerBongEnvStepBasedEpisodicReward(frame_skip=2, rndm_goal=True) # env = ALRBeerBongEnvStepBasedEpisodicReward(frame_skip=2, rndm_goal=True)
env = ALRBeerBongEnvFixedReleaseStep(frame_skip=2, rndm_goal=True) env = ALRBeerBongEnvFixedReleaseStep(frame_skip=2, rndm_goal=True)
import time import time
env.reset() env.reset()
env.render("human") env.render("human")
for i in range(1500): for i in range(1500):

View File

@ -29,52 +29,38 @@ class BeerPongReward:
# "cup_base_table" # "cup_base_table"
] ]
self.ball_traj = None
self.dists = None self.dists = None
self.dists_final = None self.dists_final = None
self.costs = None
self.action_costs = None self.action_costs = None
self.angle_rewards = None self.reset()
self.cup_angles = None
self.cup_z_axes = None
self.collision_penalty = 500
self.reset(None)
self.is_initialized = False self.is_initialized = False
def reset(self, noisy): def reset(self):
self.ball_traj = []
self.dists = [] self.dists = []
self.dists_final = [] self.dists_final = []
self.costs = []
self.action_costs = [] self.action_costs = []
self.angle_rewards = []
self.cup_angles = []
self.cup_z_axes = []
self.ball_ground_contact_first = False self.ball_ground_contact_first = False
self.ball_table_contact = False self.ball_table_contact = False
self.ball_wall_contact = False self.ball_wall_contact = False
self.ball_cup_contact = False self.ball_cup_contact = False
self.ball_in_cup = False self.ball_in_cup = False
self.dist_ground_cup = -1 # distance floor to cup if first floor contact self.dist_ground_cup = -1 # distance floor to cup if first floor contact
self.noisy_bp = noisy
self._t_min_final_dist = -1
def initialize(self, env): def initialize(self, env):
if not self.is_initialized: if not self.is_initialized:
self.is_initialized = True self.is_initialized = True
self.ball_id = env.sim.model._body_name2id["ball"] self.ball_id = env.sim.model._body_name2id["ball"]
self.ball_collision_id = env.sim.model._geom_name2id["ball_geom"]
self.goal_id = env.sim.model._site_name2id["cup_goal_table"] self.goal_id = env.sim.model._site_name2id["cup_goal_table"]
self.goal_final_id = env.sim.model._site_name2id["cup_goal_final_table"] self.goal_final_id = env.sim.model._site_name2id["cup_goal_final_table"]
self.cup_collision_ids = [env.sim.model._geom_name2id[name] for name in self.cup_collision_objects]
self.cup_table_id = env.sim.model._body_name2id["cup_table"] self.cup_table_id = env.sim.model._body_name2id["cup_table"]
self.ball_collision_id = env.sim.model._geom_name2id["ball_geom"]
self.table_collision_id = env.sim.model._geom_name2id["table_contact_geom"] self.table_collision_id = env.sim.model._geom_name2id["table_contact_geom"]
self.wall_collision_id = env.sim.model._geom_name2id["wall"] self.wall_collision_id = env.sim.model._geom_name2id["wall"]
self.cup_table_collision_id = env.sim.model._geom_name2id["cup_base_table_contact"] self.cup_table_collision_id = env.sim.model._geom_name2id["cup_base_table_contact"]
self.init_ball_pos_site_id = env.sim.model._site_name2id["init_ball_pos_site"]
self.ground_collision_id = env.sim.model._geom_name2id["ground"] self.ground_collision_id = env.sim.model._geom_name2id["ground"]
self.cup_collision_ids = [env.sim.model._geom_name2id[name] for name in self.cup_collision_objects]
self.robot_collision_ids = [env.sim.model._geom_name2id[name] for name in self.robot_collision_objects] self.robot_collision_ids = [env.sim.model._geom_name2id[name] for name in self.robot_collision_objects]
def compute_reward(self, env, action): def compute_reward(self, env, action):
@ -87,7 +73,7 @@ class BeerPongReward:
self.check_contacts(env.sim) self.check_contacts(env.sim)
self.dists.append(np.linalg.norm(goal_pos - ball_pos)) self.dists.append(np.linalg.norm(goal_pos - ball_pos))
self.dists_final.append(np.linalg.norm(goal_final_pos - ball_pos)) self.dists_final.append(np.linalg.norm(goal_final_pos - ball_pos))
self.dist_ground_cup = np.linalg.norm(ball_pos-goal_pos) \ self.dist_ground_cup = np.linalg.norm(ball_pos - goal_pos) \
if self.ball_ground_contact_first and self.dist_ground_cup == -1 else self.dist_ground_cup if self.ball_ground_contact_first and self.dist_ground_cup == -1 else self.dist_ground_cup
action_cost = np.sum(np.square(action)) action_cost = np.sum(np.square(action))
self.action_costs.append(np.copy(action_cost)) self.action_costs.append(np.copy(action_cost))
@ -99,7 +85,7 @@ class BeerPongReward:
min_dist = np.min(self.dists) min_dist = np.min(self.dists)
final_dist = self.dists_final[-1] final_dist = self.dists_final[-1]
if self.ball_ground_contact_first: if self.ball_ground_contact_first:
min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 1, 0.5, 2, -4 # relative rew offset when first bounding on ground min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 1, 0.5, 2, -4 # relative rew offset when first bounding on ground
# min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 1, 0.5, 0, -6 # absolute rew offset when first bouncing on ground # min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 1, 0.5, 0, -6 # absolute rew offset when first bouncing on ground
else: else:
if not self.ball_in_cup: if not self.ball_in_cup:
@ -108,18 +94,18 @@ class BeerPongReward:
else: else:
min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 1, 0.5, 0, -2 min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 1, 0.5, 0, -2
else: else:
min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 0, 1, 0 ,0 min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 0, 1, 0, 0
# dist_ground_cup = 1 * self.dist_ground_cup # dist_ground_cup = 1 * self.dist_ground_cup
action_cost = 1e-4 * np.mean(action_cost) action_cost = 1e-4 * np.mean(action_cost)
reward = rew_offset - min_dist_coeff * min_dist ** 2 - final_dist_coeff * final_dist ** 2 - \ reward = rew_offset - min_dist_coeff * min_dist ** 2 - final_dist_coeff * final_dist ** 2 - \
action_cost - ground_contact_dist_coeff*self.dist_ground_cup ** 2 action_cost - ground_contact_dist_coeff * self.dist_ground_cup ** 2
# 1e-7*np.mean(action_cost) # 1e-7*np.mean(action_cost)
# release step punishment # release step punishment
min_time_bound = 0.1 min_time_bound = 0.1
max_time_bound = 1.0 max_time_bound = 1.0
release_time = env.release_step*env.dt release_time = env.release_step * env.dt
release_time_rew = int(release_time<min_time_bound)*(-30-10*(release_time-min_time_bound)**2) \ release_time_rew = int(release_time < min_time_bound) * (-30 - 10 * (release_time - min_time_bound) ** 2) \
+int(release_time>max_time_bound)*(-30-10*(release_time-max_time_bound)**2) + int(release_time > max_time_bound) * (-30 - 10 * (release_time - max_time_bound) ** 2)
reward += release_time_rew reward += release_time_rew
success = self.ball_in_cup success = self.ball_in_cup
# print('release time :', release_time) # print('release time :', release_time)
@ -147,17 +133,17 @@ class BeerPongReward:
self.table_collision_id) self.table_collision_id)
if not self.ball_cup_contact: if not self.ball_cup_contact:
self.ball_cup_contact = self._check_collision_with_set_of_objects(sim, self.ball_collision_id, self.ball_cup_contact = self._check_collision_with_set_of_objects(sim, self.ball_collision_id,
self.cup_collision_ids) self.cup_collision_ids)
if not self.ball_wall_contact: if not self.ball_wall_contact:
self.ball_wall_contact = self._check_collision_single_objects(sim, self.ball_collision_id, self.ball_wall_contact = self._check_collision_single_objects(sim, self.ball_collision_id,
self.wall_collision_id) self.wall_collision_id)
if not self.ball_in_cup: if not self.ball_in_cup:
self.ball_in_cup = self._check_collision_single_objects(sim, self.ball_collision_id, self.ball_in_cup = self._check_collision_single_objects(sim, self.ball_collision_id,
self.cup_table_collision_id) self.cup_table_collision_id)
if not self.ball_ground_contact_first: if not self.ball_ground_contact_first:
if not self.ball_table_contact and not self.ball_cup_contact and not self.ball_wall_contact and not self.ball_in_cup: if not self.ball_table_contact and not self.ball_cup_contact and not self.ball_wall_contact and not self.ball_in_cup:
self.ball_ground_contact_first = self._check_collision_single_objects(sim, self.ball_collision_id, self.ball_ground_contact_first = self._check_collision_single_objects(sim, self.ball_collision_id,
self.ground_collision_id) self.ground_collision_id)
def _check_collision_single_objects(self, sim, id_1, id_2): def _check_collision_single_objects(self, sim, id_1, id_2):
for coni in range(0, sim.data.ncon): for coni in range(0, sim.data.ncon):
@ -190,4 +176,4 @@ class BeerPongReward:
if collision or collision_trans: if collision or collision_trans:
return True return True
return False return False

View File

@ -1,10 +1,15 @@
from alr_envs.mp.episodic_wrapper import EpisodicWrapper from typing import Tuple, Union
from typing import Union, Tuple
import numpy as np import numpy as np
import gym
from alr_envs.mp.episodic_wrapper import EpisodicWrapper
class NewMPWrapper(EpisodicWrapper): class NewMPWrapper(EpisodicWrapper):
# def __init__(self, replanning_model):
# self.replanning_model = replanning_model
@property @property
def current_pos(self) -> Union[float, int, np.ndarray, Tuple]: def current_pos(self) -> Union[float, int, np.ndarray, Tuple]:
return self.env.sim.data.qpos[0:7].copy() return self.env.sim.data.qpos[0:7].copy()
@ -22,26 +27,16 @@ class NewMPWrapper(EpisodicWrapper):
[False] * 3, # cup_goal_diff_top [False] * 3, # cup_goal_diff_top
[True] * 2, # xy position of cup [True] * 2, # xy position of cup
[False] # env steps [False] # env steps
]) ])
# def set_mp_action_space(self): def do_replanning(self, pos, vel, s, a, t, last_replan_step):
# min_action_bounds, max_action_bounds = self.mp.get_param_bounds() return False
# if self.mp.learn_tau: # const = np.arange(0, 1000, 10)
# min_action_bounds[0] = 20*self.env.dt # return bool(self.replanning_model(s))
# max_action_bounds[0] = 260*self.env.dt
# mp_action_space = gym.spaces.Box(low=min_action_bounds.numpy(), high=max_action_bounds.numpy(),
# dtype=np.float32)
# return mp_action_space
# def _step_callback(self, t: int, env_spec_params: Union[np.ndarray, None], step_action: np.ndarray) -> Union[np.ndarray]:
# if self.mp.learn_tau:
# return np.concatenate((step_action, np.atleast_1d(env_spec_params)))
# else:
# return step_action
def _episode_callback(self, action: np.ndarray) -> Tuple[np.ndarray, Union[np.ndarray, None]]: def _episode_callback(self, action: np.ndarray) -> Tuple[np.ndarray, Union[np.ndarray, None]]:
if self.mp.learn_tau: if self.mp.learn_tau:
self.env.env.release_step = action[0]/self.env.dt # Tau value self.env.env.release_step = action[0] / self.env.dt # Tau value
return action, None return action, None
else: else:
return action, None return action, None
@ -52,12 +47,3 @@ class NewMPWrapper(EpisodicWrapper):
xyz[-1] = 0.840 xyz[-1] = 0.840
self.env.env.model.body_pos[self.env.env.cup_table_id] = xyz self.env.env.model.body_pos[self.env.env.cup_table_id] = xyz
return self.get_observation_from_step(self.env.env._get_obs()) return self.get_observation_from_step(self.env.env._get_obs())
# def set_action_space(self):
# if self.mp.learn_tau:
# min_action_bounds, max_action_bounds = self.mp.get_param_bounds()
# min_action_bounds = np.concatenate((min_action_bounds.numpy(), [self.env.action_space.low[-1]]))
# max_action_bounds = np.concatenate((max_action_bounds.numpy(), [self.env.action_space.high[-1]]))
# self.action_space = gym.spaces.Box(low=min_action_bounds, high=max_action_bounds, dtype=np.float32)
# return self.action_space
# else:
# return super(NewMPWrapper, self).set_action_space()

View File

@ -14,20 +14,23 @@ class ALRHopperJumpEnv(HopperEnv):
- exclude current positions from observatiosn is set to False - exclude current positions from observatiosn is set to False
""" """
def __init__(self, def __init__(
xml_file='hopper_jump.xml', self,
forward_reward_weight=1.0, xml_file='hopper_jump.xml',
ctrl_cost_weight=1e-3, forward_reward_weight=1.0,
healthy_reward=0.0, ctrl_cost_weight=1e-3,
penalty=0.0, healthy_reward=0.0,
context=True, penalty=0.0,
terminate_when_unhealthy=False, context=True,
healthy_state_range=(-100.0, 100.0), terminate_when_unhealthy=False,
healthy_z_range=(0.5, float('inf')), healthy_state_range=(-100.0, 100.0),
healthy_angle_range=(-float('inf'), float('inf')), healthy_z_range=(0.5, float('inf')),
reset_noise_scale=5e-3, healthy_angle_range=(-float('inf'), float('inf')),
exclude_current_positions_from_observation=False, reset_noise_scale=5e-3,
max_episode_steps=250): exclude_current_positions_from_observation=False,
max_episode_steps=250
):
self.current_step = 0 self.current_step = 0
self.max_height = 0 self.max_height = 0
self.max_episode_steps = max_episode_steps self.max_episode_steps = max_episode_steps
@ -47,7 +50,7 @@ class ALRHopperJumpEnv(HopperEnv):
exclude_current_positions_from_observation) exclude_current_positions_from_observation)
def step(self, action): def step(self, action):
self.current_step += 1 self.current_step += 1
self.do_simulation(action, self.frame_skip) self.do_simulation(action, self.frame_skip)
height_after = self.get_body_com("torso")[2] height_after = self.get_body_com("torso")[2]
@ -59,22 +62,14 @@ class ALRHopperJumpEnv(HopperEnv):
done = False done = False
rewards = 0 rewards = 0
if self.current_step >= self.max_episode_steps: if self.current_step >= self.max_episode_steps:
hight_goal_distance = -10*np.linalg.norm(self.max_height - self.goal) if self.context else self.max_height hight_goal_distance = -10 * np.linalg.norm(self.max_height - self.goal) if self.context else self.max_height
healthy_reward = 0 if self.context else self.healthy_reward * 2 # self.current_step healthy_reward = 0 if self.context else self.healthy_reward * 2 # self.current_step
height_reward = self._forward_reward_weight * hight_goal_distance # maybe move reward calculation into if structure and define two different _forward_reward_weight variables for context and episodic seperatley height_reward = self._forward_reward_weight * hight_goal_distance # maybe move reward calculation into if structure and define two different _forward_reward_weight variables for context and episodic seperatley
rewards = height_reward + healthy_reward rewards = height_reward + healthy_reward
# else:
# # penalty for wrong start direction of first two joints; not needed, could be removed
# rewards = ((action[:2] > 0) * self.penalty).sum() if self.current_step < 10 else 0
observation = self._get_obs() observation = self._get_obs()
reward = rewards - costs reward = rewards - costs
# info = {
# 'height' : height_after,
# 'max_height': self.max_height,
# 'goal' : self.goal
# }
info = { info = {
'height': height_after, 'height': height_after,
'x_pos': site_pos_after, 'x_pos': site_pos_after,
@ -82,7 +77,7 @@ class ALRHopperJumpEnv(HopperEnv):
'height_rew': self.max_height, 'height_rew': self.max_height,
'healthy_reward': self.healthy_reward * 2, 'healthy_reward': self.healthy_reward * 2,
'healthy': self.is_healthy 'healthy': self.is_healthy
} }
return observation, reward, done, info return observation, reward, done, info
@ -90,7 +85,7 @@ class ALRHopperJumpEnv(HopperEnv):
return np.append(super()._get_obs(), self.goal) return np.append(super()._get_obs(), self.goal)
def reset(self): def reset(self):
self.goal = self.np_random.uniform(1.4, 2.16, 1)[0] # 1.3 2.3 self.goal = self.np_random.uniform(1.4, 2.16, 1)[0] # 1.3 2.3
self.max_height = 0 self.max_height = 0
self.current_step = 0 self.current_step = 0
return super().reset() return super().reset()
@ -98,8 +93,8 @@ class ALRHopperJumpEnv(HopperEnv):
# overwrite reset_model to make it deterministic # overwrite reset_model to make it deterministic
def reset_model(self): def reset_model(self):
qpos = self.init_qpos # + self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nq) qpos = self.init_qpos # + self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nq)
qvel = self.init_qvel # + self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nv) qvel = self.init_qvel # + self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nv)
self.set_state(qpos, qvel) self.set_state(qpos, qvel)
@ -147,7 +142,7 @@ class ALRHopperXYJumpEnv(ALRHopperJumpEnv):
self.contact_with_floor = floor_contact self.contact_with_floor = floor_contact
if self.contact_dist is None and self.contact_with_floor: if self.contact_dist is None and self.contact_with_floor:
self.contact_dist = np.linalg.norm(self.sim.data.site_xpos[self.model.site_name2id('foot_site')] self.contact_dist = np.linalg.norm(self.sim.data.site_xpos[self.model.site_name2id('foot_site')]
- np.array([self.goal, 0, 0])) - np.array([self.goal, 0, 0]))
ctrl_cost = self.control_cost(action) ctrl_cost = self.control_cost(action)
@ -157,9 +152,9 @@ class ALRHopperXYJumpEnv(ALRHopperJumpEnv):
rewards = 0 rewards = 0
if self.current_step >= self.max_episode_steps: if self.current_step >= self.max_episode_steps:
# healthy_reward = 0 if self.context else self.healthy_reward * self.current_step # healthy_reward = 0 if self.context else self.healthy_reward * self.current_step
healthy_reward = self.healthy_reward * 2 #* self.current_step healthy_reward = self.healthy_reward * 2 # * self.current_step
contact_dist = self.contact_dist if self.contact_dist is not None else 5 contact_dist = self.contact_dist if self.contact_dist is not None else 5
dist_reward = self._forward_reward_weight * (-3*goal_dist + 10*self.max_height - 2*contact_dist) dist_reward = self._forward_reward_weight * (-3 * goal_dist + 10 * self.max_height - 2 * contact_dist)
rewards = dist_reward + healthy_reward rewards = dist_reward + healthy_reward
observation = self._get_obs() observation = self._get_obs()
@ -174,7 +169,7 @@ class ALRHopperXYJumpEnv(ALRHopperJumpEnv):
'healthy_reward': self.healthy_reward * 2, 'healthy_reward': self.healthy_reward * 2,
'healthy': self.is_healthy, 'healthy': self.is_healthy,
'contact_dist': self.contact_dist if self.contact_dist is not None else 0 'contact_dist': self.contact_dist if self.contact_dist is not None else 0
} }
return observation, reward, done, info return observation, reward, done, info
def reset_model(self): def reset_model(self):
@ -225,25 +220,28 @@ class ALRHopperXYJumpEnv(ALRHopperJumpEnv):
self.sim.model.body_pos[self.sim.model.body_name2id('goal_site_body')] = np.array([self.goal, 0, 0]) self.sim.model.body_pos[self.sim.model.body_name2id('goal_site_body')] = np.array([self.goal, 0, 0])
return self._get_obs() return self._get_obs()
class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv): class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv):
def __init__(self, def __init__(
xml_file='hopper_jump.xml', self,
forward_reward_weight=1.0, xml_file='hopper_jump.xml',
ctrl_cost_weight=1e-3, forward_reward_weight=1.0,
healthy_reward=0.0, ctrl_cost_weight=1e-3,
penalty=0.0, healthy_reward=0.0,
context=True, penalty=0.0,
terminate_when_unhealthy=False, context=True,
healthy_state_range=(-100.0, 100.0), terminate_when_unhealthy=False,
healthy_z_range=(0.5, float('inf')), healthy_state_range=(-100.0, 100.0),
healthy_angle_range=(-float('inf'), float('inf')), healthy_z_range=(0.5, float('inf')),
reset_noise_scale=5e-3, healthy_angle_range=(-float('inf'), float('inf')),
exclude_current_positions_from_observation=False, reset_noise_scale=5e-3,
max_episode_steps=250, exclude_current_positions_from_observation=False,
height_scale = 10, max_episode_steps=250,
dist_scale = 3, height_scale=10,
healthy_scale = 2): dist_scale=3,
healthy_scale=2
):
self.height_scale = height_scale self.height_scale = height_scale
self.dist_scale = dist_scale self.dist_scale = dist_scale
self.healthy_scale = healthy_scale self.healthy_scale = healthy_scale
@ -263,15 +261,14 @@ class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv):
ctrl_cost = self.control_cost(action) ctrl_cost = self.control_cost(action)
healthy_reward = self.healthy_reward * self.healthy_scale healthy_reward = self.healthy_reward * self.healthy_scale
height_reward = self.height_scale*height_after height_reward = self.height_scale * height_after
goal_dist = np.atleast_1d(np.linalg.norm(site_pos_after - np.array([self.goal, 0, 0], dtype=object)))[0] goal_dist = np.atleast_1d(np.linalg.norm(site_pos_after - np.array([self.goal, 0, 0], dtype=object)))[0]
goal_dist_reward = -self.dist_scale*goal_dist goal_dist_reward = -self.dist_scale * goal_dist
dist_reward = self._forward_reward_weight * (goal_dist_reward + height_reward) dist_reward = self._forward_reward_weight * (goal_dist_reward + height_reward)
reward = -ctrl_cost + healthy_reward + dist_reward reward = -ctrl_cost + healthy_reward + dist_reward
done = False done = False
observation = self._get_obs() observation = self._get_obs()
########################################################### ###########################################################
# This is only for logging the distance to goal when first having the contact # This is only for logging the distance to goal when first having the contact
########################################################## ##########################################################
@ -297,9 +294,10 @@ class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv):
'healthy_reward': self.healthy_reward * self.healthy_reward, 'healthy_reward': self.healthy_reward * self.healthy_reward,
'healthy': self.is_healthy, 'healthy': self.is_healthy,
'contact_dist': self.contact_dist if self.contact_dist is not None else 0 'contact_dist': self.contact_dist if self.contact_dist is not None else 0
} }
return observation, reward, done, info return observation, reward, done, info
class ALRHopperJumpRndmPosEnv(ALRHopperJumpEnv): class ALRHopperJumpRndmPosEnv(ALRHopperJumpEnv):
def __init__(self, max_episode_steps=250): def __init__(self, max_episode_steps=250):
super(ALRHopperJumpRndmPosEnv, self).__init__(exclude_current_positions_from_observation=False, super(ALRHopperJumpRndmPosEnv, self).__init__(exclude_current_positions_from_observation=False,
@ -309,14 +307,14 @@ class ALRHopperJumpRndmPosEnv(ALRHopperJumpEnv):
def reset_model(self): def reset_model(self):
self._floor_geom_id = self.model.geom_name2id('floor') self._floor_geom_id = self.model.geom_name2id('floor')
self._foot_geom_id = self.model.geom_name2id('foot_geom') self._foot_geom_id = self.model.geom_name2id('foot_geom')
noise_low = -np.ones(self.model.nq)*self._reset_noise_scale noise_low = -np.ones(self.model.nq) * self._reset_noise_scale
noise_low[1] = 0 noise_low[1] = 0
noise_low[2] = 0 noise_low[2] = 0
noise_low[3] = -0.2 noise_low[3] = -0.2
noise_low[4] = -0.2 noise_low[4] = -0.2
noise_low[5] = -0.1 noise_low[5] = -0.1
noise_high = np.ones(self.model.nq)*self._reset_noise_scale noise_high = np.ones(self.model.nq) * self._reset_noise_scale
noise_high[1] = 0 noise_high[1] = 0
noise_high[2] = 0 noise_high[2] = 0
noise_high[3] = 0 noise_high[3] = 0
@ -325,7 +323,7 @@ class ALRHopperJumpRndmPosEnv(ALRHopperJumpEnv):
rnd_vec = self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nq) rnd_vec = self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nq)
# rnd_vec[2] *= 0.05 # the angle around the y axis shouldn't be too high as the agent then falls down quickly and # rnd_vec[2] *= 0.05 # the angle around the y axis shouldn't be too high as the agent then falls down quickly and
# can not recover # can not recover
# rnd_vec[1] = np.clip(rnd_vec[1], 0, 0.3) # rnd_vec[1] = np.clip(rnd_vec[1], 0, 0.3)
qpos = self.init_qpos + rnd_vec qpos = self.init_qpos + rnd_vec
qvel = self.init_qvel qvel = self.init_qvel
@ -341,7 +339,7 @@ class ALRHopperJumpRndmPosEnv(ALRHopperJumpEnv):
self.do_simulation(action, self.frame_skip) self.do_simulation(action, self.frame_skip)
self.contact_with_floor = self._contact_checker(self._floor_geom_id, self._foot_geom_id) if not \ self.contact_with_floor = self._contact_checker(self._floor_geom_id, self._foot_geom_id) if not \
self.contact_with_floor else True self.contact_with_floor else True
height_after = self.get_body_com("torso")[2] height_after = self.get_body_com("torso")[2]
self.max_height = max(height_after, self.max_height) if self.contact_with_floor else 0 self.max_height = max(height_after, self.max_height) if self.contact_with_floor else 0
@ -365,12 +363,11 @@ class ALRHopperJumpRndmPosEnv(ALRHopperJumpEnv):
'height': height_after, 'height': height_after,
'max_height': self.max_height, 'max_height': self.max_height,
'goal': self.goal 'goal': self.goal
} }
return observation, reward, done, info return observation, reward, done, info
if __name__ == '__main__': if __name__ == '__main__':
render_mode = "human" # "human" or "partial" or "final" render_mode = "human" # "human" or "partial" or "final"
# env = ALRHopperJumpEnv() # env = ALRHopperJumpEnv()
@ -396,4 +393,4 @@ if __name__ == '__main__':
print('After ', i, ' steps, done: ', d) print('After ', i, ' steps, done: ', d)
env.reset() env.reset()
env.close() env.close()

View File

@ -9,7 +9,7 @@ from mp_pytorch.mp.mp_interfaces import MPInterface
from alr_envs.mp.controllers.base_controller import BaseController from alr_envs.mp.controllers.base_controller import BaseController
class EpisodicWrapper(gym.Env, ABC): class EpisodicWrapper(gym.Env, gym.wrappers.TransformReward, ABC):
""" """
Base class for movement primitive based gym.Wrapper implementations. Base class for movement primitive based gym.Wrapper implementations.
@ -22,14 +22,19 @@ class EpisodicWrapper(gym.Env, ABC):
weight_scale: Scaling parameter for the actions given to this wrapper weight_scale: Scaling parameter for the actions given to this wrapper
render_mode: Equivalent to gym render mode render_mode: Equivalent to gym render mode
""" """
def __init__(self,
env: gym.Env, def __init__(
mp: MPInterface, self,
controller: BaseController, env: gym.Env,
duration: float, mp: MPInterface,
render_mode: str = None, controller: BaseController,
verbose: int = 1, duration: float,
weight_scale: float = 1): render_mode: str = None,
verbose: int = 1,
weight_scale: float = 1,
sequencing=True,
reward_aggregation=np.mean,
):
super().__init__() super().__init__()
self.env = env self.env = env
@ -40,11 +45,11 @@ class EpisodicWrapper(gym.Env, ABC):
self.duration = duration self.duration = duration
self.traj_steps = int(duration / self.dt) self.traj_steps = int(duration / self.dt)
self.post_traj_steps = self.env.spec.max_episode_steps - self.traj_steps self.post_traj_steps = self.env.spec.max_episode_steps - self.traj_steps
# duration = self.env.max_episode_steps * self.dt
self.controller = controller
self.mp = mp self.mp = mp
self.env = env self.env = env
self.verbose = verbose self.controller = controller
self.weight_scale = weight_scale self.weight_scale = weight_scale
# rendering # rendering
@ -52,15 +57,18 @@ class EpisodicWrapper(gym.Env, ABC):
self.render_kwargs = {} self.render_kwargs = {}
self.time_steps = np.linspace(0, self.duration, self.traj_steps) self.time_steps = np.linspace(0, self.duration, self.traj_steps)
self.mp.set_mp_times(self.time_steps) self.mp.set_mp_times(self.time_steps)
# self.mp.set_mp_duration(self.time_steps, dt)
# action_bounds = np.inf * np.ones((np.prod(self.mp.num_params))) # action_bounds = np.inf * np.ones((np.prod(self.mp.num_params)))
self.mp_action_space = self.set_mp_action_space() self.mp_action_space = self.get_mp_action_space()
self.action_space = self.set_action_space() self.action_space = self.get_action_space()
self.active_obs = self.set_active_obs() self.active_obs = self.set_active_obs()
self.observation_space = spaces.Box(low=self.env.observation_space.low[self.active_obs], self.observation_space = spaces.Box(low=self.env.observation_space.low[self.active_obs],
high=self.env.observation_space.high[self.active_obs], high=self.env.observation_space.high[self.active_obs],
dtype=self.env.observation_space.dtype) dtype=self.env.observation_space.dtype)
self.verbose = verbose
def get_trajectory(self, action: np.ndarray) -> Tuple: def get_trajectory(self, action: np.ndarray) -> Tuple:
# TODO: this follows the implementation of the mp_pytorch library which includes the parameters tau and delay at # TODO: this follows the implementation of the mp_pytorch library which includes the parameters tau and delay at
# the beginning of the array. # the beginning of the array.
@ -69,33 +77,37 @@ class EpisodicWrapper(gym.Env, ABC):
scaled_mp_params[ignore_indices:] *= self.weight_scale scaled_mp_params[ignore_indices:] *= self.weight_scale
self.mp.set_params(np.clip(scaled_mp_params, self.mp_action_space.low, self.mp_action_space.high)) self.mp.set_params(np.clip(scaled_mp_params, self.mp_action_space.low, self.mp_action_space.high))
self.mp.set_boundary_conditions(bc_time=self.time_steps[:1], bc_pos=self.current_pos, bc_vel=self.current_vel) self.mp.set_boundary_conditions(bc_time=self.time_steps[:1], bc_pos=self.current_pos, bc_vel=self.current_vel)
traj_dict = self.mp.get_mp_trajs(get_pos = True, get_vel = True) traj_dict = self.mp.get_mp_trajs(get_pos=True, get_vel=True)
trajectory_tensor, velocity_tensor = traj_dict['pos'], traj_dict['vel'] trajectory_tensor, velocity_tensor = traj_dict['pos'], traj_dict['vel']
trajectory = trajectory_tensor.numpy() trajectory = trajectory_tensor.numpy()
velocity = velocity_tensor.numpy() velocity = velocity_tensor.numpy()
# TODO: Do we need this or does mp_pytorch have this?
if self.post_traj_steps > 0: if self.post_traj_steps > 0:
trajectory = np.vstack([trajectory, np.tile(trajectory[-1, :], [self.post_traj_steps, 1])]) trajectory = np.vstack([trajectory, np.tile(trajectory[-1, :], [self.post_traj_steps, 1])])
velocity = np.vstack([velocity, np.zeros(shape=(self.post_traj_steps, self.mp.num_dof))]) velocity = np.vstack([velocity, np.zeros(shape=(self.post_traj_steps, self.mp.num_dof))])
return trajectory, velocity return trajectory, velocity
def set_mp_action_space(self): def get_mp_action_space(self):
"""This function can be used to set up an individual space for the parameters of the mp.""" """This function can be used to set up an individual space for the parameters of the mp."""
min_action_bounds, max_action_bounds = self.mp.get_param_bounds() min_action_bounds, max_action_bounds = self.mp.get_param_bounds()
mp_action_space = gym.spaces.Box(low=min_action_bounds.numpy(), high=max_action_bounds.numpy(), mp_action_space = gym.spaces.Box(low=min_action_bounds.numpy(), high=max_action_bounds.numpy(),
dtype=np.float32) dtype=np.float32)
return mp_action_space return mp_action_space
def set_action_space(self): def get_action_space(self):
""" """
This function can be used to modify the action space for considering actions which are not learned via motion This function can be used to modify the action space for considering actions which are not learned via motion
primitives. E.g. ball releasing time for the beer pong task. By default, it is the parameter space of the primitives. E.g. ball releasing time for the beer pong task. By default, it is the parameter space of the
motion primitive. motion primitive.
Only needs to be overwritten if the action space needs to be modified. Only needs to be overwritten if the action space needs to be modified.
""" """
return self.mp_action_space try:
return self.mp_action_space
except AttributeError:
return self.get_mp_action_space()
def _episode_callback(self, action: np.ndarray) -> Tuple[np.ndarray, Union[np.ndarray, None]]: def _episode_callback(self, action: np.ndarray) -> Tuple[np.ndarray, Union[np.ndarray, None]]:
""" """
@ -111,7 +123,8 @@ class EpisodicWrapper(gym.Env, ABC):
""" """
return action, None return action, None
def _step_callback(self, t: int, env_spec_params: Union[np.ndarray, None], step_action: np.ndarray) -> Union[np.ndarray]: def _step_callback(self, t: int, env_spec_params: Union[np.ndarray, None], step_action: np.ndarray) -> Union[
np.ndarray]:
""" """
This function can be used to modify the step_action with additional parameters e.g. releasing the ball in the This function can be used to modify the step_action with additional parameters e.g. releasing the ball in the
Beerpong env. The parameters used should not be part of the motion primitive parameters. Beerpong env. The parameters used should not be part of the motion primitive parameters.
@ -169,11 +182,15 @@ class EpisodicWrapper(gym.Env, ABC):
mp_params, env_spec_params = self._episode_callback(action) mp_params, env_spec_params = self._episode_callback(action)
trajectory, velocity = self.get_trajectory(mp_params) trajectory, velocity = self.get_trajectory(mp_params)
# TODO
# self.time_steps = np.linspace(0, learned_duration, self.traj_steps)
# self.mp.set_mp_times(self.time_steps)
trajectory_length = len(trajectory) trajectory_length = len(trajectory)
if self.verbose >=2 : if self.verbose >= 2:
actions = np.zeros(shape=(trajectory_length,) + self.env.action_space.shape) actions = np.zeros(shape=(trajectory_length,) + self.env.action_space.shape)
observations = np.zeros(shape=(trajectory_length,) + self.env.observation_space.shape, observations = np.zeros(shape=(trajectory_length,) + self.env.observation_space.shape,
dtype=self.env.observation_space.dtype) dtype=self.env.observation_space.dtype)
rewards = np.zeros(shape=(trajectory_length,)) rewards = np.zeros(shape=(trajectory_length,))
trajectory_return = 0 trajectory_return = 0
@ -181,7 +198,7 @@ class EpisodicWrapper(gym.Env, ABC):
for t, pos_vel in enumerate(zip(trajectory, velocity)): for t, pos_vel in enumerate(zip(trajectory, velocity)):
step_action = self.controller.get_action(pos_vel[0], pos_vel[1], self.current_pos, self.current_vel) step_action = self.controller.get_action(pos_vel[0], pos_vel[1], self.current_pos, self.current_vel)
step_action = self._step_callback(t, env_spec_params, step_action) # include possible callback info step_action = self._step_callback(t, env_spec_params, step_action) # include possible callback info
c_action = np.clip(step_action, self.env.action_space.low, self.env.action_space.high) c_action = np.clip(step_action, self.env.action_space.low, self.env.action_space.high)
# print('step/clipped action ratio: ', step_action/c_action) # print('step/clipped action ratio: ', step_action/c_action)
obs, c_reward, done, info = self.env.step(c_action) obs, c_reward, done, info = self.env.step(c_action)
@ -197,7 +214,7 @@ class EpisodicWrapper(gym.Env, ABC):
# infos['step_infos'].append(info) # infos['step_infos'].append(info)
if self.render_mode: if self.render_mode:
self.render(mode=self.render_mode, **self.render_kwargs) self.render(mode=self.render_mode, **self.render_kwargs)
if done: if done or do_replanning(kwargs):
break break
infos.update({k: v[:t + 1] for k, v in infos.items()}) infos.update({k: v[:t + 1] for k, v in infos.items()})
if self.verbose >= 2: if self.verbose >= 2:
@ -235,10 +252,10 @@ class EpisodicWrapper(gym.Env, ABC):
for i in range(des_trajs.shape[1]): for i in range(des_trajs.shape[1]):
plt.figure(pos_fig.number) plt.figure(pos_fig.number)
plt.subplot(des_trajs.shape[1], 1, i + 1) plt.subplot(des_trajs.shape[1], 1, i + 1)
plt.plot(np.ones(des_trajs.shape[0])*self.current_pos[i]) plt.plot(np.ones(des_trajs.shape[0]) * self.current_pos[i])
plt.plot(des_trajs[:, i]) plt.plot(des_trajs[:, i])
plt.figure(vel_fig.number) plt.figure(vel_fig.number)
plt.subplot(des_vels.shape[1], 1, i + 1) plt.subplot(des_vels.shape[1], 1, i + 1)
plt.plot(np.ones(des_trajs.shape[0])*self.current_vel[i]) plt.plot(np.ones(des_trajs.shape[0]) * self.current_vel[i])
plt.plot(des_vels[:, i]) plt.plot(des_vels[:, i])

View File

@ -20,12 +20,13 @@ def make_dmc(
environment_kwargs: dict = {}, environment_kwargs: dict = {},
time_limit: Union[None, float] = None, time_limit: Union[None, float] = None,
channels_first: bool = True channels_first: bool = True
): ):
# Adopted from: https://github.com/denisyarats/dmc2gym/blob/master/dmc2gym/__init__.py # Adopted from: https://github.com/denisyarats/dmc2gym/blob/master/dmc2gym/__init__.py
# License: MIT # License: MIT
# Copyright (c) 2020 Denis Yarats # Copyright (c) 2020 Denis Yarats
assert re.match(r"\w+-\w+", id), "env_id does not have the following structure: 'domain_name-task_name'" if not re.match(r"\w+-\w+", id):
raise ValueError("env_id does not have the following structure: 'domain_name-task_name'")
domain_name, task_name = id.split("-") domain_name, task_name = id.split("-")
env_id = f'dmc_{domain_name}_{task_name}_{seed}-v1' env_id = f'dmc_{domain_name}_{task_name}_{seed}-v1'
@ -60,7 +61,7 @@ def make_dmc(
camera_id=camera_id, camera_id=camera_id,
frame_skip=frame_skip, frame_skip=frame_skip,
channels_first=channels_first, channels_first=channels_first,
), ),
max_episode_steps=max_episode_steps, max_episode_steps=max_episode_steps,
) )
return gym.make(env_id) return gym.make(env_id)

View File

@ -55,7 +55,7 @@ def make(env_id: str, seed, **kwargs):
Returns: Gym environment Returns: Gym environment
""" """
if any([det_pmp in env_id for det_pmp in ["DetPMP", "detpmp"]]): if any(deprec in env_id for deprec in ["DetPMP", "detpmp"]):
warnings.warn("DetPMP is deprecated and converted to ProMP") warnings.warn("DetPMP is deprecated and converted to ProMP")
env_id = env_id.replace("DetPMP", "ProMP") env_id = env_id.replace("DetPMP", "ProMP")
env_id = env_id.replace("detpmp", "promp") env_id = env_id.replace("detpmp", "promp")
@ -92,14 +92,17 @@ def make(env_id: str, seed, **kwargs):
from alr_envs import make_dmc from alr_envs import make_dmc
env = make_dmc(env_id, seed=seed, **kwargs) env = make_dmc(env_id, seed=seed, **kwargs)
assert env.base_step_limit == env.spec.max_episode_steps, \ if not env.base_step_limit == env.spec.max_episode_steps:
f"The specified 'episode_length' of {env.spec.max_episode_steps} steps for gym is different from " \ raise ValueError(f"The specified 'episode_length' of {env.spec.max_episode_steps} steps for gym "
f"the DMC environment specification of {env.base_step_limit} steps." f"is different from the DMC environment specification of {env.base_step_limit} steps.")
return env return env
def _make_wrapped_env(env_id: str, wrappers: Iterable[Type[gym.Wrapper]], mp: MPInterface, controller: BaseController,
ep_wrapper_kwargs: Mapping, seed=1, **kwargs): def _make_wrapped_env(
env_id: str, wrappers: Iterable[Type[gym.Wrapper]], mp: MPInterface, controller: BaseController,
ep_wrapper_kwargs: Mapping, seed=1, **kwargs
):
""" """
Helper function for creating a wrapped gym environment using MPs. Helper function for creating a wrapped gym environment using MPs.
It adds all provided wrappers to the specified environment and verifies at least one MPEnvWrapper is It adds all provided wrappers to the specified environment and verifies at least one MPEnvWrapper is
@ -120,13 +123,14 @@ def _make_wrapped_env(env_id: str, wrappers: Iterable[Type[gym.Wrapper]], mp: MP
# only wrap the environment if not EpisodicWrapper, e.g. for vision # only wrap the environment if not EpisodicWrapper, e.g. for vision
if not issubclass(w, EpisodicWrapper): if not issubclass(w, EpisodicWrapper):
_env = w(_env) _env = w(_env)
else: # if EpisodicWrapper, use specific constructor else: # if EpisodicWrapper, use specific constructor
has_episodic_wrapper = True has_episodic_wrapper = True
_env = w(env=_env, mp=mp, controller=controller, **ep_wrapper_kwargs) _env = w(env=_env, mp=mp, controller=controller, **ep_wrapper_kwargs)
assert has_episodic_wrapper, \ if not has_episodic_wrapper:
"At least one MPEnvWrapper is required in order to leverage motion primitive environments." raise ValueError("An EpisodicWrapper is required in order to leverage movement primitive environments.")
return _env return _env
def make_mp_from_kwargs( def make_mp_from_kwargs(
env_id: str, wrappers: Iterable, ep_wrapper_kwargs: MutableMapping, mp_kwargs: MutableMapping, env_id: str, wrappers: Iterable, ep_wrapper_kwargs: MutableMapping, mp_kwargs: MutableMapping,
controller_kwargs: MutableMapping, phase_kwargs: MutableMapping, basis_kwargs: MutableMapping, seed=1, controller_kwargs: MutableMapping, phase_kwargs: MutableMapping, basis_kwargs: MutableMapping, seed=1,
@ -152,7 +156,7 @@ def make_mp_from_kwargs(
_verify_time_limit(mp_kwargs.get("duration", None), kwargs.get("time_limit", None)) _verify_time_limit(mp_kwargs.get("duration", None), kwargs.get("time_limit", None))
dummy_env = make(env_id, seed) dummy_env = make(env_id, seed)
if ep_wrapper_kwargs.get('duration', None) is None: if ep_wrapper_kwargs.get('duration', None) is None:
ep_wrapper_kwargs['duration'] = dummy_env.spec.max_episode_steps*dummy_env.dt ep_wrapper_kwargs['duration'] = dummy_env.spec.max_episode_steps * dummy_env.dt
if phase_kwargs.get('tau', None) is None: if phase_kwargs.get('tau', None) is None:
phase_kwargs['tau'] = ep_wrapper_kwargs['duration'] phase_kwargs['tau'] = ep_wrapper_kwargs['duration']
mp_kwargs['action_dim'] = mp_kwargs.get('action_dim', np.prod(dummy_env.action_space.shape).item()) mp_kwargs['action_dim'] = mp_kwargs.get('action_dim', np.prod(dummy_env.action_space.shape).item())
@ -207,10 +211,11 @@ def make_mp_env_helper(**kwargs):
phase_kwargs = kwargs.pop("phase_generator_kwargs") phase_kwargs = kwargs.pop("phase_generator_kwargs")
basis_kwargs = kwargs.pop("basis_generator_kwargs") basis_kwargs = kwargs.pop("basis_generator_kwargs")
return make_mp_from_kwargs(env_id=kwargs.pop("name"), wrappers=wrappers, ep_wrapper_kwargs=ep_wrapper_kwargs, return make_mp_from_kwargs(env_id=kwargs.pop("name"), wrappers=wrappers, ep_wrapper_kwargs=ep_wrapper_kwargs,
mp_kwargs=mp_kwargs, controller_kwargs=contr_kwargs, phase_kwargs=phase_kwargs, mp_kwargs=mp_kwargs, controller_kwargs=contr_kwargs, phase_kwargs=phase_kwargs,
basis_kwargs=basis_kwargs, **kwargs, seed=seed) basis_kwargs=basis_kwargs, **kwargs, seed=seed)
def make_dmp_env(env_id: str, wrappers: Iterable, seed=1, mp_kwargs={}, **kwargs): def make_dmp_env(env_id: str, wrappers: Iterable, seed=1, mp_kwargs={}, **kwargs):
""" """
This can also be used standalone for manually building a custom DMP environment. This can also be used standalone for manually building a custom DMP environment.