first clean up and some non working ideas sketched
parent 7bd9848c31
commit 9ad6fbe712

38 README.md
@@ -1,26 +1,32 @@
 ## ALR Robotics Control Environments
 
 This project offers a large variety of reinforcement learning environments under the unifying interface of [OpenAI gym](https://gym.openai.com/).
-Besides, we also provide support (under the OpenAI interface) for the benchmark suites
+We provide support (under the OpenAI interface) for the benchmark suites
 [DeepMind Control](https://deepmind.com/research/publications/2020/dm-control-Software-and-Tasks-for-Continuous-Control)
-(DMC) and [Metaworld](https://meta-world.github.io/). Custom (Mujoco) gym environments can be created according
-to [this guide](https://github.com/openai/gym/blob/master/docs/creating-environments.md). Unlike existing libraries, we
-additionally support to control agents with Dynamic Movement Primitives (DMPs) and Probabilistic Movement Primitives (ProMP,
-we only consider the mean usually).
+(DMC) and [Metaworld](https://meta-world.github.io/).
+Custom (Mujoco) gym environments can be created according
+to [this guide](https://www.gymlibrary.ml/content/environment_creation/).
+Unlike existing libraries, we additionally support to control agents with movement primitives, such as
+Dynamic Movement Primitives (DMPs) and Probabilistic Movement Primitives (ProMP, we only consider the mean usually).
 
-## Motion Primitive Environments (Episodic environments)
+## Movement Primitive Environments (Episode-Based/Black-Box Environments)
 
-Unlike step-based environments, motion primitive (MP) environments are closer related to stochastic search, black box
-optimization, and methods that are often used in robotics. MP environments are trajectory-based and always execute a full
-trajectory, which is generated by a Dynamic Motion Primitive (DMP) or a Probabilistic Motion Primitive (ProMP). The
-generated trajectory is translated into individual step-wise actions by a controller. The exact choice of controller is,
-however, dependent on the type of environment. We currently support position, velocity, and PD-Controllers for position,
-velocity, and torque control, respectively. The goal of all MP environments is still to learn a policy. Yet, an action
+Unlike step-based environments, movement primitive (MP) environments are closer related to stochastic search, black-box
+optimization, and methods that are often used in traditional robotics and control.
+MP environments are episode-based and always execute a full trajectory, which is generated by a trajectory generator,
+such as a Dynamic Movement Primitive (DMP) or a Probabilistic Movement Primitive (ProMP).
+The generated trajectory is translated into individual step-wise actions by a trajectory tracking controller.
+The exact choice of controller is, however, dependent on the type of environment.
+We currently support position, velocity, and PD-Controllers for position, velocity, and torque control, respectively,
+as well as a special controller for the MetaWorld control suite.
+The goal of all MP environments is still to learn an optimal policy. Yet, an action
 represents the parametrization of the motion primitives to generate a suitable trajectory. Additionally, in this
-framework we support all of this also for the contextual setting, for which we expose all changing substates of the
-task as a single observation in the beginning. This requires to predict a new action/MP parametrization for each
-trajectory. All environments provide next to the cumulative episode reward all collected information from each
-step as part of the info dictionary. This information should, however, mainly be used for debugging and logging.
+framework we support all of this also for the contextual setting, i.e. we expose a subset of the observation space
+as a single context in the beginning of the episode. This requires to predict a new action/MP parametrization for each
+context.
+All environments provide next to the cumulative episode reward all collected information from each
+step as part of the info dictionary. This information is, however, mainly meant for debugging as well as logging
+and not for training.
 
 |Key| Description|
 |---|---|
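The black-box interface described in the README hunk above boils down to: reset once to obtain the context, then submit a single MP parametrization and receive the cumulative episode reward. A minimal, hedged sketch of that usage; the environment id below is a placeholder and not taken from this commit:

```python
import gym

# Placeholder id: substitute any episodic MP environment registered by this package.
env = gym.make("ALRBeerPongProMP-v0")

context = env.reset()                # the context is exposed once, at episode start
params = env.action_space.sample()   # one MP parametrization for the whole trajectory
obs, episode_reward, done, info = env.step(params)  # internally rolls out the full trajectory

assert done                          # episodic MP environments finish after a single step
print(episode_reward, sorted(info.keys()))  # cumulative reward plus per-step debug/logging info
```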
@@ -4,10 +4,11 @@ import numpy as np
 from gym.envs.mujoco import MujocoEnv
 
 
 
 class ALRBallInACupEnv(MujocoEnv, utils.EzPickle):
-    def __init__(self, n_substeps=4, apply_gravity_comp=True, simplified: bool = False,
-                 reward_type: str = None, context: np.ndarray = None):
+    def __init__(
+            self, n_substeps=4, apply_gravity_comp=True, simplified: bool = False,
+            reward_type: str = None, context: np.ndarray = None
+    ):
         utils.EzPickle.__init__(**locals())
         self._steps = 0
 
@@ -23,9 +24,7 @@ class ALRBallInACupEnv(MujocoEnv, utils.EzPickle):
 
         self.context = context
 
-        alr_mujoco_env.AlrMujocoEnv.__init__(self,
-                                             self.xml_path,
-                                             apply_gravity_comp=apply_gravity_comp,
+        alr_mujoco_env.AlrMujocoEnv.__init__(self, self.xml_path, apply_gravity_comp=apply_gravity_comp,
                                              n_substeps=n_substeps)
         self._start_pos = np.array([0.0, 0.58760536, 0.0, 1.36004913, 0.0, -0.32072943, -1.57])
         self._start_vel = np.zeros(7)
@@ -129,7 +128,7 @@ class ALRBallInACupEnv(MujocoEnv, utils.EzPickle):
             np.sin(theta),
             # self.get_body_com("target"),  # only return target to make problem harder
             [self._steps],
         ])
 
     # TODO
     @property
@@ -139,7 +138,7 @@ class ALRBallInACupEnv(MujocoEnv, utils.EzPickle):
             [False] * 7,  # sin
             # [True] * 2,  # x-y coordinates of target distance
             [False]  # env steps
         ])
 
     # These functions are for the task with 3 joint actuations
     def extend_des_pos(self, des_pos):
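The boolean mask shown in the hunk above marks which entries of the step-based observation are exposed as the episode context. A small, hedged sketch of that selection; the mask layout here is illustrative and not copied from any wrapper in this commit:

```python
import numpy as np

# Illustrative mask: cos/sin of 7 joints are hidden, a 2-D goal position is exposed,
# the step counter is hidden.
active_obs = np.hstack([
    [False] * 7,   # cos(theta)
    [False] * 7,   # sin(theta)
    [True] * 2,    # goal position -> context
    [False],       # env steps
])

full_obs = np.random.randn(active_obs.size)   # stand-in for env._get_obs()
context = full_obs[active_obs]                # what the episodic policy sees at reset
print(context.shape)                          # (2,)
```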
@@ -67,7 +67,7 @@ class BallInACupReward(alr_reward_fct.AlrReward):
         action_cost = np.sum(np.square(action))
         self.action_costs.append(action_cost)
 
-        self._is_collided = self.check_collision(self.env.sim) or self.env.check_traj_in_joint_limits()
+        self._is_collided = self.check_collision(self.env.sim) or self.env._check_traj_in_joint_limits()
 
         if self.env._steps == self.env.ep_length - 1 or self._is_collided:
             t_min_dist = np.argmin(self.dists)
@@ -1,11 +1,11 @@
-import mujoco_py.builder
 import os
 
+import mujoco_py.builder
 import numpy as np
-from gym import utils, spaces
+from gym import utils
 from gym.envs.mujoco import MujocoEnv
-from alr_envs.alr.mujoco.beerpong.beerpong_reward_staged import BeerPongReward
 
+from alr_envs.alr.mujoco.beerpong.beerpong_reward_staged import BeerPongReward
 
 CUP_POS_MIN = np.array([-1.42, -4.05])
 CUP_POS_MAX = np.array([1.42, -1.25])
@@ -18,12 +18,14 @@ CUP_POS_MAX = np.array([1.42, -1.25])
 # CUP_POS_MIN = np.array([-0.16, -2.2])
 # CUP_POS_MAX = np.array([0.16, -1.7])
 
 
 class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
-    def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False,
-                 rndm_goal=False, cup_goal_pos=None):
+    def __init__(
+            self, frame_skip=1, apply_gravity_comp=True, noisy=False,
+            rndm_goal=False, cup_goal_pos=None
+    ):
 
         cup_goal_pos = np.array(cup_goal_pos if cup_goal_pos is not None else [-0.3, -1.2, 0.840])
-        if cup_goal_pos.shape[0]==2:
+        if cup_goal_pos.shape[0] == 2:
             cup_goal_pos = np.insert(cup_goal_pos, 2, 0.840)
         self.cup_goal_pos = np.array(cup_goal_pos)
 
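With rndm_goal=True the cup position is resampled inside the CUP_POS_MIN/CUP_POS_MAX bounds shown above. A hedged sketch of what such a resampling step could look like; the function name and RNG handling are assumptions, only the bounds and the 0.840 table height come from this diff:

```python
import numpy as np

CUP_POS_MIN = np.array([-1.42, -4.05])
CUP_POS_MAX = np.array([1.42, -1.25])
TABLE_HEIGHT = 0.840  # z of the default cup_goal_pos above

def sample_cup_goal(rng: np.random.Generator) -> np.ndarray:
    """Draw a random x/y cup position and append the fixed table height."""
    xy = rng.uniform(CUP_POS_MIN, CUP_POS_MAX)
    return np.concatenate([xy, [TABLE_HEIGHT]])

print(sample_cup_goal(np.random.default_rng(0)))
```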
@@ -50,7 +52,7 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
         # self._release_step = 130  # time step of ball release
         self.release_step = 100  # time step of ball release
 
-        self.ep_length = 600//frame_skip
+        self.ep_length = 600 // frame_skip
         self.cup_table_id = 10
 
         if noisy:
@@ -71,14 +73,6 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
     def start_vel(self):
         return self._start_vel
 
-    @property
-    def current_pos(self):
-        return self.sim.data.qpos[0:7].copy()
-
-    @property
-    def current_vel(self):
-        return self.sim.data.qvel[0:7].copy()
-
     def reset(self):
         self.reward_function.reset(self.add_noise)
         return super().reset()
@@ -122,7 +116,7 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
                 self.sim.data.qpos[7::] = self.sim.data.site_xpos[self.ball_site_id, :].copy()
                 self.sim.data.qvel[7::] = self.sim.data.site_xvelp[self.ball_site_id, :].copy()
             elif self._steps == self.release_step and self.add_noise:
                 self.sim.data.qvel[7::] += self.noise_std * np.random.randn(3)
             crash = False
         except mujoco_py.builder.MujocoException:
             crash = True
@@ -147,29 +141,32 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
             ball_pos = np.zeros(3)
             ball_vel = np.zeros(3)
 
-        infos = dict(reward_dist=reward_dist,
-                     reward=reward,
-                     velocity=angular_vel,
-                     # traj=self._q_pos,
-                     action=a,
-                     q_pos=self.sim.data.qpos[0:7].ravel().copy(),
-                     q_vel=self.sim.data.qvel[0:7].ravel().copy(),
-                     ball_pos=ball_pos,
-                     ball_vel=ball_vel,
-                     success=success,
-                     is_collided=is_collided, sim_crash=crash,
-                     table_contact_first=int(not self.reward_function.ball_ground_contact_first))
+        infos = dict(
+            reward_dist=reward_dist,
+            reward=reward,
+            velocity=angular_vel,
+            # traj=self._q_pos,
+            action=a,
+            q_pos=self.sim.data.qpos[0:7].ravel().copy(),
+            q_vel=self.sim.data.qvel[0:7].ravel().copy(),
+            ball_pos=ball_pos,
+            ball_vel=ball_vel,
+            success=success,
+            is_collided=is_collided, sim_crash=crash,
+            table_contact_first=int(not self.reward_function.ball_ground_contact_first)
+        )
         infos.update(reward_infos)
         return ob, reward, done, infos
 
-    def check_traj_in_joint_limits(self):
+    def _check_traj_in_joint_limits(self):
         return any(self.current_pos > self.j_max) or any(self.current_pos < self.j_min)
 
     def _get_obs(self):
         theta = self.sim.data.qpos.flat[:7]
         theta_dot = self.sim.data.qvel.flat[:7]
         ball_pos = self.sim.data.body_xpos[self.sim.model._body_name2id["ball"]].copy()
-        cup_goal_diff_final = ball_pos - self.sim.data.site_xpos[self.sim.model._site_name2id["cup_goal_final_table"]].copy()
+        cup_goal_diff_final = ball_pos - self.sim.data.site_xpos[
+            self.sim.model._site_name2id["cup_goal_final_table"]].copy()
         cup_goal_diff_top = ball_pos - self.sim.data.site_xpos[self.sim.model._site_name2id["cup_goal_table"]].copy()
         return np.concatenate([
             np.cos(theta),
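The info dictionary assembled above is the per-step debugging/logging payload that the README refers to. A hedged sketch of how a loop could consume it; the construction mirrors the `__main__` block further down in this diff and only uses keys defined in the dict above:

```python
# Hedged sketch: run one episode and log a few of the info keys built above.
env = ALRBeerBongEnvFixedReleaseStep(frame_skip=2, rndm_goal=True)

obs = env.reset()
done = False
while not done:
    obs, reward, done, info = env.step(env.action_space.sample())

print("success:", info["success"],
      "collided:", info["is_collided"],
      "ball position:", info["ball_pos"])
```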
@@ -179,17 +176,21 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
             cup_goal_diff_top,
             self.sim.model.body_pos[self.cup_table_id][:2].copy(),
             [self._steps],
         ])
 
+    def compute_reward(self):
+
     @property
     def dt(self):
-        return super(ALRBeerBongEnv, self).dt*self.repeat_action
+        return super(ALRBeerBongEnv, self).dt * self.repeat_action
 
 
 class ALRBeerBongEnvFixedReleaseStep(ALRBeerBongEnv):
     def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None):
         super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos)
         self.release_step = 62  # empirically evaluated for frame_skip=2!
 
 
 class ALRBeerBongEnvStepBasedEpisodicReward(ALRBeerBongEnv):
     def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None):
         super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos)
@@ -202,54 +203,21 @@ class ALRBeerBongEnvStepBasedEpisodicReward(ALRBeerBongEnv):
             reward = 0
             done = False
             while not done:
-                sub_ob, sub_reward, done, sub_infos = super(ALRBeerBongEnvStepBasedEpisodicReward, self).step(np.zeros(a.shape))
+                sub_ob, sub_reward, done, sub_infos = super(ALRBeerBongEnvStepBasedEpisodicReward, self).step(
+                    np.zeros(a.shape))
                 reward += sub_reward
             infos = sub_infos
             ob = sub_ob
             ob[-1] = self.release_step + 1  # Since we simulate until the end of the episode, PPO does not see the
             # internal steps and thus, the observation also needs to be set correctly
         return ob, reward, done, infos
 
 
-# class ALRBeerBongEnvStepBasedEpisodicReward(ALRBeerBongEnv):
-#     def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None):
-#         super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos)
-#         self.release_step = 62  # empirically evaluated for frame_skip=2!
-#
-#     def step(self, a):
-#         if self._steps < self.release_step:
-#             return super(ALRBeerBongEnvStepBasedEpisodicReward, self).step(a)
-#         else:
-#             sub_ob, sub_reward, done, sub_infos = super(ALRBeerBongEnvStepBasedEpisodicReward, self).step(np.zeros(a.shape))
-#             reward = sub_reward
-#             infos = sub_infos
-#             ob = sub_ob
-#             ob[-1] = self.release_step + 1 # Since we simulate until the end of the episode, PPO does not see the
-#             # internal steps and thus, the observation also needs to be set correctly
-#             return ob, reward, done, infos
 
 
 class ALRBeerBongEnvStepBased(ALRBeerBongEnv):
     def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None):
         super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos)
         self.release_step = 62  # empirically evaluated for frame_skip=2!
 
-    # def _set_action_space(self):
-    #     bounds = super(ALRBeerBongEnvStepBased, self)._set_action_space()
-    #     min_bound = np.concatenate(([-1], bounds.low), dtype=bounds.dtype)
-    #     max_bound = np.concatenate(([1], bounds.high), dtype=bounds.dtype)
-    #     self.action_space = spaces.Box(low=min_bound, high=max_bound, dtype=bounds.dtype)
-    #     return self.action_space
-
-    # def step(self, a):
-    #     self.release_step = self._steps if a[0]>=0 and self.release_step >= self._steps else self.release_step
-    #     return super(ALRBeerBongEnvStepBased, self).step(a[1:])
-    #
-    # def reset(self):
-    #     ob = super(ALRBeerBongEnvStepBased, self).reset()
-    #     self.release_step = self.ep_length + 1
-    #     return ob
-
     def step(self, a):
         if self._steps < self.release_step:
             return super(ALRBeerBongEnvStepBased, self).step(a)
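The step-based episodic variant above applies a single agent action and then rolls the simulation to the end of the episode with zero torques, reporting only the accumulated return. A hedged, environment-agnostic sketch of that pattern; the helper name is made up and not part of the package API:

```python
import numpy as np

def episodic_return(env, action):
    """Apply one action, then passively simulate to the episode end and sum rewards."""
    ob, reward, done, info = env.step(action)
    total = reward
    while not done:
        ob, sub_reward, done, info = env.step(np.zeros_like(action))
        total += sub_reward
    return ob, total, info
```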
@@ -267,9 +235,9 @@ class ALRBeerBongEnvStepBased(ALRBeerBongEnv):
                 cup_goal_dist_top = np.linalg.norm(ball_pos - self.sim.data.site_xpos[
                     self.sim.model._site_name2id["cup_goal_table"]].copy())
                 if sub_infos['success']:
-                    dist_rew = -cup_goal_dist_final**2
+                    dist_rew = -cup_goal_dist_final ** 2
                 else:
-                    dist_rew = -0.5*cup_goal_dist_final**2 - cup_goal_dist_top**2
+                    dist_rew = -0.5 * cup_goal_dist_final ** 2 - cup_goal_dist_top ** 2
                 reward = reward - sub_infos['action_cost'] + dist_rew
                 infos = sub_infos
                 ob = sub_ob
@@ -278,13 +246,13 @@ class ALRBeerBongEnvStepBased(ALRBeerBongEnv):
         return ob, reward, done, infos
 
 
 
 if __name__ == "__main__":
     # env = ALRBeerBongEnv(rndm_goal=True)
     # env = ALRBeerBongEnvStepBased(frame_skip=2, rndm_goal=True)
     # env = ALRBeerBongEnvStepBasedEpisodicReward(frame_skip=2, rndm_goal=True)
     env = ALRBeerBongEnvFixedReleaseStep(frame_skip=2, rndm_goal=True)
     import time
 
     env.reset()
     env.render("human")
     for i in range(1500):
@@ -29,52 +29,38 @@ class BeerPongReward:
             # "cup_base_table"
         ]
 
 
-        self.ball_traj = None
         self.dists = None
         self.dists_final = None
-        self.costs = None
         self.action_costs = None
-        self.angle_rewards = None
-        self.cup_angles = None
-        self.cup_z_axes = None
-        self.collision_penalty = 500
-        self.reset(None)
+        self.reset()
         self.is_initialized = False
 
-    def reset(self, noisy):
-        self.ball_traj = []
+    def reset(self):
         self.dists = []
         self.dists_final = []
-        self.costs = []
         self.action_costs = []
-        self.angle_rewards = []
-        self.cup_angles = []
-        self.cup_z_axes = []
         self.ball_ground_contact_first = False
         self.ball_table_contact = False
         self.ball_wall_contact = False
         self.ball_cup_contact = False
         self.ball_in_cup = False
         self.dist_ground_cup = -1  # distance floor to cup if first floor contact
-        self.noisy_bp = noisy
-        self._t_min_final_dist = -1
 
     def initialize(self, env):
 
         if not self.is_initialized:
             self.is_initialized = True
             self.ball_id = env.sim.model._body_name2id["ball"]
-            self.ball_collision_id = env.sim.model._geom_name2id["ball_geom"]
             self.goal_id = env.sim.model._site_name2id["cup_goal_table"]
             self.goal_final_id = env.sim.model._site_name2id["cup_goal_final_table"]
-            self.cup_collision_ids = [env.sim.model._geom_name2id[name] for name in self.cup_collision_objects]
             self.cup_table_id = env.sim.model._body_name2id["cup_table"]
 
+            self.ball_collision_id = env.sim.model._geom_name2id["ball_geom"]
             self.table_collision_id = env.sim.model._geom_name2id["table_contact_geom"]
             self.wall_collision_id = env.sim.model._geom_name2id["wall"]
             self.cup_table_collision_id = env.sim.model._geom_name2id["cup_base_table_contact"]
-            self.init_ball_pos_site_id = env.sim.model._site_name2id["init_ball_pos_site"]
             self.ground_collision_id = env.sim.model._geom_name2id["ground"]
+            self.cup_collision_ids = [env.sim.model._geom_name2id[name] for name in self.cup_collision_objects]
             self.robot_collision_ids = [env.sim.model._geom_name2id[name] for name in self.robot_collision_objects]
 
     def compute_reward(self, env, action):
@@ -87,7 +73,7 @@ class BeerPongReward:
         self.check_contacts(env.sim)
         self.dists.append(np.linalg.norm(goal_pos - ball_pos))
         self.dists_final.append(np.linalg.norm(goal_final_pos - ball_pos))
-        self.dist_ground_cup = np.linalg.norm(ball_pos-goal_pos) \
+        self.dist_ground_cup = np.linalg.norm(ball_pos - goal_pos) \
             if self.ball_ground_contact_first and self.dist_ground_cup == -1 else self.dist_ground_cup
         action_cost = np.sum(np.square(action))
         self.action_costs.append(np.copy(action_cost))
@@ -99,7 +85,7 @@ class BeerPongReward:
             min_dist = np.min(self.dists)
             final_dist = self.dists_final[-1]
             if self.ball_ground_contact_first:
                 min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 1, 0.5, 2, -4  # relative rew offset when first bounding on ground
                 # min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 1, 0.5, 0, -6  # absolute rew offset when first bouncing on ground
             else:
                 if not self.ball_in_cup:
@@ -108,18 +94,18 @@ class BeerPongReward:
                     else:
                         min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 1, 0.5, 0, -2
                 else:
-                    min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 0, 1, 0 ,0
+                    min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 0, 1, 0, 0
             # dist_ground_cup = 1 * self.dist_ground_cup
             action_cost = 1e-4 * np.mean(action_cost)
             reward = rew_offset - min_dist_coeff * min_dist ** 2 - final_dist_coeff * final_dist ** 2 - \
-                     action_cost - ground_contact_dist_coeff*self.dist_ground_cup ** 2
+                     action_cost - ground_contact_dist_coeff * self.dist_ground_cup ** 2
             # 1e-7*np.mean(action_cost)
             # release step punishment
             min_time_bound = 0.1
             max_time_bound = 1.0
-            release_time = env.release_step*env.dt
-            release_time_rew = int(release_time<min_time_bound)*(-30-10*(release_time-min_time_bound)**2) \
-                               +int(release_time>max_time_bound)*(-30-10*(release_time-max_time_bound)**2)
+            release_time = env.release_step * env.dt
+            release_time_rew = int(release_time < min_time_bound) * (-30 - 10 * (release_time - min_time_bound) ** 2) \
+                               + int(release_time > max_time_bound) * (-30 - 10 * (release_time - max_time_bound) ** 2)
             reward += release_time_rew
             success = self.ball_in_cup
             # print('release time :', release_time)
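Written out, the release-time term above penalizes releasing the ball outside the [0.1 s, 1.0 s] window with a constant offset of -30 plus a quadratic term in the overshoot. A quick numeric check; the values follow directly from the expression in the hunk:

```python
def release_time_reward(release_time, min_time_bound=0.1, max_time_bound=1.0):
    # Same expression as in compute_reward above, only reformatted.
    return (int(release_time < min_time_bound) * (-30 - 10 * (release_time - min_time_bound) ** 2)
            + int(release_time > max_time_bound) * (-30 - 10 * (release_time - max_time_bound) ** 2))

print(release_time_reward(0.5))   # 0.0     -> inside the allowed window, no penalty
print(release_time_reward(0.05))  # -30.025 -> released too early
print(release_time_reward(1.5))   # -32.5   -> released too late
```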
@@ -147,17 +133,17 @@ class BeerPongReward:
                                                                         self.table_collision_id)
         if not self.ball_cup_contact:
             self.ball_cup_contact = self._check_collision_with_set_of_objects(sim, self.ball_collision_id,
                                                                               self.cup_collision_ids)
         if not self.ball_wall_contact:
             self.ball_wall_contact = self._check_collision_single_objects(sim, self.ball_collision_id,
                                                                           self.wall_collision_id)
         if not self.ball_in_cup:
             self.ball_in_cup = self._check_collision_single_objects(sim, self.ball_collision_id,
                                                                     self.cup_table_collision_id)
         if not self.ball_ground_contact_first:
             if not self.ball_table_contact and not self.ball_cup_contact and not self.ball_wall_contact and not self.ball_in_cup:
                 self.ball_ground_contact_first = self._check_collision_single_objects(sim, self.ball_collision_id,
                                                                                       self.ground_collision_id)
 
     def _check_collision_single_objects(self, sim, id_1, id_2):
         for coni in range(0, sim.data.ncon):
@@ -1,10 +1,15 @@
-from alr_envs.mp.episodic_wrapper import EpisodicWrapper
-from typing import Union, Tuple
+from typing import Tuple, Union
+
 import numpy as np
-import gym
+
+from alr_envs.mp.episodic_wrapper import EpisodicWrapper
 
 
 class NewMPWrapper(EpisodicWrapper):
+
+    # def __init__(self, replanning_model):
+    #     self.replanning_model = replanning_model
+
     @property
     def current_pos(self) -> Union[float, int, np.ndarray, Tuple]:
         return self.env.sim.data.qpos[0:7].copy()
@@ -22,26 +27,16 @@ class NewMPWrapper(EpisodicWrapper):
             [False] * 3,  # cup_goal_diff_top
             [True] * 2,  # xy position of cup
             [False]  # env steps
         ])
 
-    # def set_mp_action_space(self):
-    #     min_action_bounds, max_action_bounds = self.mp.get_param_bounds()
-    #     if self.mp.learn_tau:
-    #         min_action_bounds[0] = 20*self.env.dt
-    #         max_action_bounds[0] = 260*self.env.dt
-    #     mp_action_space = gym.spaces.Box(low=min_action_bounds.numpy(), high=max_action_bounds.numpy(),
-    #                                      dtype=np.float32)
-    #     return mp_action_space
-
-    # def _step_callback(self, t: int, env_spec_params: Union[np.ndarray, None], step_action: np.ndarray) -> Union[np.ndarray]:
-    #     if self.mp.learn_tau:
-    #         return np.concatenate((step_action, np.atleast_1d(env_spec_params)))
-    #     else:
-    #         return step_action
+    def do_replanning(self, pos, vel, s, a, t, last_replan_step):
+        return False
+        # const = np.arange(0, 1000, 10)
+        # return bool(self.replanning_model(s))
 
     def _episode_callback(self, action: np.ndarray) -> Tuple[np.ndarray, Union[np.ndarray, None]]:
         if self.mp.learn_tau:
-            self.env.env.release_step = action[0]/self.env.dt  # Tau value
+            self.env.env.release_step = action[0] / self.env.dt  # Tau value
             return action, None
         else:
             return action, None
@@ -52,12 +47,3 @@ class NewMPWrapper(EpisodicWrapper):
         xyz[-1] = 0.840
         self.env.env.model.body_pos[self.env.env.cup_table_id] = xyz
         return self.get_observation_from_step(self.env.env._get_obs())
-
-    # def set_action_space(self):
-    #     if self.mp.learn_tau:
-    #         min_action_bounds, max_action_bounds = self.mp.get_param_bounds()
-    #         min_action_bounds = np.concatenate((min_action_bounds.numpy(), [self.env.action_space.low[-1]]))
-    #         max_action_bounds = np.concatenate((max_action_bounds.numpy(), [self.env.action_space.high[-1]]))
-    #         self.action_space = gym.spaces.Box(low=min_action_bounds, high=max_action_bounds, dtype=np.float32)
-    #         return self.action_space
-    #     else:
-    #         return super(NewMPWrapper, self).set_action_space()
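In `_episode_callback` above, when the MP learns its duration (learn_tau), the first action entry is interpreted as the release time tau and converted to a simulation step via the environment's dt. A hedged numeric illustration of that conversion; the dt value is assumed for illustration only:

```python
# Assumed control timestep for illustration; the real value depends on frame_skip.
dt = 0.02

tau = 1.24                    # learned release time in seconds (first action entry)
release_step = tau / dt       # what _episode_callback assigns to env.release_step
print(release_step)           # 62.0 -> matches the "release_step = 62" default above
```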
@@ -14,20 +14,23 @@ class ALRHopperJumpEnv(HopperEnv):
     - exclude current positions from observatiosn is set to False
     """
 
-    def __init__(self,
-                 xml_file='hopper_jump.xml',
-                 forward_reward_weight=1.0,
-                 ctrl_cost_weight=1e-3,
-                 healthy_reward=0.0,
-                 penalty=0.0,
-                 context=True,
-                 terminate_when_unhealthy=False,
-                 healthy_state_range=(-100.0, 100.0),
-                 healthy_z_range=(0.5, float('inf')),
-                 healthy_angle_range=(-float('inf'), float('inf')),
-                 reset_noise_scale=5e-3,
-                 exclude_current_positions_from_observation=False,
-                 max_episode_steps=250):
+    def __init__(
+            self,
+            xml_file='hopper_jump.xml',
+            forward_reward_weight=1.0,
+            ctrl_cost_weight=1e-3,
+            healthy_reward=0.0,
+            penalty=0.0,
+            context=True,
+            terminate_when_unhealthy=False,
+            healthy_state_range=(-100.0, 100.0),
+            healthy_z_range=(0.5, float('inf')),
+            healthy_angle_range=(-float('inf'), float('inf')),
+            reset_noise_scale=5e-3,
+            exclude_current_positions_from_observation=False,
+            max_episode_steps=250
+    ):
+
         self.current_step = 0
         self.max_height = 0
         self.max_episode_steps = max_episode_steps
@@ -59,22 +62,14 @@ class ALRHopperJumpEnv(HopperEnv):
         done = False
         rewards = 0
         if self.current_step >= self.max_episode_steps:
-            hight_goal_distance = -10*np.linalg.norm(self.max_height - self.goal) if self.context else self.max_height
+            hight_goal_distance = -10 * np.linalg.norm(self.max_height - self.goal) if self.context else self.max_height
             healthy_reward = 0 if self.context else self.healthy_reward * 2  # self.current_step
             height_reward = self._forward_reward_weight * hight_goal_distance  # maybe move reward calculation into if structure and define two different _forward_reward_weight variables for context and episodic seperatley
             rewards = height_reward + healthy_reward
 
-        # else:
-        #     # penalty for wrong start direction of first two joints; not needed, could be removed
-        #     rewards = ((action[:2] > 0) * self.penalty).sum() if self.current_step < 10 else 0
-
         observation = self._get_obs()
         reward = rewards - costs
-        # info = {
-        #     'height' : height_after,
-        #     'max_height': self.max_height,
-        #     'goal' : self.goal
-        # }
         info = {
             'height': height_after,
             'x_pos': site_pos_after,
@@ -82,7 +77,7 @@ class ALRHopperJumpEnv(HopperEnv):
             'height_rew': self.max_height,
             'healthy_reward': self.healthy_reward * 2,
             'healthy': self.is_healthy
         }
 
         return observation, reward, done, info
 
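In the contextual setting the hopper only receives its height-goal reward once, at the final step of the episode; before that, only the cost term enters the per-step reward. A hedged restatement of that schedule with simplified variable names, not the exact ones used above:

```python
import numpy as np

def hopper_jump_reward(step, max_episode_steps, max_height, goal, costs,
                       forward_reward_weight=1.0):
    """Sparse contextual reward: -10 * |max_height - goal| paid out only at the last step."""
    rewards = 0.0
    if step >= max_episode_steps:
        rewards = forward_reward_weight * (-10 * np.linalg.norm(max_height - goal))
    return rewards - costs
```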
@@ -90,7 +85,7 @@ class ALRHopperJumpEnv(HopperEnv):
         return np.append(super()._get_obs(), self.goal)
 
     def reset(self):
         self.goal = self.np_random.uniform(1.4, 2.16, 1)[0]  # 1.3 2.3
         self.max_height = 0
         self.current_step = 0
         return super().reset()
@@ -98,8 +93,8 @@ class ALRHopperJumpEnv(HopperEnv):
     # overwrite reset_model to make it deterministic
     def reset_model(self):
 
         qpos = self.init_qpos  # + self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nq)
         qvel = self.init_qvel  # + self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nv)
 
         self.set_state(qpos, qvel)
 
@@ -147,7 +142,7 @@ class ALRHopperXYJumpEnv(ALRHopperJumpEnv):
             self.contact_with_floor = floor_contact
 
         if self.contact_dist is None and self.contact_with_floor:
            self.contact_dist = np.linalg.norm(self.sim.data.site_xpos[self.model.site_name2id('foot_site')]
                                               - np.array([self.goal, 0, 0]))
 
         ctrl_cost = self.control_cost(action)
@@ -157,9 +152,9 @@ class ALRHopperXYJumpEnv(ALRHopperJumpEnv):
         rewards = 0
         if self.current_step >= self.max_episode_steps:
             # healthy_reward = 0 if self.context else self.healthy_reward * self.current_step
-            healthy_reward = self.healthy_reward * 2 #* self.current_step
+            healthy_reward = self.healthy_reward * 2  # * self.current_step
             contact_dist = self.contact_dist if self.contact_dist is not None else 5
-            dist_reward = self._forward_reward_weight * (-3*goal_dist + 10*self.max_height - 2*contact_dist)
+            dist_reward = self._forward_reward_weight * (-3 * goal_dist + 10 * self.max_height - 2 * contact_dist)
            rewards = dist_reward + healthy_reward
 
         observation = self._get_obs()
@@ -174,7 +169,7 @@ class ALRHopperXYJumpEnv(ALRHopperJumpEnv):
             'healthy_reward': self.healthy_reward * 2,
             'healthy': self.is_healthy,
             'contact_dist': self.contact_dist if self.contact_dist is not None else 0
         }
         return observation, reward, done, info
 
     def reset_model(self):
@@ -225,25 +220,28 @@ class ALRHopperXYJumpEnv(ALRHopperJumpEnv):
         self.sim.model.body_pos[self.sim.model.body_name2id('goal_site_body')] = np.array([self.goal, 0, 0])
         return self._get_obs()
 
 
 class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv):
 
-    def __init__(self,
-                 xml_file='hopper_jump.xml',
-                 forward_reward_weight=1.0,
-                 ctrl_cost_weight=1e-3,
-                 healthy_reward=0.0,
-                 penalty=0.0,
-                 context=True,
-                 terminate_when_unhealthy=False,
-                 healthy_state_range=(-100.0, 100.0),
-                 healthy_z_range=(0.5, float('inf')),
-                 healthy_angle_range=(-float('inf'), float('inf')),
-                 reset_noise_scale=5e-3,
-                 exclude_current_positions_from_observation=False,
-                 max_episode_steps=250,
-                 height_scale = 10,
-                 dist_scale = 3,
-                 healthy_scale = 2):
+    def __init__(
+            self,
+            xml_file='hopper_jump.xml',
+            forward_reward_weight=1.0,
+            ctrl_cost_weight=1e-3,
+            healthy_reward=0.0,
+            penalty=0.0,
+            context=True,
+            terminate_when_unhealthy=False,
+            healthy_state_range=(-100.0, 100.0),
+            healthy_z_range=(0.5, float('inf')),
+            healthy_angle_range=(-float('inf'), float('inf')),
+            reset_noise_scale=5e-3,
+            exclude_current_positions_from_observation=False,
+            max_episode_steps=250,
+            height_scale=10,
+            dist_scale=3,
+            healthy_scale=2
+    ):
         self.height_scale = height_scale
         self.dist_scale = dist_scale
         self.healthy_scale = healthy_scale
@@ -263,15 +261,14 @@ class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv):
         ctrl_cost = self.control_cost(action)
 
         healthy_reward = self.healthy_reward * self.healthy_scale
-        height_reward = self.height_scale*height_after
+        height_reward = self.height_scale * height_after
         goal_dist = np.atleast_1d(np.linalg.norm(site_pos_after - np.array([self.goal, 0, 0], dtype=object)))[0]
-        goal_dist_reward = -self.dist_scale*goal_dist
+        goal_dist_reward = -self.dist_scale * goal_dist
         dist_reward = self._forward_reward_weight * (goal_dist_reward + height_reward)
         reward = -ctrl_cost + healthy_reward + dist_reward
         done = False
         observation = self._get_obs()
 
-
         ###########################################################
         # This is only for logging the distance to goal when first having the contact
         ##########################################################
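The step-based variant pays a dense reward every step instead of the sparse end-of-episode reward used further up. Collecting the terms from this hunk, a hedged restatement with the default scales from `__init__`:

```python
def step_based_reward(height, goal_dist, ctrl_cost, healthy_reward,
                      forward_reward_weight=1.0, height_scale=10, dist_scale=3, healthy_scale=2):
    """Dense per-step reward assembled from the terms in the hunk above."""
    dist_reward = forward_reward_weight * (height_scale * height - dist_scale * goal_dist)
    return -ctrl_cost + healthy_scale * healthy_reward + dist_reward
```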
@@ -297,9 +294,10 @@ class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv):
             'healthy_reward': self.healthy_reward * self.healthy_reward,
             'healthy': self.is_healthy,
             'contact_dist': self.contact_dist if self.contact_dist is not None else 0
         }
         return observation, reward, done, info
 
 
+
 class ALRHopperJumpRndmPosEnv(ALRHopperJumpEnv):
     def __init__(self, max_episode_steps=250):
         super(ALRHopperJumpRndmPosEnv, self).__init__(exclude_current_positions_from_observation=False,
@@ -309,14 +307,14 @@ class ALRHopperJumpRndmPosEnv(ALRHopperJumpEnv):
     def reset_model(self):
         self._floor_geom_id = self.model.geom_name2id('floor')
         self._foot_geom_id = self.model.geom_name2id('foot_geom')
-        noise_low = -np.ones(self.model.nq)*self._reset_noise_scale
+        noise_low = -np.ones(self.model.nq) * self._reset_noise_scale
         noise_low[1] = 0
         noise_low[2] = 0
         noise_low[3] = -0.2
         noise_low[4] = -0.2
         noise_low[5] = -0.1
 
-        noise_high = np.ones(self.model.nq)*self._reset_noise_scale
+        noise_high = np.ones(self.model.nq) * self._reset_noise_scale
         noise_high[1] = 0
         noise_high[2] = 0
         noise_high[3] = 0
@@ -325,7 +323,7 @@ class ALRHopperJumpRndmPosEnv(ALRHopperJumpEnv):
 
         rnd_vec = self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nq)
         # rnd_vec[2] *= 0.05  # the angle around the y axis shouldn't be too high as the agent then falls down quickly and
         # can not recover
         # rnd_vec[1] = np.clip(rnd_vec[1], 0, 0.3)
         qpos = self.init_qpos + rnd_vec
         qvel = self.init_qvel
@@ -341,7 +339,7 @@ class ALRHopperJumpRndmPosEnv(ALRHopperJumpEnv):
         self.do_simulation(action, self.frame_skip)
 
         self.contact_with_floor = self._contact_checker(self._floor_geom_id, self._foot_geom_id) if not \
             self.contact_with_floor else True
 
         height_after = self.get_body_com("torso")[2]
         self.max_height = max(height_after, self.max_height) if self.contact_with_floor else 0
@@ -365,12 +363,11 @@ class ALRHopperJumpRndmPosEnv(ALRHopperJumpEnv):
             'height': height_after,
             'max_height': self.max_height,
             'goal': self.goal
         }
 
         return observation, reward, done, info
 
 
-
 if __name__ == '__main__':
     render_mode = "human"  # "human" or "partial" or "final"
     # env = ALRHopperJumpEnv()
@@ -9,7 +9,7 @@ from mp_pytorch.mp.mp_interfaces import MPInterface
 from alr_envs.mp.controllers.base_controller import BaseController
 
 
-class EpisodicWrapper(gym.Env, ABC):
+class EpisodicWrapper(gym.Env, gym.wrappers.TransformReward, ABC):
     """
     Base class for movement primitive based gym.Wrapper implementations.
 
@@ -22,14 +22,19 @@ class EpisodicWrapper(gym.Env, ABC):
         weight_scale: Scaling parameter for the actions given to this wrapper
         render_mode: Equivalent to gym render mode
     """
-    def __init__(self,
-                 env: gym.Env,
-                 mp: MPInterface,
-                 controller: BaseController,
-                 duration: float,
-                 render_mode: str = None,
-                 verbose: int = 1,
-                 weight_scale: float = 1):
+
+    def __init__(
+            self,
+            env: gym.Env,
+            mp: MPInterface,
+            controller: BaseController,
+            duration: float,
+            render_mode: str = None,
+            verbose: int = 1,
+            weight_scale: float = 1,
+            sequencing=True,
+            reward_aggregation=np.mean,
+    ):
         super().__init__()
 
         self.env = env
@@ -40,11 +45,11 @@ class EpisodicWrapper(gym.Env, ABC):
         self.duration = duration
         self.traj_steps = int(duration / self.dt)
         self.post_traj_steps = self.env.spec.max_episode_steps - self.traj_steps
+        # duration = self.env.max_episode_steps * self.dt
 
-        self.controller = controller
         self.mp = mp
         self.env = env
-        self.verbose = verbose
+        self.controller = controller
         self.weight_scale = weight_scale
 
         # rendering
@@ -52,15 +57,18 @@ class EpisodicWrapper(gym.Env, ABC):
         self.render_kwargs = {}
         self.time_steps = np.linspace(0, self.duration, self.traj_steps)
         self.mp.set_mp_times(self.time_steps)
+        # self.mp.set_mp_duration(self.time_steps, dt)
         # action_bounds = np.inf * np.ones((np.prod(self.mp.num_params)))
-        self.mp_action_space = self.set_mp_action_space()
+        self.mp_action_space = self.get_mp_action_space()
 
-        self.action_space = self.set_action_space()
+        self.action_space = self.get_action_space()
         self.active_obs = self.set_active_obs()
         self.observation_space = spaces.Box(low=self.env.observation_space.low[self.active_obs],
                                             high=self.env.observation_space.high[self.active_obs],
                                             dtype=self.env.observation_space.dtype)
 
+        self.verbose = verbose
+
     def get_trajectory(self, action: np.ndarray) -> Tuple:
         # TODO: this follows the implementation of the mp_pytorch library which includes the parameters tau and delay at
         # the beginning of the array.
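The wrapper derives its spaces from the MP and the wrapped environment: the action space is a Box spanned by the MP parameter bounds (see `get_mp_action_space` below), and the observation space is the wrapped observation space masked by `active_obs`. A hedged, standalone sketch of that construction; the bounds and mask here are made up for illustration:

```python
import numpy as np
from gym import spaces

# Illustrative MP parameter bounds (e.g. basis-function weights).
min_bounds = -np.ones(10, dtype=np.float32)
max_bounds = np.ones(10, dtype=np.float32)
mp_action_space = spaces.Box(low=min_bounds, high=max_bounds, dtype=np.float32)

# Illustrative context mask over a 5-dimensional step-based observation.
active_obs = np.array([True, True, False, False, True])
step_obs_space = spaces.Box(low=-np.inf, high=np.inf, shape=(5,), dtype=np.float64)
observation_space = spaces.Box(low=step_obs_space.low[active_obs],
                               high=step_obs_space.high[active_obs],
                               dtype=step_obs_space.dtype)
print(mp_action_space.shape, observation_space.shape)  # (10,) (3,)
```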
@ -69,33 +77,37 @@ class EpisodicWrapper(gym.Env, ABC):
|
|||||||
scaled_mp_params[ignore_indices:] *= self.weight_scale
|
scaled_mp_params[ignore_indices:] *= self.weight_scale
|
||||||
self.mp.set_params(np.clip(scaled_mp_params, self.mp_action_space.low, self.mp_action_space.high))
|
self.mp.set_params(np.clip(scaled_mp_params, self.mp_action_space.low, self.mp_action_space.high))
|
||||||
self.mp.set_boundary_conditions(bc_time=self.time_steps[:1], bc_pos=self.current_pos, bc_vel=self.current_vel)
|
self.mp.set_boundary_conditions(bc_time=self.time_steps[:1], bc_pos=self.current_pos, bc_vel=self.current_vel)
|
||||||
traj_dict = self.mp.get_mp_trajs(get_pos = True, get_vel = True)
|
traj_dict = self.mp.get_mp_trajs(get_pos=True, get_vel=True)
|
||||||
trajectory_tensor, velocity_tensor = traj_dict['pos'], traj_dict['vel']
|
trajectory_tensor, velocity_tensor = traj_dict['pos'], traj_dict['vel']
|
||||||
|
|
||||||
trajectory = trajectory_tensor.numpy()
|
trajectory = trajectory_tensor.numpy()
|
||||||
velocity = velocity_tensor.numpy()
|
velocity = velocity_tensor.numpy()
|
||||||
|
|
||||||
|
# TODO: Do we need this or does mp_pytorch have this?
|
||||||
if self.post_traj_steps > 0:
|
if self.post_traj_steps > 0:
|
||||||
trajectory = np.vstack([trajectory, np.tile(trajectory[-1, :], [self.post_traj_steps, 1])])
|
trajectory = np.vstack([trajectory, np.tile(trajectory[-1, :], [self.post_traj_steps, 1])])
|
||||||
velocity = np.vstack([velocity, np.zeros(shape=(self.post_traj_steps, self.mp.num_dof))])
|
velocity = np.vstack([velocity, np.zeros(shape=(self.post_traj_steps, self.mp.num_dof))])
|
||||||
|
|
||||||
return trajectory, velocity
|
return trajectory, velocity
|
||||||
|
|
-    def set_mp_action_space(self):
+    def get_mp_action_space(self):
        """This function can be used to set up an individual space for the parameters of the mp."""
        min_action_bounds, max_action_bounds = self.mp.get_param_bounds()
        mp_action_space = gym.spaces.Box(low=min_action_bounds.numpy(), high=max_action_bounds.numpy(),
                                         dtype=np.float32)
        return mp_action_space

-    def set_action_space(self):
+    def get_action_space(self):
        """
        This function can be used to modify the action space for considering actions which are not learned via motion
        primitives. E.g. ball releasing time for the beer pong task. By default, it is the parameter space of the
        motion primitive.
        Only needs to be overwritten if the action space needs to be modified.
        """
-        return self.mp_action_space
+        try:
+            return self.mp_action_space
+        except AttributeError:
+            return self.get_mp_action_space()
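As the docstring notes, tasks with actions that are not generated by the movement primitive (e.g. a ball release time) are expected to override `get_action_space` and split the action in `_episode_callback`. A minimal sketch of such an override; the subclass name and the release-time bounds are hypothetical and not part of this repository:

```python
import gym
import numpy as np


class BallReleaseEpisodicWrapper(EpisodicWrapper):  # hypothetical subclass for illustration
    """Sketch: append one env-specific action (ball release time) to the MP parameter space."""

    def get_action_space(self):
        mp_space = self.get_mp_action_space()
        low = np.concatenate([mp_space.low, [0.0]])     # assumed lower bound for the release time
        high = np.concatenate([mp_space.high, [2.0]])   # assumed upper bound for the release time
        return gym.spaces.Box(low=low, high=high, dtype=np.float32)

    def _episode_callback(self, action: np.ndarray):
        # Split the learned MP parameters from the env-specific release time.
        return action[:-1], action[-1:]
```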
    def _episode_callback(self, action: np.ndarray) -> Tuple[np.ndarray, Union[np.ndarray, None]]:
        """
@@ -111,7 +123,8 @@ class EpisodicWrapper(gym.Env, ABC):
        """
        return action, None

-    def _step_callback(self, t: int, env_spec_params: Union[np.ndarray, None], step_action: np.ndarray) -> Union[np.ndarray]:
+    def _step_callback(self, t: int, env_spec_params: Union[np.ndarray, None], step_action: np.ndarray) -> Union[
+            np.ndarray]:
        """
        This function can be used to modify the step_action with additional parameters e.g. releasing the ball in the
        Beerpong env. The parameters used should not be part of the motion primitive parameters.
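A matching `_step_callback` override would consume the env-specific parameter returned by `_episode_callback` and inject it into the per-step action. A free-function sketch of what such an override's body might do; interpreting the extra parameter as a release step index is an assumption for illustration:

```python
import numpy as np


def step_callback_sketch(t: int, env_spec_params: np.ndarray, step_action: np.ndarray) -> np.ndarray:
    """Append a release flag once the current step t reaches the learned release step."""
    release_step = int(env_spec_params[0])                         # env-specific value from _episode_callback
    release_flag = np.array([1.0 if t >= release_step else 0.0])
    return np.concatenate([step_action, release_flag])             # extra dimension consumed by the env
```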
@@ -169,11 +182,15 @@ class EpisodicWrapper(gym.Env, ABC):
        mp_params, env_spec_params = self._episode_callback(action)
        trajectory, velocity = self.get_trajectory(mp_params)

+        # TODO
+        # self.time_steps = np.linspace(0, learned_duration, self.traj_steps)
+        # self.mp.set_mp_times(self.time_steps)

        trajectory_length = len(trajectory)
-        if self.verbose >=2 :
+        if self.verbose >= 2:
            actions = np.zeros(shape=(trajectory_length,) + self.env.action_space.shape)
            observations = np.zeros(shape=(trajectory_length,) + self.env.observation_space.shape,
                                    dtype=self.env.observation_space.dtype)
            rewards = np.zeros(shape=(trajectory_length,))
            trajectory_return = 0

@@ -181,7 +198,7 @@ class EpisodicWrapper(gym.Env, ABC):

        for t, pos_vel in enumerate(zip(trajectory, velocity)):
            step_action = self.controller.get_action(pos_vel[0], pos_vel[1], self.current_pos, self.current_vel)
            step_action = self._step_callback(t, env_spec_params, step_action)  # include possible callback info
            c_action = np.clip(step_action, self.env.action_space.low, self.env.action_space.high)
            # print('step/clipped action ratio: ', step_action/c_action)
            obs, c_reward, done, info = self.env.step(c_action)
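Each desired (position, velocity) pair of the generated trajectory is converted into a raw action through the controller's `get_action(des_pos, des_vel, cur_pos, cur_vel)` call. As an illustration of that interface only (not the controller implementation shipped with this repository), a minimal PD tracking controller could look like this:

```python
import numpy as np


class SimplePDController:
    """Minimal sketch of a tracking controller exposing the get_action interface used above."""

    def __init__(self, p_gains, d_gains):
        self.p_gains = np.asarray(p_gains)
        self.d_gains = np.asarray(d_gains)

    def get_action(self, des_pos, des_vel, cur_pos, cur_vel):
        # Torque-style action: proportional term on the position error plus damping on the velocity error.
        return self.p_gains * (des_pos - cur_pos) + self.d_gains * (des_vel - cur_vel)
```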
@@ -197,7 +214,7 @@ class EpisodicWrapper(gym.Env, ABC):
            # infos['step_infos'].append(info)
            if self.render_mode:
                self.render(mode=self.render_mode, **self.render_kwargs)
-            if done:
+            if done or do_replanning(kwargs):
                break
        infos.update({k: v[:t + 1] for k, v in infos.items()})
        if self.verbose >= 2:
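Note that `do_replanning` is not defined in the code shown here. One possible shape that would be consistent with the call site, with all key names being assumptions:

```python
def do_replanning(kwargs) -> bool:
    """Hypothetical predicate: replan every `replan_every` environment steps, if requested."""
    t = kwargs.get("t")                       # current step index, assumed to be provided by the caller
    replan_every = kwargs.get("replan_every")
    if t is None or replan_every is None:
        return False
    return (t + 1) % replan_every == 0
```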
@@ -235,10 +252,10 @@ class EpisodicWrapper(gym.Env, ABC):
        for i in range(des_trajs.shape[1]):
            plt.figure(pos_fig.number)
            plt.subplot(des_trajs.shape[1], 1, i + 1)
-            plt.plot(np.ones(des_trajs.shape[0])*self.current_pos[i])
+            plt.plot(np.ones(des_trajs.shape[0]) * self.current_pos[i])
            plt.plot(des_trajs[:, i])

            plt.figure(vel_fig.number)
            plt.subplot(des_vels.shape[1], 1, i + 1)
-            plt.plot(np.ones(des_trajs.shape[0])*self.current_vel[i])
+            plt.plot(np.ones(des_trajs.shape[0]) * self.current_vel[i])
            plt.plot(des_vels[:, i])
@@ -20,12 +20,13 @@ def make_dmc(
        environment_kwargs: dict = {},
        time_limit: Union[None, float] = None,
        channels_first: bool = True
):
    # Adopted from: https://github.com/denisyarats/dmc2gym/blob/master/dmc2gym/__init__.py
    # License: MIT
    # Copyright (c) 2020 Denis Yarats

-    assert re.match(r"\w+-\w+", id), "env_id does not have the following structure: 'domain_name-task_name'"
+    if not re.match(r"\w+-\w+", id):
+        raise ValueError("env_id does not have the following structure: 'domain_name-task_name'")
    domain_name, task_name = id.split("-")

    env_id = f'dmc_{domain_name}_{task_name}_{seed}-v1'
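A usage sketch of the `'domain_name-task_name'` convention the check above enforces; the concrete domain/task pair and keyword values are illustrative, and `make_dmc` accepts more parameters than shown:

```python
from alr_envs import make_dmc

# DeepMind Control's ball_in_cup domain with the catch task.
env = make_dmc("ball_in_cup-catch", seed=1, time_limit=20, channels_first=True)
obs = env.reset()
```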
@@ -60,7 +61,7 @@ def make_dmc(
            camera_id=camera_id,
            frame_skip=frame_skip,
            channels_first=channels_first,
        ),
        max_episode_steps=max_episode_steps,
    )
    return gym.make(env_id)
@@ -55,7 +55,7 @@ def make(env_id: str, seed, **kwargs):
    Returns: Gym environment

    """
-    if any([det_pmp in env_id for det_pmp in ["DetPMP", "detpmp"]]):
+    if any(deprec in env_id for deprec in ["DetPMP", "detpmp"]):
        warnings.warn("DetPMP is deprecated and converted to ProMP")
        env_id = env_id.replace("DetPMP", "ProMP")
        env_id = env_id.replace("detpmp", "promp")
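For anyone hitting the deprecation warning above, the id rewrite is purely textual. A hypothetical example id:

```python
old_id = "alr_envs:HoleReacherDetPMP-v0"   # hypothetical deprecated id
new_id = old_id.replace("DetPMP", "ProMP").replace("detpmp", "promp")
print(new_id)                              # alr_envs:HoleReacherProMP-v0
```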
@@ -92,14 +92,17 @@ def make(env_id: str, seed, **kwargs):
        from alr_envs import make_dmc
        env = make_dmc(env_id, seed=seed, **kwargs)

-        assert env.base_step_limit == env.spec.max_episode_steps, \
-            f"The specified 'episode_length' of {env.spec.max_episode_steps} steps for gym is different from " \
-            f"the DMC environment specification of {env.base_step_limit} steps."
+        if not env.base_step_limit == env.spec.max_episode_steps:
+            raise ValueError(f"The specified 'episode_length' of {env.spec.max_episode_steps} steps for gym "
+                             f"is different from the DMC environment specification of {env.base_step_limit} steps.")

        return env

-def _make_wrapped_env(env_id: str, wrappers: Iterable[Type[gym.Wrapper]], mp: MPInterface, controller: BaseController,
-                      ep_wrapper_kwargs: Mapping, seed=1, **kwargs):
+def _make_wrapped_env(
+        env_id: str, wrappers: Iterable[Type[gym.Wrapper]], mp: MPInterface, controller: BaseController,
+        ep_wrapper_kwargs: Mapping, seed=1, **kwargs
+):
    """
    Helper function for creating a wrapped gym environment using MPs.
    It adds all provided wrappers to the specified environment and verifies at least one MPEnvWrapper is
@@ -120,13 +123,14 @@ def _make_wrapped_env(env_id: str, wrappers: Iterable[Type[gym.Wrapper]], mp: MP
        # only wrap the environment if not EpisodicWrapper, e.g. for vision
        if not issubclass(w, EpisodicWrapper):
            _env = w(_env)
        else:  # if EpisodicWrapper, use specific constructor
            has_episodic_wrapper = True
            _env = w(env=_env, mp=mp, controller=controller, **ep_wrapper_kwargs)
-    assert has_episodic_wrapper, \
-        "At least one MPEnvWrapper is required in order to leverage motion primitive environments."
+    if not has_episodic_wrapper:
+        raise ValueError("An EpisodicWrapper is required in order to leverage movement primitive environments.")
    return _env


def make_mp_from_kwargs(
        env_id: str, wrappers: Iterable, ep_wrapper_kwargs: MutableMapping, mp_kwargs: MutableMapping,
        controller_kwargs: MutableMapping, phase_kwargs: MutableMapping, basis_kwargs: MutableMapping, seed=1,
@@ -152,7 +156,7 @@ def make_mp_from_kwargs(
    _verify_time_limit(mp_kwargs.get("duration", None), kwargs.get("time_limit", None))
    dummy_env = make(env_id, seed)
    if ep_wrapper_kwargs.get('duration', None) is None:
-        ep_wrapper_kwargs['duration'] = dummy_env.spec.max_episode_steps*dummy_env.dt
+        ep_wrapper_kwargs['duration'] = dummy_env.spec.max_episode_steps * dummy_env.dt
    if phase_kwargs.get('tau', None) is None:
        phase_kwargs['tau'] = ep_wrapper_kwargs['duration']
    mp_kwargs['action_dim'] = mp_kwargs.get('action_dim', np.prod(dummy_env.action_space.shape).item())
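The defaults above derive the trajectory duration from the step-based environment (duration = max_episode_steps * dt) and let the phase generator's tau fall back to that duration. A worked example with assumed numbers:

```python
dt, max_episode_steps = 0.02, 200   # assumed example values

duration = max_episode_steps * dt   # 4.0 s of trajectory to generate
tau = duration                      # phase generator spans the full trajectory by default
```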
@@ -207,10 +211,11 @@ def make_mp_env_helper(**kwargs):
    phase_kwargs = kwargs.pop("phase_generator_kwargs")
    basis_kwargs = kwargs.pop("basis_generator_kwargs")

    return make_mp_from_kwargs(env_id=kwargs.pop("name"), wrappers=wrappers, ep_wrapper_kwargs=ep_wrapper_kwargs,
                               mp_kwargs=mp_kwargs, controller_kwargs=contr_kwargs, phase_kwargs=phase_kwargs,
                               basis_kwargs=basis_kwargs, **kwargs, seed=seed)


def make_dmp_env(env_id: str, wrappers: Iterable, seed=1, mp_kwargs={}, **kwargs):
    """
    This can also be used standalone for manually building a custom DMP environment.
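Finally, a hedged usage sketch for building a DMP environment by hand via `make_dmp_env`; the environment id, wrapper class, and mp_kwargs entries are placeholders (only `duration` is referenced elsewhere in this commit):

```python
# Placeholder names: MyTaskMPWrapper stands in for a task-specific EpisodicWrapper subclass,
# "alr_envs:HoleReacher-v0" for some registered step-based environment.
env = make_dmp_env(
    "alr_envs:HoleReacher-v0",
    wrappers=[MyTaskMPWrapper],
    seed=1,
    mp_kwargs={"duration": 2.0},    # further DMP parameters omitted
)
params = env.action_space.sample()           # one flat DMP parameter vector per episode
obs, reward, done, info = env.step(params)   # internally rolls out the full trajectory
```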