first clean up and some non working ideas sketched
parent 7bd9848c31
commit 9ad6fbe712
@@ -1,26 +1,32 @@
## ALR Robotics Control Environments

This project offers a large variety of reinforcement learning environments under the unifying interface of [OpenAI gym](https://gym.openai.com/).
Besides, we also provide support (under the OpenAI interface) for the benchmark suites
We provide support (under the OpenAI interface) for the benchmark suites
[DeepMind Control](https://deepmind.com/research/publications/2020/dm-control-Software-and-Tasks-for-Continuous-Control)
(DMC) and [Metaworld](https://meta-world.github.io/). Custom (Mujoco) gym environments can be created according
to [this guide](https://github.com/openai/gym/blob/master/docs/creating-environments.md). Unlike existing libraries, we
additionally support to control agents with Dynamic Movement Primitives (DMPs) and Probabilistic Movement Primitives (ProMP,
we only consider the mean usually).
(DMC) and [Metaworld](https://meta-world.github.io/).
Custom (Mujoco) gym environments can be created according
to [this guide](https://www.gymlibrary.ml/content/environment_creation/).
Unlike existing libraries, we additionally support controlling agents with movement primitives, such as
Dynamic Movement Primitives (DMPs) and Probabilistic Movement Primitives (ProMP; we usually only consider the mean).

## Motion Primitive Environments (Episodic environments)
## Movement Primitive Environments (Episode-Based/Black-Box Environments)

Unlike step-based environments, motion primitive (MP) environments are closer related to stochastic search, black box
optimization, and methods that are often used in robotics. MP environments are trajectory-based and always execute a full
trajectory, which is generated by a Dynamic Motion Primitive (DMP) or a Probabilistic Motion Primitive (ProMP). The
generated trajectory is translated into individual step-wise actions by a controller. The exact choice of controller is,
however, dependent on the type of environment. We currently support position, velocity, and PD-Controllers for position,
velocity, and torque control, respectively. The goal of all MP environments is still to learn a policy. Yet, an action
Unlike step-based environments, movement primitive (MP) environments are more closely related to stochastic search, black-box
optimization, and methods that are often used in traditional robotics and control.
MP environments are episode-based and always execute a full trajectory, which is generated by a trajectory generator,
such as a Dynamic Movement Primitive (DMP) or a Probabilistic Movement Primitive (ProMP).
The generated trajectory is translated into individual step-wise actions by a trajectory tracking controller.
The exact choice of controller is, however, dependent on the type of environment.
We currently support position, velocity, and PD-controllers for position, velocity, and torque control, respectively,
as well as a special controller for the MetaWorld control suite.
The goal of all MP environments is still to learn an optimal policy. Yet, an action
represents the parametrization of the motion primitives to generate a suitable trajectory. Additionally, in this
framework we support all of this also for the contextual setting, for which we expose all changing substates of the
task as a single observation in the beginning. This requires to predict a new action/MP parametrization for each
trajectory. All environments provide next to the cumulative episode reward all collected information from each
step as part of the info dictionary. This information should, however, mainly be used for debugging and logging.
framework we also support the contextual setting, i.e. we expose a subset of the observation space
as a single context at the beginning of the episode. This requires predicting a new action/MP parametrization for each
context.
Next to the cumulative episode reward, all environments provide all information collected at each
step as part of the info dictionary. This information is, however, mainly meant for debugging and logging
and not for training.

|Key| Description|
|---|---|
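To make the episode-based interface described above concrete, here is a minimal usage sketch. The environment id and the registering import are illustrative assumptions and are not taken from this commit.

```python
import alr_envs  # assumed to register both step-based and MP variants on import

# Illustrative id; any registered ProMP/DMP variant is used the same way.
env = alr_envs.make("ALRBeerPongProMP-v0", seed=1)

obs = env.reset()                    # observation/context, shown once per episode
action = env.action_space.sample()   # one action = one full MP parametrization
obs, episode_reward, done, info = env.step(action)  # executes the whole trajectory
# `info` aggregates per-step values; intended for logging/debugging, not training.
```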
@@ -4,10 +4,11 @@ import numpy as np
from gym.envs.mujoco import MujocoEnv


class ALRBallInACupEnv(MujocoEnv, utils.EzPickle):
    def __init__(self, n_substeps=4, apply_gravity_comp=True, simplified: bool = False,
                 reward_type: str = None, context: np.ndarray = None):
    def __init__(
            self, n_substeps=4, apply_gravity_comp=True, simplified: bool = False,
            reward_type: str = None, context: np.ndarray = None
    ):
        utils.EzPickle.__init__(**locals())
        self._steps = 0

@@ -23,9 +24,7 @@ class ALRBallInACupEnv(MujocoEnv, utils.EzPickle):

        self.context = context

        alr_mujoco_env.AlrMujocoEnv.__init__(self,
                                             self.xml_path,
                                             apply_gravity_comp=apply_gravity_comp,
        alr_mujoco_env.AlrMujocoEnv.__init__(self, self.xml_path, apply_gravity_comp=apply_gravity_comp,
                                             n_substeps=n_substeps)
        self._start_pos = np.array([0.0, 0.58760536, 0.0, 1.36004913, 0.0, -0.32072943, -1.57])
        self._start_vel = np.zeros(7)

@@ -67,7 +67,7 @@ class BallInACupReward(alr_reward_fct.AlrReward):
        action_cost = np.sum(np.square(action))
        self.action_costs.append(action_cost)

        self._is_collided = self.check_collision(self.env.sim) or self.env.check_traj_in_joint_limits()
        self._is_collided = self.check_collision(self.env.sim) or self.env._check_traj_in_joint_limits()

        if self.env._steps == self.env.ep_length - 1 or self._is_collided:
            t_min_dist = np.argmin(self.dists)
@@ -1,11 +1,11 @@
import mujoco_py.builder
import os

import mujoco_py.builder
import numpy as np
from gym import utils, spaces
from gym import utils
from gym.envs.mujoco import MujocoEnv
from alr_envs.alr.mujoco.beerpong.beerpong_reward_staged import BeerPongReward

from alr_envs.alr.mujoco.beerpong.beerpong_reward_staged import BeerPongReward

CUP_POS_MIN = np.array([-1.42, -4.05])
CUP_POS_MAX = np.array([1.42, -1.25])
@@ -18,12 +18,14 @@ CUP_POS_MAX = np.array([1.42, -1.25])
# CUP_POS_MIN = np.array([-0.16, -2.2])
# CUP_POS_MAX = np.array([0.16, -1.7])


class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
    def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False,
                 rndm_goal=False, cup_goal_pos=None):
    def __init__(
            self, frame_skip=1, apply_gravity_comp=True, noisy=False,
            rndm_goal=False, cup_goal_pos=None
    ):

        cup_goal_pos = np.array(cup_goal_pos if cup_goal_pos is not None else [-0.3, -1.2, 0.840])
        if cup_goal_pos.shape[0]==2:
        if cup_goal_pos.shape[0] == 2:
            cup_goal_pos = np.insert(cup_goal_pos, 2, 0.840)
        self.cup_goal_pos = np.array(cup_goal_pos)

@@ -50,7 +52,7 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
        # self._release_step = 130  # time step of ball release
        self.release_step = 100  # time step of ball release

        self.ep_length = 600//frame_skip
        self.ep_length = 600 // frame_skip
        self.cup_table_id = 10

        if noisy:
@@ -71,14 +73,6 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
    def start_vel(self):
        return self._start_vel

    @property
    def current_pos(self):
        return self.sim.data.qpos[0:7].copy()

    @property
    def current_vel(self):
        return self.sim.data.qvel[0:7].copy()

    def reset(self):
        self.reward_function.reset(self.add_noise)
        return super().reset()
@@ -147,7 +141,8 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
            ball_pos = np.zeros(3)
            ball_vel = np.zeros(3)

        infos = dict(reward_dist=reward_dist,
        infos = dict(
            reward_dist=reward_dist,
            reward=reward,
            velocity=angular_vel,
            # traj=self._q_pos,
@@ -158,18 +153,20 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
            ball_vel=ball_vel,
            success=success,
            is_collided=is_collided, sim_crash=crash,
            table_contact_first=int(not self.reward_function.ball_ground_contact_first))
            table_contact_first=int(not self.reward_function.ball_ground_contact_first)
        )
        infos.update(reward_infos)
        return ob, reward, done, infos

    def check_traj_in_joint_limits(self):
    def _check_traj_in_joint_limits(self):
        return any(self.current_pos > self.j_max) or any(self.current_pos < self.j_min)

    def _get_obs(self):
        theta = self.sim.data.qpos.flat[:7]
        theta_dot = self.sim.data.qvel.flat[:7]
        ball_pos = self.sim.data.body_xpos[self.sim.model._body_name2id["ball"]].copy()
        cup_goal_diff_final = ball_pos - self.sim.data.site_xpos[self.sim.model._site_name2id["cup_goal_final_table"]].copy()
        cup_goal_diff_final = ball_pos - self.sim.data.site_xpos[
            self.sim.model._site_name2id["cup_goal_final_table"]].copy()
        cup_goal_diff_top = ball_pos - self.sim.data.site_xpos[self.sim.model._site_name2id["cup_goal_table"]].copy()
        return np.concatenate([
            np.cos(theta),
@@ -181,15 +178,19 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
            [self._steps],
        ])

    def compute_reward(self):

    @property
    def dt(self):
        return super(ALRBeerBongEnv, self).dt*self.repeat_action
        return super(ALRBeerBongEnv, self).dt * self.repeat_action


class ALRBeerBongEnvFixedReleaseStep(ALRBeerBongEnv):
    def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None):
        super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos)
        self.release_step = 62  # empirically evaluated for frame_skip=2!


class ALRBeerBongEnvStepBasedEpisodicReward(ALRBeerBongEnv):
    def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None):
        super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos)
@@ -202,7 +203,8 @@ class ALRBeerBongEnvStepBasedEpisodicReward(ALRBeerBongEnv):
            reward = 0
            done = False
            while not done:
                sub_ob, sub_reward, done, sub_infos = super(ALRBeerBongEnvStepBasedEpisodicReward, self).step(np.zeros(a.shape))
                sub_ob, sub_reward, done, sub_infos = super(ALRBeerBongEnvStepBasedEpisodicReward, self).step(
                    np.zeros(a.shape))
                reward += sub_reward
            infos = sub_infos
            ob = sub_ob
@@ -211,45 +213,11 @@ class ALRBeerBongEnvStepBasedEpisodicReward(ALRBeerBongEnv):
            return ob, reward, done, infos

# class ALRBeerBongEnvStepBasedEpisodicReward(ALRBeerBongEnv):
#     def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None):
#         super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos)
#         self.release_step = 62  # empirically evaluated for frame_skip=2!
#
#     def step(self, a):
#         if self._steps < self.release_step:
#             return super(ALRBeerBongEnvStepBasedEpisodicReward, self).step(a)
#         else:
#             sub_ob, sub_reward, done, sub_infos = super(ALRBeerBongEnvStepBasedEpisodicReward, self).step(np.zeros(a.shape))
#             reward = sub_reward
#             infos = sub_infos
#             ob = sub_ob
#             ob[-1] = self.release_step + 1  # Since we simulate until the end of the episode, PPO does not see the
#             # internal steps and thus, the observation also needs to be set correctly
#             return ob, reward, done, infos


class ALRBeerBongEnvStepBased(ALRBeerBongEnv):
    def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None):
        super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos)
        self.release_step = 62  # empirically evaluated for frame_skip=2!

    # def _set_action_space(self):
    #     bounds = super(ALRBeerBongEnvStepBased, self)._set_action_space()
    #     min_bound = np.concatenate(([-1], bounds.low), dtype=bounds.dtype)
    #     max_bound = np.concatenate(([1], bounds.high), dtype=bounds.dtype)
    #     self.action_space = spaces.Box(low=min_bound, high=max_bound, dtype=bounds.dtype)
    #     return self.action_space

    # def step(self, a):
    #     self.release_step = self._steps if a[0]>=0 and self.release_step >= self._steps else self.release_step
    #     return super(ALRBeerBongEnvStepBased, self).step(a[1:])
    #
    # def reset(self):
    #     ob = super(ALRBeerBongEnvStepBased, self).reset()
    #     self.release_step = self.ep_length + 1
    #     return ob

    def step(self, a):
        if self._steps < self.release_step:
            return super(ALRBeerBongEnvStepBased, self).step(a)
@@ -267,9 +235,9 @@ class ALRBeerBongEnvStepBased(ALRBeerBongEnv):
                cup_goal_dist_top = np.linalg.norm(ball_pos - self.sim.data.site_xpos[
                    self.sim.model._site_name2id["cup_goal_table"]].copy())
                if sub_infos['success']:
                    dist_rew = -cup_goal_dist_final**2
                    dist_rew = -cup_goal_dist_final ** 2
                else:
                    dist_rew = -0.5*cup_goal_dist_final**2 - cup_goal_dist_top**2
                    dist_rew = -0.5 * cup_goal_dist_final ** 2 - cup_goal_dist_top ** 2
                reward = reward - sub_infos['action_cost'] + dist_rew
            infos = sub_infos
            ob = sub_ob
@@ -278,13 +246,13 @@ class ALRBeerBongEnvStepBased(ALRBeerBongEnv):
            return ob, reward, done, infos


if __name__ == "__main__":
    # env = ALRBeerBongEnv(rndm_goal=True)
    # env = ALRBeerBongEnvStepBased(frame_skip=2, rndm_goal=True)
    # env = ALRBeerBongEnvStepBasedEpisodicReward(frame_skip=2, rndm_goal=True)
    env = ALRBeerBongEnvFixedReleaseStep(frame_skip=2, rndm_goal=True)
    import time

    env.reset()
    env.render("human")
    for i in range(1500):
@@ -29,52 +29,38 @@ class BeerPongReward:
            # "cup_base_table"
        ]

        self.ball_traj = None
        self.dists = None
        self.dists_final = None
        self.costs = None
        self.action_costs = None
        self.angle_rewards = None
        self.cup_angles = None
        self.cup_z_axes = None
        self.collision_penalty = 500
        self.reset(None)
        self.reset()
        self.is_initialized = False

    def reset(self, noisy):
        self.ball_traj = []
    def reset(self):
        self.dists = []
        self.dists_final = []
        self.costs = []
        self.action_costs = []
        self.angle_rewards = []
        self.cup_angles = []
        self.cup_z_axes = []
        self.ball_ground_contact_first = False
        self.ball_table_contact = False
        self.ball_wall_contact = False
        self.ball_cup_contact = False
        self.ball_in_cup = False
        self.dist_ground_cup = -1  # distance floor to cup if first floor contact
        self.noisy_bp = noisy
        self._t_min_final_dist = -1

    def initialize(self, env):

        if not self.is_initialized:
            self.is_initialized = True
            self.ball_id = env.sim.model._body_name2id["ball"]
            self.ball_collision_id = env.sim.model._geom_name2id["ball_geom"]
            self.goal_id = env.sim.model._site_name2id["cup_goal_table"]
            self.goal_final_id = env.sim.model._site_name2id["cup_goal_final_table"]
            self.cup_collision_ids = [env.sim.model._geom_name2id[name] for name in self.cup_collision_objects]
            self.cup_table_id = env.sim.model._body_name2id["cup_table"]

            self.ball_collision_id = env.sim.model._geom_name2id["ball_geom"]
            self.table_collision_id = env.sim.model._geom_name2id["table_contact_geom"]
            self.wall_collision_id = env.sim.model._geom_name2id["wall"]
            self.cup_table_collision_id = env.sim.model._geom_name2id["cup_base_table_contact"]
            self.init_ball_pos_site_id = env.sim.model._site_name2id["init_ball_pos_site"]
            self.ground_collision_id = env.sim.model._geom_name2id["ground"]
            self.cup_collision_ids = [env.sim.model._geom_name2id[name] for name in self.cup_collision_objects]
            self.robot_collision_ids = [env.sim.model._geom_name2id[name] for name in self.robot_collision_objects]

    def compute_reward(self, env, action):
@@ -87,7 +73,7 @@ class BeerPongReward:
        self.check_contacts(env.sim)
        self.dists.append(np.linalg.norm(goal_pos - ball_pos))
        self.dists_final.append(np.linalg.norm(goal_final_pos - ball_pos))
        self.dist_ground_cup = np.linalg.norm(ball_pos-goal_pos) \
        self.dist_ground_cup = np.linalg.norm(ball_pos - goal_pos) \
            if self.ball_ground_contact_first and self.dist_ground_cup == -1 else self.dist_ground_cup
        action_cost = np.sum(np.square(action))
        self.action_costs.append(np.copy(action_cost))
@@ -108,18 +94,18 @@ class BeerPongReward:
            else:
                min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 1, 0.5, 0, -2
        else:
            min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 0, 1, 0 ,0
            min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 0, 1, 0, 0
            # dist_ground_cup = 1 * self.dist_ground_cup
        action_cost = 1e-4 * np.mean(action_cost)
        reward = rew_offset - min_dist_coeff * min_dist ** 2 - final_dist_coeff * final_dist ** 2 - \
            action_cost - ground_contact_dist_coeff*self.dist_ground_cup ** 2
            action_cost - ground_contact_dist_coeff * self.dist_ground_cup ** 2
        # 1e-7*np.mean(action_cost)
        # release step punishment
        min_time_bound = 0.1
        max_time_bound = 1.0
        release_time = env.release_step*env.dt
        release_time_rew = int(release_time<min_time_bound)*(-30-10*(release_time-min_time_bound)**2) \
            +int(release_time>max_time_bound)*(-30-10*(release_time-max_time_bound)**2)
        release_time = env.release_step * env.dt
        release_time_rew = int(release_time < min_time_bound) * (-30 - 10 * (release_time - min_time_bound) ** 2) \
            + int(release_time > max_time_bound) * (-30 - 10 * (release_time - max_time_bound) ** 2)
        reward += release_time_rew
        success = self.ball_in_cup
        # print('release time :', release_time)
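As a side note on the release-step punishment above: the term only penalizes release times outside a fixed window. A small, self-contained restatement of that term, using the same constants as the hunk above (everything else is illustrative):

```python
def release_time_penalty(release_time: float,
                         min_time_bound: float = 0.1,
                         max_time_bound: float = 1.0) -> float:
    """Quadratic penalty of -30 - 10 * delta**2 outside [min_time_bound, max_time_bound], else 0."""
    penalty = 0.0
    if release_time < min_time_bound:
        penalty += -30 - 10 * (release_time - min_time_bound) ** 2
    if release_time > max_time_bound:
        penalty += -30 - 10 * (release_time - max_time_bound) ** 2
    return penalty
```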
@@ -1,10 +1,15 @@
from alr_envs.mp.episodic_wrapper import EpisodicWrapper
from typing import Union, Tuple
from typing import Tuple, Union

import numpy as np
import gym

from alr_envs.mp.episodic_wrapper import EpisodicWrapper


class NewMPWrapper(EpisodicWrapper):

    # def __init__(self, replanning_model):
    #     self.replanning_model = replanning_model

    @property
    def current_pos(self) -> Union[float, int, np.ndarray, Tuple]:
        return self.env.sim.data.qpos[0:7].copy()
@@ -24,24 +29,14 @@ class NewMPWrapper(EpisodicWrapper):
            [False]  # env steps
        ])

    # def set_mp_action_space(self):
    #     min_action_bounds, max_action_bounds = self.mp.get_param_bounds()
    #     if self.mp.learn_tau:
    #         min_action_bounds[0] = 20*self.env.dt
    #         max_action_bounds[0] = 260*self.env.dt
    #     mp_action_space = gym.spaces.Box(low=min_action_bounds.numpy(), high=max_action_bounds.numpy(),
    #                                      dtype=np.float32)
    #     return mp_action_space

    # def _step_callback(self, t: int, env_spec_params: Union[np.ndarray, None], step_action: np.ndarray) -> Union[np.ndarray]:
    #     if self.mp.learn_tau:
    #         return np.concatenate((step_action, np.atleast_1d(env_spec_params)))
    #     else:
    #         return step_action
    def do_replanning(self, pos, vel, s, a, t, last_replan_step):
        return False
        # const = np.arange(0, 1000, 10)
        # return bool(self.replanning_model(s))

    def _episode_callback(self, action: np.ndarray) -> Tuple[np.ndarray, Union[np.ndarray, None]]:
        if self.mp.learn_tau:
            self.env.env.release_step = action[0]/self.env.dt  # Tau value
            self.env.env.release_step = action[0] / self.env.dt  # Tau value
            return action, None
        else:
            return action, None
@@ -52,12 +47,3 @@ class NewMPWrapper(EpisodicWrapper):
            xyz[-1] = 0.840
            self.env.env.model.body_pos[self.env.env.cup_table_id] = xyz
            return self.get_observation_from_step(self.env.env._get_obs())
    # def set_action_space(self):
    #     if self.mp.learn_tau:
    #         min_action_bounds, max_action_bounds = self.mp.get_param_bounds()
    #         min_action_bounds = np.concatenate((min_action_bounds.numpy(), [self.env.action_space.low[-1]]))
    #         max_action_bounds = np.concatenate((max_action_bounds.numpy(), [self.env.action_space.high[-1]]))
    #         self.action_space = gym.spaces.Box(low=min_action_bounds, high=max_action_bounds, dtype=np.float32)
    #         return self.action_space
    #     else:
    #         return super(NewMPWrapper, self).set_action_space()
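A note on the `_episode_callback` above: when the MP learns its duration (`learn_tau`), the first entry of the parameter vector is a time in seconds, and dividing it by the simulation `dt` turns it into the step index at which the ball is released. A minimal sketch of that conversion; the names follow the hunk above, the numbers are made up:

```python
# Illustrative values only.
dt = 0.02                  # simulation time step of the wrapped env (assumed)
tau = 0.84                 # learned duration in seconds, i.e. action[0]
release_step = tau / dt    # -> 42.0, the step at which the env releases the ball
```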
@@ -14,7 +14,8 @@ class ALRHopperJumpEnv(HopperEnv):
    - exclude current positions from observatiosn is set to False
    """

    def __init__(self,
    def __init__(
            self,
            xml_file='hopper_jump.xml',
            forward_reward_weight=1.0,
            ctrl_cost_weight=1e-3,
@@ -27,7 +28,9 @@ class ALRHopperJumpEnv(HopperEnv):
            healthy_angle_range=(-float('inf'), float('inf')),
            reset_noise_scale=5e-3,
            exclude_current_positions_from_observation=False,
            max_episode_steps=250):
            max_episode_steps=250
    ):

        self.current_step = 0
        self.max_height = 0
        self.max_episode_steps = max_episode_steps
@@ -59,22 +62,14 @@ class ALRHopperJumpEnv(HopperEnv):
        done = False
        rewards = 0
        if self.current_step >= self.max_episode_steps:
            hight_goal_distance = -10*np.linalg.norm(self.max_height - self.goal) if self.context else self.max_height
            hight_goal_distance = -10 * np.linalg.norm(self.max_height - self.goal) if self.context else self.max_height
            healthy_reward = 0 if self.context else self.healthy_reward * 2  # self.current_step
            height_reward = self._forward_reward_weight * hight_goal_distance  # maybe move reward calculation into if structure and define two different _forward_reward_weight variables for context and episodic seperatley
            rewards = height_reward + healthy_reward

        # else:
        #     # penalty for wrong start direction of first two joints; not needed, could be removed
        #     rewards = ((action[:2] > 0) * self.penalty).sum() if self.current_step < 10 else 0

        observation = self._get_obs()
        reward = rewards - costs
        # info = {
        #     'height' : height_after,
        #     'max_height': self.max_height,
        #     'goal' : self.goal
        # }

        info = {
            'height': height_after,
            'x_pos': site_pos_after,
@@ -157,9 +152,9 @@ class ALRHopperXYJumpEnv(ALRHopperJumpEnv):
        rewards = 0
        if self.current_step >= self.max_episode_steps:
            # healthy_reward = 0 if self.context else self.healthy_reward * self.current_step
            healthy_reward = self.healthy_reward * 2  #* self.current_step
            healthy_reward = self.healthy_reward * 2  # * self.current_step
            contact_dist = self.contact_dist if self.contact_dist is not None else 5
            dist_reward = self._forward_reward_weight * (-3*goal_dist + 10*self.max_height - 2*contact_dist)
            dist_reward = self._forward_reward_weight * (-3 * goal_dist + 10 * self.max_height - 2 * contact_dist)
            rewards = dist_reward + healthy_reward

        observation = self._get_obs()
@@ -225,9 +220,11 @@ class ALRHopperXYJumpEnv(ALRHopperJumpEnv):
        self.sim.model.body_pos[self.sim.model.body_name2id('goal_site_body')] = np.array([self.goal, 0, 0])
        return self._get_obs()


class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv):

    def __init__(self,
    def __init__(
            self,
            xml_file='hopper_jump.xml',
            forward_reward_weight=1.0,
            ctrl_cost_weight=1e-3,
@@ -241,9 +238,10 @@ class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv):
            reset_noise_scale=5e-3,
            exclude_current_positions_from_observation=False,
            max_episode_steps=250,
            height_scale = 10,
            dist_scale = 3,
            healthy_scale = 2):
            height_scale=10,
            dist_scale=3,
            healthy_scale=2
    ):
        self.height_scale = height_scale
        self.dist_scale = dist_scale
        self.healthy_scale = healthy_scale
@@ -263,15 +261,14 @@ class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv):
        ctrl_cost = self.control_cost(action)

        healthy_reward = self.healthy_reward * self.healthy_scale
        height_reward = self.height_scale*height_after
        height_reward = self.height_scale * height_after
        goal_dist = np.atleast_1d(np.linalg.norm(site_pos_after - np.array([self.goal, 0, 0], dtype=object)))[0]
        goal_dist_reward = -self.dist_scale*goal_dist
        goal_dist_reward = -self.dist_scale * goal_dist
        dist_reward = self._forward_reward_weight * (goal_dist_reward + height_reward)
        reward = -ctrl_cost + healthy_reward + dist_reward
        done = False
        observation = self._get_obs()

        ###########################################################
        # This is only for logging the distance to goal when first having the contact
        ##########################################################
@@ -300,6 +297,7 @@ class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv):
        }
        return observation, reward, done, info


class ALRHopperJumpRndmPosEnv(ALRHopperJumpEnv):
    def __init__(self, max_episode_steps=250):
        super(ALRHopperJumpRndmPosEnv, self).__init__(exclude_current_positions_from_observation=False,
@@ -309,14 +307,14 @@ class ALRHopperJumpRndmPosEnv(ALRHopperJumpEnv):
    def reset_model(self):
        self._floor_geom_id = self.model.geom_name2id('floor')
        self._foot_geom_id = self.model.geom_name2id('foot_geom')
        noise_low = -np.ones(self.model.nq)*self._reset_noise_scale
        noise_low = -np.ones(self.model.nq) * self._reset_noise_scale
        noise_low[1] = 0
        noise_low[2] = 0
        noise_low[3] = -0.2
        noise_low[4] = -0.2
        noise_low[5] = -0.1

        noise_high = np.ones(self.model.nq)*self._reset_noise_scale
        noise_high = np.ones(self.model.nq) * self._reset_noise_scale
        noise_high[1] = 0
        noise_high[2] = 0
        noise_high[3] = 0
@@ -370,7 +368,6 @@ class ALRHopperJumpRndmPosEnv(ALRHopperJumpEnv):
        return observation, reward, done, info


if __name__ == '__main__':
    render_mode = "human"  # "human" or "partial" or "final"
    # env = ALRHopperJumpEnv()
@@ -9,7 +9,7 @@ from mp_pytorch.mp.mp_interfaces import MPInterface
from alr_envs.mp.controllers.base_controller import BaseController


class EpisodicWrapper(gym.Env, ABC):
class EpisodicWrapper(gym.Env, gym.wrappers.TransformReward, ABC):
    """
    Base class for movement primitive based gym.Wrapper implementations.

@@ -22,14 +22,19 @@ class EpisodicWrapper(gym.Env, ABC):
        weight_scale: Scaling parameter for the actions given to this wrapper
        render_mode: Equivalent to gym render mode
    """
    def __init__(self,

    def __init__(
            self,
            env: gym.Env,
            mp: MPInterface,
            controller: BaseController,
            duration: float,
            render_mode: str = None,
            verbose: int = 1,
            weight_scale: float = 1):
            weight_scale: float = 1,
            sequencing=True,
            reward_aggregation=np.mean,
    ):
        super().__init__()

        self.env = env
@@ -40,11 +45,11 @@ class EpisodicWrapper(gym.Env, ABC):
        self.duration = duration
        self.traj_steps = int(duration / self.dt)
        self.post_traj_steps = self.env.spec.max_episode_steps - self.traj_steps
        # duration = self.env.max_episode_steps * self.dt

        self.controller = controller
        self.mp = mp
        self.env = env
        self.verbose = verbose
        self.controller = controller
        self.weight_scale = weight_scale

        # rendering
@@ -52,15 +57,18 @@ class EpisodicWrapper(gym.Env, ABC):
        self.render_kwargs = {}
        self.time_steps = np.linspace(0, self.duration, self.traj_steps)
        self.mp.set_mp_times(self.time_steps)
        # self.mp.set_mp_duration(self.time_steps, dt)
        # action_bounds = np.inf * np.ones((np.prod(self.mp.num_params)))
        self.mp_action_space = self.set_mp_action_space()
        self.mp_action_space = self.get_mp_action_space()

        self.action_space = self.set_action_space()
        self.action_space = self.get_action_space()
        self.active_obs = self.set_active_obs()
        self.observation_space = spaces.Box(low=self.env.observation_space.low[self.active_obs],
                                            high=self.env.observation_space.high[self.active_obs],
                                            dtype=self.env.observation_space.dtype)

        self.verbose = verbose

    def get_trajectory(self, action: np.ndarray) -> Tuple:
        # TODO: this follows the implementation of the mp_pytorch library which includes the parameters tau and delay at
        # the beginning of the array.
@@ -69,33 +77,37 @@ class EpisodicWrapper(gym.Env, ABC):
        scaled_mp_params[ignore_indices:] *= self.weight_scale
        self.mp.set_params(np.clip(scaled_mp_params, self.mp_action_space.low, self.mp_action_space.high))
        self.mp.set_boundary_conditions(bc_time=self.time_steps[:1], bc_pos=self.current_pos, bc_vel=self.current_vel)
        traj_dict = self.mp.get_mp_trajs(get_pos = True, get_vel = True)
        traj_dict = self.mp.get_mp_trajs(get_pos=True, get_vel=True)
        trajectory_tensor, velocity_tensor = traj_dict['pos'], traj_dict['vel']

        trajectory = trajectory_tensor.numpy()
        velocity = velocity_tensor.numpy()

        # TODO: Do we need this or does mp_pytorch have this?
        if self.post_traj_steps > 0:
            trajectory = np.vstack([trajectory, np.tile(trajectory[-1, :], [self.post_traj_steps, 1])])
            velocity = np.vstack([velocity, np.zeros(shape=(self.post_traj_steps, self.mp.num_dof))])

        return trajectory, velocity

    def set_mp_action_space(self):
    def get_mp_action_space(self):
        """This function can be used to set up an individual space for the parameters of the mp."""
        min_action_bounds, max_action_bounds = self.mp.get_param_bounds()
        mp_action_space = gym.spaces.Box(low=min_action_bounds.numpy(), high=max_action_bounds.numpy(),
                                         dtype=np.float32)
        return mp_action_space

    def set_action_space(self):
    def get_action_space(self):
        """
        This function can be used to modify the action space for considering actions which are not learned via motion
        primitives. E.g. ball releasing time for the beer pong task. By default, it is the parameter space of the
        motion primitive.
        Only needs to be overwritten if the action space needs to be modified.
        """
        try:
            return self.mp_action_space
        except AttributeError:
            return self.get_mp_action_space()
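To illustrate the `get_action_space` hook just described, here is a hedged sketch of a task wrapper that appends one extra, non-MP action dimension (e.g. a ball-release signal). It mirrors the commented-out `set_action_space` in the beer pong wrapper earlier in this commit; the subclass and its bounds are illustrative, not part of the commit.

```python
# Sketch only: a hypothetical subclass appending one extra (non-MP) action dimension.
class ReleaseTimeWrapper(EpisodicWrapper):
    def get_action_space(self):
        mp_space = self.get_mp_action_space()
        low = np.concatenate((mp_space.low, [-1.0]))    # extra dim, e.g. release signal
        high = np.concatenate((mp_space.high, [1.0]))
        self.action_space = gym.spaces.Box(low=low, high=high, dtype=np.float32)
        return self.action_space
```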
    def _episode_callback(self, action: np.ndarray) -> Tuple[np.ndarray, Union[np.ndarray, None]]:
        """
@@ -111,7 +123,8 @@ class EpisodicWrapper(gym.Env, ABC):
        """
        return action, None

    def _step_callback(self, t: int, env_spec_params: Union[np.ndarray, None], step_action: np.ndarray) -> Union[np.ndarray]:
    def _step_callback(self, t: int, env_spec_params: Union[np.ndarray, None], step_action: np.ndarray) -> Union[
        np.ndarray]:
        """
        This function can be used to modify the step_action with additional parameters e.g. releasing the ball in the
        Beerpong env. The parameters used should not be part of the motion primitive parameters.
@@ -169,8 +182,12 @@ class EpisodicWrapper(gym.Env, ABC):
        mp_params, env_spec_params = self._episode_callback(action)
        trajectory, velocity = self.get_trajectory(mp_params)

        # TODO
        # self.time_steps = np.linspace(0, learned_duration, self.traj_steps)
        # self.mp.set_mp_times(self.time_steps)

        trajectory_length = len(trajectory)
        if self.verbose >=2 :
        if self.verbose >= 2:
            actions = np.zeros(shape=(trajectory_length,) + self.env.action_space.shape)
            observations = np.zeros(shape=(trajectory_length,) + self.env.observation_space.shape,
                                    dtype=self.env.observation_space.dtype)
@@ -197,7 +214,7 @@ class EpisodicWrapper(gym.Env, ABC):
                # infos['step_infos'].append(info)
            if self.render_mode:
                self.render(mode=self.render_mode, **self.render_kwargs)
            if done:
            if done or do_replanning(kwargs):
                break
        infos.update({k: v[:t + 1] for k, v in infos.items()})
        if self.verbose >= 2:
@@ -235,10 +252,10 @@ class EpisodicWrapper(gym.Env, ABC):
        for i in range(des_trajs.shape[1]):
            plt.figure(pos_fig.number)
            plt.subplot(des_trajs.shape[1], 1, i + 1)
            plt.plot(np.ones(des_trajs.shape[0])*self.current_pos[i])
            plt.plot(np.ones(des_trajs.shape[0]) * self.current_pos[i])
            plt.plot(des_trajs[:, i])

            plt.figure(vel_fig.number)
            plt.subplot(des_vels.shape[1], 1, i + 1)
            plt.plot(np.ones(des_trajs.shape[0])*self.current_vel[i])
            plt.plot(np.ones(des_trajs.shape[0]) * self.current_vel[i])
            plt.plot(des_vels[:, i])
@@ -20,12 +20,13 @@ def make_dmc(
        environment_kwargs: dict = {},
        time_limit: Union[None, float] = None,
        channels_first: bool = True
        ):
):
    # Adopted from: https://github.com/denisyarats/dmc2gym/blob/master/dmc2gym/__init__.py
    # License: MIT
    # Copyright (c) 2020 Denis Yarats

    assert re.match(r"\w+-\w+", id), "env_id does not have the following structure: 'domain_name-task_name'"
    if not re.match(r"\w+-\w+", id):
        raise ValueError("env_id does not have the following structure: 'domain_name-task_name'")
    domain_name, task_name = id.split("-")

    env_id = f'dmc_{domain_name}_{task_name}_{seed}-v1'
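For reference, a minimal call that satisfies the 'domain_name-task_name' check above; the id is a standard DMC task and the keyword values are illustrative:

```python
# Sketch: the id must match 'domain_name-task_name', e.g. DMC's ball_in_cup/catch.
env = make_dmc("ball_in_cup-catch", seed=1, time_limit=None, channels_first=True)
obs = env.reset()
```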
@@ -55,7 +55,7 @@ def make(env_id: str, seed, **kwargs):
    Returns: Gym environment

    """
    if any([det_pmp in env_id for det_pmp in ["DetPMP", "detpmp"]]):
    if any(deprec in env_id for deprec in ["DetPMP", "detpmp"]):
        warnings.warn("DetPMP is deprecated and converted to ProMP")
        env_id = env_id.replace("DetPMP", "ProMP")
        env_id = env_id.replace("detpmp", "promp")
@@ -92,14 +92,17 @@ def make(env_id: str, seed, **kwargs):
        from alr_envs import make_dmc
        env = make_dmc(env_id, seed=seed, **kwargs)

        assert env.base_step_limit == env.spec.max_episode_steps, \
            f"The specified 'episode_length' of {env.spec.max_episode_steps} steps for gym is different from " \
            f"the DMC environment specification of {env.base_step_limit} steps."
        if not env.base_step_limit == env.spec.max_episode_steps:
            raise ValueError(f"The specified 'episode_length' of {env.spec.max_episode_steps} steps for gym "
                             f"is different from the DMC environment specification of {env.base_step_limit} steps.")

    return env


def _make_wrapped_env(env_id: str, wrappers: Iterable[Type[gym.Wrapper]], mp: MPInterface, controller: BaseController,
                      ep_wrapper_kwargs: Mapping, seed=1, **kwargs):

def _make_wrapped_env(
        env_id: str, wrappers: Iterable[Type[gym.Wrapper]], mp: MPInterface, controller: BaseController,
        ep_wrapper_kwargs: Mapping, seed=1, **kwargs
):
    """
    Helper function for creating a wrapped gym environment using MPs.
    It adds all provided wrappers to the specified environment and verifies at least one MPEnvWrapper is
@@ -123,10 +126,11 @@ def _make_wrapped_env(env_id: str, wrappers: Iterable[Type[gym.Wrapper]], mp: MP
        else:  # if EpisodicWrapper, use specific constructor
            has_episodic_wrapper = True
            _env = w(env=_env, mp=mp, controller=controller, **ep_wrapper_kwargs)
    assert has_episodic_wrapper, \
        "At least one MPEnvWrapper is required in order to leverage motion primitive environments."
    if not has_episodic_wrapper:
        raise ValueError("An EpisodicWrapper is required in order to leverage movement primitive environments.")
    return _env


def make_mp_from_kwargs(
        env_id: str, wrappers: Iterable, ep_wrapper_kwargs: MutableMapping, mp_kwargs: MutableMapping,
        controller_kwargs: MutableMapping, phase_kwargs: MutableMapping, basis_kwargs: MutableMapping, seed=1,
@@ -152,7 +156,7 @@ def make_mp_from_kwargs(
    _verify_time_limit(mp_kwargs.get("duration", None), kwargs.get("time_limit", None))
    dummy_env = make(env_id, seed)
    if ep_wrapper_kwargs.get('duration', None) is None:
        ep_wrapper_kwargs['duration'] = dummy_env.spec.max_episode_steps*dummy_env.dt
        ep_wrapper_kwargs['duration'] = dummy_env.spec.max_episode_steps * dummy_env.dt
    if phase_kwargs.get('tau', None) is None:
        phase_kwargs['tau'] = ep_wrapper_kwargs['duration']
    mp_kwargs['action_dim'] = mp_kwargs.get('action_dim', np.prod(dummy_env.action_space.shape).item())
@@ -211,6 +215,7 @@ def make_mp_env_helper(**kwargs):
                               mp_kwargs=mp_kwargs, controller_kwargs=contr_kwargs, phase_kwargs=phase_kwargs,
                               basis_kwargs=basis_kwargs, **kwargs, seed=seed)


def make_dmp_env(env_id: str, wrappers: Iterable, seed=1, mp_kwargs={}, **kwargs):
    """
    This can also be used standalone for manually building a custom DMP environment.
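To make that last docstring concrete, a hedged sketch of such a standalone call. The environment id and the `mp_kwargs` contents are illustrative assumptions; `NewMPWrapper` refers to the wrapper defined earlier in this commit.

```python
# Illustrative only: id and kwargs are assumptions, not values from this commit.
env = make_dmp_env("ALRBeerPong-v0", wrappers=[NewMPWrapper], seed=1,
                   mp_kwargs={"duration": 2.0})
obs = env.reset()
```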