From f5f12c846fed80634bfdf7aa5fbed720914a33a4 Mon Sep 17 00:00:00 2001 From: Maximilian Huettenrauch Date: Fri, 21 May 2021 15:44:49 +0200 Subject: [PATCH 1/2] updates on mp wrappers and some bugfixes --- alr_envs/__init__.py | 54 ++++++++++++++++--- alr_envs/classic_control/hole_reacher.py | 6 +-- alr_envs/classic_control/simple_reacher.py | 4 +- alr_envs/classic_control/viapoint_reacher.py | 4 +- alr_envs/mujoco/alr_mujoco_env.py | 17 +++--- .../mujoco/ball_in_a_cup/ball_in_a_cup.py | 17 +++--- alr_envs/mujoco/beerpong/beerpong.py | 5 ++ alr_envs/utils/mp_env_async_sampler.py | 14 ++++- alr_envs/utils/mps/detpmp_wrapper.py | 15 +++--- alr_envs/utils/mps/dmp_wrapper.py | 10 ++-- alr_envs/utils/mps/mp_environments.py | 2 +- alr_envs/utils/mps/mp_wrapper.py | 11 ++-- example.py | 40 +++++++++++--- 13 files changed, 145 insertions(+), 54 deletions(-) diff --git a/alr_envs/__init__.py b/alr_envs/__init__.py index fa73b7a..2a8054c 100644 --- a/alr_envs/__init__.py +++ b/alr_envs/__init__.py @@ -272,12 +272,20 @@ for v in versions: } ) -# register( -# id='HoleReacherDetPMP-v0', -# entry_point='alr_envs.classic_control.hole_reacher:holereacher_detpmp', -# # max_episode_steps=1, -# # TODO: add mp kwargs -# ) + register( + id=f'HoleReacherDetPMP-{v}', + entry_point='alr_envs.utils.make_env_helpers:make_detpmp_env', + kwargs={ + "name": f"alr_envs:HoleReacher-{v}", + "num_dof": 5, + "num_basis": 5, + "duration": 2, + "width": 0.01, + "policy_type": "velocity", + "weights_scale": 0.2, + "zero_start": True + } + ) # TODO: properly add final_pos register( @@ -335,6 +343,40 @@ register( } ) +register( + id='ALRBallInACupSimpleDetPMP-v0', + entry_point='alr_envs.utils.make_env_helpers:make_detpmp_env', + kwargs={ + "name": "alr_envs:ALRBallInACupSimple-v0", + "num_dof": 3, + "num_basis": 5, + "duration": 3.5, + "post_traj_time": 4.5, + "width": 0.005, + "policy_type": "motor", + "weights_scale": 0.2, + "zero_start": True, + "zero_goal": True + } +) + +register( + id='ALRBallInACupDetPMP-v0', + entry_point='alr_envs.utils.make_env_helpers:make_detpmp_env', + kwargs={ + "name": "alr_envs:ALRBallInACupSimple-v0", + "num_dof": 7, + "num_basis": 5, + "duration": 3.5, + "post_traj_time": 4.5, + "width": 0.0035, + "policy_type": "motor", + "weights_scale": 0.2, + "zero_start": True, + "zero_goal": True + } +) + register( id='ALRBallInACupGoalDMP-v0', entry_point='alr_envs.utils.make_env_helpers:make_contextual_env', diff --git a/alr_envs/classic_control/hole_reacher.py b/alr_envs/classic_control/hole_reacher.py index a4f1f33..1880ce3 100644 --- a/alr_envs/classic_control/hole_reacher.py +++ b/alr_envs/classic_control/hole_reacher.py @@ -7,10 +7,10 @@ from gym.utils import seeding from matplotlib import patches from alr_envs.classic_control.utils import check_self_collision -from alr_envs.utils.mps.mp_environments import MPEnv +from alr_envs.utils.mps.mp_environments import AlrEnv -class HoleReacherEnv(MPEnv): +class HoleReacherEnv(AlrEnv): def __init__(self, n_links: int, hole_x: Union[None, float] = None, hole_depth: Union[None, float] = None, hole_width: float = 1., random_start: bool = False, allow_self_collision: bool = False, @@ -71,11 +71,11 @@ class HoleReacherEnv(MPEnv): A single step with an action in joint velocity space """ + acc = (action - self._angle_velocity) / self.dt self._angle_velocity = action self._joint_angles = self._joint_angles + self.dt * self._angle_velocity self._update_joints() - acc = (action - self._angle_velocity) / self.dt reward, info = self._get_reward(acc) 
info.update({"is_collided": self._is_collided}) diff --git a/alr_envs/classic_control/simple_reacher.py b/alr_envs/classic_control/simple_reacher.py index 425134d..5b04aad 100644 --- a/alr_envs/classic_control/simple_reacher.py +++ b/alr_envs/classic_control/simple_reacher.py @@ -5,10 +5,10 @@ import numpy as np from gym import spaces from gym.utils import seeding -from alr_envs.utils.mps.mp_environments import MPEnv +from alr_envs.utils.mps.mp_environments import AlrEnv -class SimpleReacherEnv(MPEnv): +class SimpleReacherEnv(AlrEnv): """ Simple Reaching Task without any physics simulation. Returns no reward until 150 time steps. This allows the agent to explore the space, but requires precise actions diff --git a/alr_envs/classic_control/viapoint_reacher.py b/alr_envs/classic_control/viapoint_reacher.py index 2897f31..8edcc69 100644 --- a/alr_envs/classic_control/viapoint_reacher.py +++ b/alr_envs/classic_control/viapoint_reacher.py @@ -6,10 +6,10 @@ import numpy as np from gym.utils import seeding from alr_envs.classic_control.utils import check_self_collision -from alr_envs.utils.mps.mp_environments import MPEnv +from alr_envs.utils.mps.mp_environments import AlrEnv -class ViaPointReacher(MPEnv): +class ViaPointReacher(AlrEnv): def __init__(self, n_links, random_start: bool = True, via_target: Union[None, Iterable] = None, target: Union[None, Iterable] = None, allow_self_collision=False, collision_penalty=1000): diff --git a/alr_envs/mujoco/alr_mujoco_env.py b/alr_envs/mujoco/alr_mujoco_env.py index edc90b6..8dbdabb 100644 --- a/alr_envs/mujoco/alr_mujoco_env.py +++ b/alr_envs/mujoco/alr_mujoco_env.py @@ -1,5 +1,6 @@ from collections import OrderedDict import os +from abc import abstractmethod from gym import error, spaces @@ -142,18 +143,20 @@ class AlrMujocoEnv(gym.Env): # methods to override: # ---------------------------- + @property + @abstractmethod + def active_obs(self): + """Returns boolean mask for each observation entry + whether the observation is returned for the contextual case or not. + This effectively allows to filter unwanted or unnecessary observations from the full step-based case. + """ + return np.ones(self.observation_space.shape, dtype=bool) + def _get_obs(self): """Returns the observation. """ raise NotImplementedError() - def configure(self, *args, **kwargs): - """ - Helper method to set certain environment properties such as contexts in contextual environments since reset() - doesn't take arguments. Should be called before reset(). - """ - pass - def reset_model(self): """ Reset the robot degrees of freedom (qpos and qvel). 
diff --git a/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup.py b/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup.py index 66461c0..613b631 100644 --- a/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup.py +++ b/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup.py @@ -22,7 +22,7 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle): self.j_min = np.array([-2.6, -1.985, -2.8, -0.9, -4.55, -1.5707, -2.7]) self.j_max = np.array([2.6, 1.985, 2.8, 3.14159, 1.25, 1.5707, 2.7]) - self.context = None + self.context = context utils.EzPickle.__init__(self) alr_mujoco_env.AlrMujocoEnv.__init__(self, @@ -45,7 +45,6 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle): else: raise ValueError("Unknown reward type") self.reward_function = reward_function(self.sim_steps) - self.configure(context) @property def start_pos(self): @@ -69,10 +68,6 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle): def current_vel(self): return self.sim.data.qvel[0:7].copy() - def configure(self, context): - self.context = context - self.reward_function.reset(context) - def reset_model(self): init_pos_all = self.init_qpos.copy() init_pos_robot = self._start_pos @@ -129,6 +124,16 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle): [self._steps], ]) + # TODO + @property + def active_obs(self): + return np.hstack([ + [False] * 7, # cos + [False] * 7, # sin + # [True] * 2, # x-y coordinates of target distance + [False] # env steps + ]) + # These functions are for the task with 3 joint actuations def extend_des_pos(self, des_pos): des_pos_full = self._start_pos.copy() diff --git a/alr_envs/mujoco/beerpong/beerpong.py b/alr_envs/mujoco/beerpong/beerpong.py index dc6278d..5efc431 100644 --- a/alr_envs/mujoco/beerpong/beerpong.py +++ b/alr_envs/mujoco/beerpong/beerpong.py @@ -105,6 +105,7 @@ class ALRBeerpongEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle): def check_traj_in_joint_limits(self): return any(self.current_pos > self.j_max) or any(self.current_pos < self.j_min) + # TODO def _get_obs(self): theta = self.sim.data.qpos.flat[:7] return np.concatenate([ @@ -114,6 +115,10 @@ class ALRBeerpongEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle): [self._steps], ]) + # TODO + def active_obs(self): + pass + if __name__ == "__main__": diff --git a/alr_envs/utils/mp_env_async_sampler.py b/alr_envs/utils/mp_env_async_sampler.py index e935ba6..5cda41f 100644 --- a/alr_envs/utils/mp_env_async_sampler.py +++ b/alr_envs/utils/mp_env_async_sampler.py @@ -40,6 +40,16 @@ def _flatten_list(l): return [l__ for l_ in l for l__ in l_] +class DummyDist: + def __init__(self, dim): + self.dim = dim + + def sample(self, contexts): + contexts = np.atleast_2d(contexts) + n_samples = contexts.shape[0] + return np.random.normal(size=(n_samples, self.dim)), contexts + + class AlrMpEnvSampler: """ An asynchronous sampler for non contextual MPWrapper environments. 
A sampler object can be called with a set of @@ -100,9 +110,9 @@ class AlrContextualMpEnvSampler: if __name__ == "__main__": - env_name = "alr_envs:ALRBallInACupSimpleDMP-v0" + env_name = "alr_envs:HoleReacherDetPMP-v1" n_cpu = 8 - dim = 15 + dim = 25 n_samples = 10 sampler = AlrMpEnvSampler(env_name, num_envs=n_cpu) diff --git a/alr_envs/utils/mps/detpmp_wrapper.py b/alr_envs/utils/mps/detpmp_wrapper.py index 3f6d1ee..98060e2 100644 --- a/alr_envs/utils/mps/detpmp_wrapper.py +++ b/alr_envs/utils/mps/detpmp_wrapper.py @@ -2,12 +2,12 @@ import gym import numpy as np from mp_lib import det_promp -from alr_envs.utils.mps.mp_environments import MPEnv +from alr_envs.utils.mps.mp_environments import AlrEnv from alr_envs.utils.mps.mp_wrapper import MPWrapper class DetPMPWrapper(MPWrapper): - def __init__(self, env: MPEnv, num_dof: int, num_basis: int, width: int, duration: int = 1, dt: float = 0.01, + def __init__(self, env: AlrEnv, num_dof: int, num_basis: int, width: float, duration: float = 1, dt: float = 0.01, post_traj_time: float = 0., policy_type: str = None, weights_scale: float = 1., zero_start: bool = False, zero_goal: bool = False, **mp_kwargs): self.duration = duration # seconds @@ -15,15 +15,16 @@ class DetPMPWrapper(MPWrapper): super().__init__(env, num_dof, dt, duration, post_traj_time, policy_type, weights_scale, num_basis=num_basis, width=width, zero_start=zero_start, zero_goal=zero_goal, **mp_kwargs) - self.dt = dt + self.dt = env.dt if hasattr(env, "dt") else dt + assert self.dt is not None action_bounds = np.inf * np.ones((self.mp.n_basis * self.mp.n_dof)) self.action_space = gym.spaces.Box(low=-action_bounds, high=action_bounds, dtype=np.float32) def initialize_mp(self, num_dof: int, duration: int, dt: float, num_basis: int = 5, width: float = None, - zero_start: bool = False, zero_goal: bool = False): - pmp = det_promp.DeterministicProMP(n_basis=num_basis, n_dof=num_dof, width=width, off=0.01, + off: float = 0.01, zero_start: bool = False, zero_goal: bool = False): + pmp = det_promp.DeterministicProMP(n_basis=num_basis, n_dof=num_dof, width=width, off=off, zero_start=zero_start, zero_goal=zero_goal) weights = np.zeros(shape=(num_basis, num_dof)) @@ -32,10 +33,10 @@ class DetPMPWrapper(MPWrapper): return pmp def mp_rollout(self, action): - params = np.reshape(action, (self.mp.n_basis, self.mp.n_dof)) * self.weights_scale + params = np.reshape(action, newshape=(self.mp.n_basis, self.mp.n_dof)) * self.weights_scale self.mp.set_weights(self.duration, params) _, des_pos, des_vel, _ = self.mp.compute_trajectory(1 / self.dt, 1.) 
if self.mp.zero_start: - des_pos += self.start_pos[None, :] + des_pos += self.env.start_pos[None, :] return des_pos, des_vel diff --git a/alr_envs/utils/mps/dmp_wrapper.py b/alr_envs/utils/mps/dmp_wrapper.py index 6bc5bc6..4958a98 100644 --- a/alr_envs/utils/mps/dmp_wrapper.py +++ b/alr_envs/utils/mps/dmp_wrapper.py @@ -4,13 +4,13 @@ from mp_lib import dmps from mp_lib.basis import DMPBasisGenerator from mp_lib.phase import ExpDecayPhaseGenerator -from alr_envs.utils.mps.mp_environments import MPEnv +from alr_envs.utils.mps.mp_environments import AlrEnv from alr_envs.utils.mps.mp_wrapper import MPWrapper class DmpWrapper(MPWrapper): - def __init__(self, env: MPEnv, num_dof: int, num_basis: int, + def __init__(self, env: AlrEnv, num_dof: int, num_basis: int, duration: int = 1, alpha_phase: float = 2., dt: float = None, learn_goal: bool = False, post_traj_time: float = 0., weights_scale: float = 1., goal_scale: float = 1., bandwidth_factor: float = 3., @@ -40,7 +40,7 @@ class DmpWrapper(MPWrapper): super().__init__(env, num_dof, dt, duration, post_traj_time, policy_type, weights_scale, render_mode, num_basis=num_basis, alpha_phase=alpha_phase, bandwidth_factor=bandwidth_factor) - action_bounds = np.inf * np.ones((np.prod(self.mp.dmp_weights.shape) + (num_dof if learn_goal else 0))) + action_bounds = np.inf * np.ones((np.prod(self.mp.weights.shape) + (num_dof if learn_goal else 0))) self.action_space = gym.spaces.Box(low=-action_bounds, high=action_bounds, dtype=np.float32) def initialize_mp(self, num_dof: int, duration: int, dt: float, num_basis: int = 5, alpha_phase: float = 2., @@ -51,7 +51,7 @@ class DmpWrapper(MPWrapper): basis_bandwidth_factor=bandwidth_factor) dmp = dmps.DMP(num_dof=num_dof, basis_generator=basis_generator, phase_generator=phase_generator, - num_time_steps=int(duration / dt), dt=dt) + duration=duration, dt=dt) return dmp @@ -66,7 +66,7 @@ class DmpWrapper(MPWrapper): goal_pos = self.env.goal_pos assert goal_pos is not None - weight_matrix = np.reshape(params, self.mp.dmp_weights.shape) # [num_basis, num_dof] + weight_matrix = np.reshape(params, self.mp.weights.shape) # [num_basis, num_dof] return goal_pos * self.goal_scale, weight_matrix * self.weights_scale def mp_rollout(self, action): diff --git a/alr_envs/utils/mps/mp_environments.py b/alr_envs/utils/mps/mp_environments.py index f397491..60db99b 100644 --- a/alr_envs/utils/mps/mp_environments.py +++ b/alr_envs/utils/mps/mp_environments.py @@ -5,7 +5,7 @@ import gym import numpy as np -class MPEnv(gym.Env): +class AlrEnv(gym.Env): @property @abstractmethod diff --git a/alr_envs/utils/mps/mp_wrapper.py b/alr_envs/utils/mps/mp_wrapper.py index 4c92806..c31072f 100644 --- a/alr_envs/utils/mps/mp_wrapper.py +++ b/alr_envs/utils/mps/mp_wrapper.py @@ -3,13 +3,13 @@ from abc import ABC, abstractmethod import gym import numpy as np -from alr_envs.utils.mps.mp_environments import MPEnv +from alr_envs.utils.mps.mp_environments import AlrEnv from alr_envs.utils.policies import get_policy_class class MPWrapper(gym.Wrapper, ABC): - def __init__(self, env: MPEnv, num_dof: int, dt: float, duration: int = 1, post_traj_time: float = 0., + def __init__(self, env: AlrEnv, num_dof: int, dt: float, duration: float = 1, post_traj_time: float = 0., policy_type: str = None, weights_scale: float = 1., render_mode: str = None, **mp_kwargs): super().__init__(env) @@ -53,9 +53,6 @@ class MPWrapper(gym.Wrapper, ABC): return obs, np.array(rewards), dones, infos - def configure(self, context): - self.env.configure(context) - def reset(self): 
return self.env.reset()[self.env.active_obs] @@ -65,7 +62,7 @@ class MPWrapper(gym.Wrapper, ABC): if self.post_traj_steps > 0: trajectory = np.vstack([trajectory, np.tile(trajectory[-1, :], [self.post_traj_steps, 1])]) - velocity = np.vstack([velocity, np.zeros(shape=(self.post_traj_steps, self.mp.num_dimensions))]) + velocity = np.vstack([velocity, np.zeros(shape=(self.post_traj_steps, self.mp.n_dof))]) # self._trajectory = trajectory # self._velocity = velocity @@ -105,7 +102,7 @@ class MPWrapper(gym.Wrapper, ABC): raise NotImplementedError() @abstractmethod - def initialize_mp(self, num_dof: int, duration: int, dt: float, **kwargs): + def initialize_mp(self, num_dof: int, duration: float, dt: float, **kwargs): """ Create respective instance of MP Returns: diff --git a/example.py b/example.py index 166f38f..af30138 100644 --- a/example.py +++ b/example.py @@ -1,6 +1,7 @@ from collections import defaultdict import gym import numpy as np +from alr_envs.utils.mp_env_async_sampler import AlrMpEnvSampler, AlrContextualMpEnvSampler, DummyDist def example_mujoco(): @@ -22,9 +23,9 @@ def example_mujoco(): obs = env.reset() -def example_dmp(): +def example_mp(env_name="alr_envs:HoleReacherDMP-v0"): # env = gym.make("alr_envs:ViaPointReacherDMP-v0") - env = gym.make("alr_envs:HoleReacherDMP-v0") + env = gym.make(env_name) rewards = 0 # env.render(mode=None) obs = env.reset() @@ -79,9 +80,36 @@ def example_async(env_id="alr_envs:HoleReacherDMP-v0", n_cpu=4, seed=int('533D', print(sample(envs, 16)) +def example_async_sampler(env_name="alr_envs:HoleReacherDetPMP-v1", n_cpu=4): + n_samples = 10 + + sampler = AlrMpEnvSampler(env_name, num_envs=n_cpu) + dim = sampler.env.action_space.spaces[0].shape[0] + + thetas = np.random.randn(n_samples, dim) # usually form a search distribution + + _, rewards, __, ___ = sampler(thetas) + + print(rewards) + + +def example_async_contextual_sampler(env_name="alr_envs:SimpleReacherDMP-v1", n_cpu=4): + sampler = AlrContextualMpEnvSampler(env_name, num_envs=n_cpu) + dim = sampler.env.action_space.spaces[0].shape[0] + dist = DummyDist(dim) # needs a sample function + + n_samples = 10 + new_samples, new_contexts, obs, new_rewards, done, infos = sampler(dist, n_samples) + + print(new_rewards) + + if __name__ == '__main__': # example_mujoco() - # example_dmp() - example_async("alr_envs:LongSimpleReacherDMP-v0", 4) - # env = gym.make("alr_envs:HoleReacherDMP-v0", context=0.1) - # env = gym.make("alr_envs:HoleReacherDMP-v1") + # example_dmp("alr_envs:SimpleReacherDMP-v1") + # example_async("alr_envs:LongSimpleReacherDMP-v0", 4) + # example_async_contextual_sampler() + # env = gym.make("alr_envs:HoleReacherDetPMP-v1") + env_name = "alr_envs:ALRBallInACupSimpleDetPMP-v0" + # example_async_sampler(env_name) + example_mp(env_name) From 4aa31a004ab93813403e032b119fea24ca55523d Mon Sep 17 00:00:00 2001 From: Maximilian Huettenrauch Date: Thu, 27 May 2021 17:08:26 +0200 Subject: [PATCH 2/2] updates and bugfix in detpmp_wrapper --- alr_envs/__init__.py | 5 +- alr_envs/classic_control/hole_reacher.py | 4 +- .../mujoco/ball_in_a_cup/assets/biac_base.xml | 1 + .../mujoco/ball_in_a_cup/ball_in_a_cup.py | 13 ++--- .../ball_in_a_cup_reward_simple.py | 50 +++++++++++-------- alr_envs/utils/mps/detpmp_wrapper.py | 8 +-- 6 files changed, 46 insertions(+), 35 deletions(-) diff --git a/alr_envs/__init__.py b/alr_envs/__init__.py index 2a8054c..986265c 100644 --- a/alr_envs/__init__.py +++ b/alr_envs/__init__.py @@ -280,7 +280,7 @@ for v in versions: "num_dof": 5, "num_basis": 5, "duration": 2, - 
"width": 0.01, + "width": 0.025, "policy_type": "velocity", "weights_scale": 0.2, "zero_start": True @@ -352,7 +352,8 @@ register( "num_basis": 5, "duration": 3.5, "post_traj_time": 4.5, - "width": 0.005, + "width": 0.0035, + # "off": -0.05, "policy_type": "motor", "weights_scale": 0.2, "zero_start": True, diff --git a/alr_envs/classic_control/hole_reacher.py b/alr_envs/classic_control/hole_reacher.py index 1880ce3..730e7bf 100644 --- a/alr_envs/classic_control/hole_reacher.py +++ b/alr_envs/classic_control/hole_reacher.py @@ -73,12 +73,13 @@ class HoleReacherEnv(AlrEnv): acc = (action - self._angle_velocity) / self.dt self._angle_velocity = action - self._joint_angles = self._joint_angles + self.dt * self._angle_velocity + self._joint_angles = self._joint_angles + self.dt * self._angle_velocity # + 0.001 * np.random.randn(5) self._update_joints() reward, info = self._get_reward(acc) info.update({"is_collided": self._is_collided}) + self.end_effector_traj.append(np.copy(self.end_effector)) self._steps += 1 done = self._is_collided @@ -101,6 +102,7 @@ class HoleReacherEnv(AlrEnv): self._joints = np.zeros((self.n_links + 1, 2)) self._update_joints() self._steps = 0 + self.end_effector_traj = [] return self._get_obs().copy() diff --git a/alr_envs/mujoco/ball_in_a_cup/assets/biac_base.xml b/alr_envs/mujoco/ball_in_a_cup/assets/biac_base.xml index 58f0ac6..9229ad5 100644 --- a/alr_envs/mujoco/ball_in_a_cup/assets/biac_base.xml +++ b/alr_envs/mujoco/ball_in_a_cup/assets/biac_base.xml @@ -126,6 +126,7 @@ + diff --git a/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup.py b/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup.py index 613b631..345b3ce 100644 --- a/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup.py +++ b/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup.py @@ -90,26 +90,27 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle): reward_ctrl = - np.square(a).sum() crash = self.do_simulation(a) - joint_cons_viol = self.check_traj_in_joint_limits() + # joint_cons_viol = self.check_traj_in_joint_limits() self._q_pos.append(self.sim.data.qpos[0:7].ravel().copy()) self._q_vel.append(self.sim.data.qvel[0:7].ravel().copy()) ob = self._get_obs() - if not crash and not joint_cons_viol: - reward, success, stop_sim = self.reward_function.compute_reward(a, self.sim, self._steps) - done = success or self._steps == self.sim_steps - 1 or stop_sim + if not crash: + reward, success, is_collided = self.reward_function.compute_reward(a, self) + done = success or self._steps == self.sim_steps - 1 or is_collided self._steps += 1 else: - reward = -1000 + reward = -2 success = False + is_collided = False done = True return ob, reward, done, dict(reward_dist=reward_dist, reward_ctrl=reward_ctrl, velocity=angular_vel, traj=self._q_pos, is_success=success, - is_collided=crash or joint_cons_viol) + is_collided=is_collided, sim_crash=crash) def check_traj_in_joint_limits(self): return any(self.current_pos > self.j_max) or any(self.current_pos < self.j_min) diff --git a/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup_reward_simple.py b/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup_reward_simple.py index 13053eb..79987d6 100644 --- a/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup_reward_simple.py +++ b/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup_reward_simple.py @@ -6,7 +6,8 @@ class BallInACupReward(alr_reward_fct.AlrReward): def __init__(self, sim_time): self.sim_time = sim_time - self.collision_objects = ["cup_geom1", "cup_geom2", "wrist_palm_link_convex_geom", + self.collision_objects = ["cup_geom1", "cup_geom2", 
"cup_base_contact_below", + "wrist_palm_link_convex_geom", "wrist_pitch_link_convex_decomposition_p1_geom", "wrist_pitch_link_convex_decomposition_p2_geom", "wrist_pitch_link_convex_decomposition_p3_geom", @@ -20,6 +21,8 @@ class BallInACupReward(alr_reward_fct.AlrReward): self.goal_id = None self.goal_final_id = None self.collision_ids = None + self._is_collided = False + self.collision_penalty = 1 self.ball_traj = None self.dists = None @@ -36,49 +39,52 @@ class BallInACupReward(alr_reward_fct.AlrReward): self.action_costs = [] self.cup_angles = [] - def compute_reward(self, action, sim, step, context=None): - self.ball_id = sim.model._body_name2id["ball"] - self.ball_collision_id = sim.model._geom_name2id["ball_geom"] - self.goal_id = sim.model._site_name2id["cup_goal"] - self.goal_final_id = sim.model._site_name2id["cup_goal_final"] - self.collision_ids = [sim.model._geom_name2id[name] for name in self.collision_objects] + def compute_reward(self, action, env): + self.ball_id = env.sim.model._body_name2id["ball"] + self.ball_collision_id = env.sim.model._geom_name2id["ball_geom"] + self.goal_id = env.sim.model._site_name2id["cup_goal"] + self.goal_final_id = env.sim.model._site_name2id["cup_goal_final"] + self.collision_ids = [env.sim.model._geom_name2id[name] for name in self.collision_objects] - ball_in_cup = self.check_ball_in_cup(sim, self.ball_collision_id) + ball_in_cup = self.check_ball_in_cup(env.sim, self.ball_collision_id) # Compute the current distance from the ball to the inner part of the cup - goal_pos = sim.data.site_xpos[self.goal_id] - ball_pos = sim.data.body_xpos[self.ball_id] - goal_final_pos = sim.data.site_xpos[self.goal_final_id] + goal_pos = env.sim.data.site_xpos[self.goal_id] + ball_pos = env.sim.data.body_xpos[self.ball_id] + goal_final_pos = env.sim.data.site_xpos[self.goal_final_id] self.dists.append(np.linalg.norm(goal_pos - ball_pos)) self.dists_final.append(np.linalg.norm(goal_final_pos - ball_pos)) - self.ball_traj[step, :] = ball_pos - cup_quat = np.copy(sim.data.body_xquat[sim.model._body_name2id["cup"]]) + self.ball_traj[env._steps, :] = ball_pos + cup_quat = np.copy(env.sim.data.body_xquat[env.sim.model._body_name2id["cup"]]) self.cup_angles.append(np.arctan2(2 * (cup_quat[0] * cup_quat[1] + cup_quat[2] * cup_quat[3]), 1 - 2 * (cup_quat[1]**2 + cup_quat[2]**2))) action_cost = np.sum(np.square(action)) self.action_costs.append(action_cost) - if self.check_collision(sim): - reward = - 1000 - return reward, False, True + self._is_collided = self.check_collision(env.sim) or env.check_traj_in_joint_limits() - if step == self.sim_time - 1: + if env._steps == env.sim_steps - 1 or self._is_collided: t_min_dist = np.argmin(self.dists) angle_min_dist = self.cup_angles[t_min_dist] cost_angle = (angle_min_dist - np.pi / 2)**2 min_dist = self.dists[t_min_dist] dist_final = self.dists_final[-1] + min_dist_final = np.min(self.dists_final) - cost = 0.5 * min_dist + 0.5 * dist_final + 0.01 * cost_angle - reward = np.exp(-2 * cost) - 1e-3 * action_cost - success = dist_final < 0.05 and ball_in_cup + cost = 0.5 * dist_final + 0.05 * cost_angle # TODO: Increase cost_angle weight # 0.5 * min_dist + + # reward = np.exp(-2 * cost) - 1e-2 * action_cost - self.collision_penalty * int(self._is_collided) + # reward = - dist_final**2 - 1e-4 * cost_angle - 1e-5 * action_cost - self.collision_penalty * int(self._is_collided) + reward = - dist_final**2 - min_dist_final**2 - 1e-4 * cost_angle - 1e-5 * action_cost - self.collision_penalty * int(self._is_collided) + success = 
dist_final < 0.05 and ball_in_cup and not self._is_collided + crash = self._is_collided else: - reward = - 1e-3 * action_cost + reward = - 1e-5 * action_cost # TODO: increase action_cost weight success = False + crash = False - return reward, success, False + return reward, success, crash def check_ball_in_cup(self, sim, ball_collision_id): cup_base_collision_id = sim.model._geom_name2id["cup_base_contact"] diff --git a/alr_envs/utils/mps/detpmp_wrapper.py b/alr_envs/utils/mps/detpmp_wrapper.py index 98060e2..8661781 100644 --- a/alr_envs/utils/mps/detpmp_wrapper.py +++ b/alr_envs/utils/mps/detpmp_wrapper.py @@ -12,16 +12,16 @@ class DetPMPWrapper(MPWrapper): zero_start: bool = False, zero_goal: bool = False, **mp_kwargs): self.duration = duration # seconds + dt = env.dt if hasattr(env, "dt") else dt + assert dt is not None + self.dt = dt + super().__init__(env, num_dof, dt, duration, post_traj_time, policy_type, weights_scale, num_basis=num_basis, width=width, zero_start=zero_start, zero_goal=zero_goal, **mp_kwargs) - self.dt = env.dt if hasattr(env, "dt") else dt - assert self.dt is not None - action_bounds = np.inf * np.ones((self.mp.n_basis * self.mp.n_dof)) self.action_space = gym.spaces.Box(low=-action_bounds, high=action_bounds, dtype=np.float32) - def initialize_mp(self, num_dof: int, duration: int, dt: float, num_basis: int = 5, width: float = None, off: float = 0.01, zero_start: bool = False, zero_goal: bool = False): pmp = det_promp.DeterministicProMP(n_basis=num_basis, n_dof=num_dof, width=width, off=off,