diff --git a/alr_envs/alr/__init__.py b/alr_envs/alr/__init__.py
index 552adfa..fa9b71b 100644
--- a/alr_envs/alr/__init__.py
+++ b/alr_envs/alr/__init__.py
@@ -391,8 +391,7 @@ register(
     max_episode_steps=600,
     kwargs={
         "rndm_goal": False,
-        "cup_goal_pos": [0.1, -2.0],
-        "learn_release_step": True
+        "cup_goal_pos": [0.1, -2.0]
     }
 )
 
@@ -404,8 +403,7 @@ register(
     max_episode_steps=600,
     kwargs={
         "rndm_goal": True,
-        "cup_goal_pos": [-0.3, -1.2],
-        "learn_release_step": True
+        "cup_goal_pos": [-0.3, -1.2]
     }
 )
 
@@ -646,7 +644,7 @@ for _v in _versions:
             },
             "movement_primitives_kwargs": {
                 'movement_primitives_type': 'promp',
-                'num_dof': 7
+                'action_dim': 7
             },
             "phase_generator_kwargs": {
                 'phase_generator_type': 'linear',
@@ -658,7 +656,9 @@ for _v in _versions:
             "controller_kwargs": {
                 'controller_type': 'motor',
                 "p_gains": np.array([1.5, 5, 2.55, 3, 2., 2, 1.25]),
+                # "p_gains": 0.125*np.array([200, 300, 100, 100, 10, 10, 2.5]),
                 "d_gains": np.array([0.02333333, 0.1, 0.0625, 0.08, 0.03, 0.03, 0.0125]),
+                # "d_gains": 0.025*np.array([7, 15, 5, 2.5, 0.3, 0.3, 0.05]),
             },
             "basis_generator_kwargs": {
                 'basis_generator_type': 'zero_rbf',
@@ -753,29 +753,92 @@ for _v in _versions:
     )
     ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append(_env_id)
 
+# ## HopperJump
+# _versions = ["v0", "v1"]
+# for _v in _versions:
+#     _env_id = f'ALRHopperJumpProMP-{_v}'
+#     register(
+#         id= _env_id,
+#         entry_point='alr_envs.utils.make_env_helpers:make_promp_env_helper',
+#         kwargs={
+#             "name": f"alr_envs:ALRHopperJump-{_v}",
+#             "wrappers": [mujoco.hopper_jump.MPWrapper],
+#             "mp_kwargs": {
+#                 "num_dof": 3,
+#                 "num_basis": 5,
+#                 "duration": 2,
+#                 "post_traj_time": 0,
+#                 "policy_type": "motor",
+#                 "weights_scale": 1.0,
+#                 "zero_start": True,
+#                 "zero_goal": False,
+#                 "policy_kwargs": {
+#                     "p_gains": np.ones(3),
+#                     "d_gains": 0.1*np.ones(3)
+#                 }
+#             }
+#         }
+#     )
+#     ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append(_env_id)
+
+# ## HopperJump
+# register(
+#     id= "ALRHopperJumpProMP-v2",
+#     entry_point='alr_envs.utils.make_env_helpers:make_promp_env_helper',
+#     kwargs={
+#         "name": f"alr_envs:ALRHopperJump-v2",
+#         "wrappers": [mujoco.hopper_jump.HighCtxtMPWrapper],
+#         "mp_kwargs": {
+#             "num_dof": 3,
+#             "num_basis": 5,
+#             "duration": 2,
+#             "post_traj_time": 0,
+#             "policy_type": "motor",
+#             "weights_scale": 1.0,
+#             "zero_start": True,
+#             "zero_goal": False,
+#             "policy_kwargs": {
+#                 "p_gains": np.ones(3),
+#                 "d_gains": 0.1*np.ones(3)
+#             }
+#         }
+#     }
+# )
+# ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append("ALRHopperJumpProMP-v2")
+
 ## HopperJump
 _versions = ["v0", "v1"]
 for _v in _versions:
     _env_id = f'ALRHopperJumpProMP-{_v}'
     register(
         id= _env_id,
-        entry_point='alr_envs.utils.make_env_helpers:make_promp_env_helper',
+        entry_point='alr_envs.utils.make_env_helpers:make_mp_env_helper',
         kwargs={
             "name": f"alr_envs:ALRHopperJump-{_v}",
-            "wrappers": [mujoco.hopper_jump.MPWrapper],
-            "mp_kwargs": {
-                "num_dof": 3,
-                "num_basis": 5,
-                "duration": 2,
-                "post_traj_time": 0,
-                "policy_type": "motor",
-                "weights_scale": 1.0,
-                "zero_start": True,
-                "zero_goal": False,
-                "policy_kwargs": {
-                    "p_gains": np.ones(3),
-                    "d_gains": 0.1*np.ones(3)
-                }
+            "wrappers": [mujoco.hopper_jump.NewMPWrapper],
+            "ep_wrapper_kwargs": {
+                "weight_scale": 1
+            },
+            "movement_primitives_kwargs": {
+                'movement_primitives_type': 'promp',
+                'action_dim': 3
+            },
+            "phase_generator_kwargs": {
+                'phase_generator_type': 'linear',
+                'delay': 0,
+                'tau': 2,  # initial value
+                'learn_tau': False,
+                'learn_delay': False
+            },
+            "controller_kwargs": {
+                'controller_type': 'motor',
+                "p_gains": np.ones(3),
+                "d_gains": 0.1*np.ones(3),
+            },
+            "basis_generator_kwargs": {
+                'basis_generator_type': 'zero_rbf',
+                'num_basis': 5,
+                'num_basis_zero_start': 2
             }
         }
     )
@@ -784,25 +847,35 @@ for _v in _versions:
 ## HopperJump
 register(
     id= "ALRHopperJumpProMP-v2",
-    entry_point='alr_envs.utils.make_env_helpers:make_promp_env_helper',
+    entry_point='alr_envs.utils.make_env_helpers:make_mp_env_helper',
     kwargs={
         "name": f"alr_envs:ALRHopperJump-v2",
-        "wrappers": [mujoco.hopper_jump.HighCtxtMPWrapper],
-        "mp_kwargs": {
-            "num_dof": 3,
-            "num_basis": 5,
-            "duration": 2,
-            "post_traj_time": 0,
-            "policy_type": "motor",
-            "weights_scale": 1.0,
-            "zero_start": True,
-            "zero_goal": False,
-            "policy_kwargs": {
+        "wrappers": [mujoco.hopper_jump.NewHighCtxtMPWrapper],
+        "ep_wrapper_kwargs": {
+            "weight_scale": 1
+        },
+        "movement_primitives_kwargs": {
+            'movement_primitives_type': 'promp',
+            'action_dim': 3
+        },
+        "phase_generator_kwargs": {
+            'phase_generator_type': 'linear',
+            'delay': 0,
+            'tau': 2,  # initial value
+            'learn_tau': False,
+            'learn_delay': False
+        },
+        "controller_kwargs": {
+            'controller_type': 'motor',
             "p_gains": np.ones(3),
-            "d_gains": 0.1*np.ones(3)
+            "d_gains": 0.1*np.ones(3),
+        },
+        "basis_generator_kwargs": {
+            'basis_generator_type': 'zero_rbf',
+            'num_basis': 5,
+            'num_basis_zero_start': 2
         }
     }
-    }
 )
 ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append("ALRHopperJumpProMP-v2")
@@ -916,11 +989,4 @@ for _v in _versions:
             }
         }
     )
-    ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append(_env_id)
-
-
-# --------------------- Testing new mp wrapper -----------------------------------------------------
-
-# register(
-#     id='ALRReacherProMP-v0'
-# )
\ No newline at end of file
+    ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append(_env_id)
\ No newline at end of file
diff --git a/alr_envs/alr/mujoco/__init__.py b/alr_envs/alr/mujoco/__init__.py
index 9c7eecf..fb86186 100644
--- a/alr_envs/alr/mujoco/__init__.py
+++ b/alr_envs/alr/mujoco/__init__.py
@@ -1,4 +1,3 @@
-from .reacher.alr_reacher import ALRReacherEnv, ALRReacherOptCtrlEnv
 from .reacher.balancing import BalancingEnv
 from .ball_in_a_cup.ball_in_a_cup import ALRBallInACupEnv
 from .ball_in_a_cup.biac_pd import ALRBallInACupPDEnv
@@ -10,4 +9,5 @@ from .hopper_jump.hopper_jump import ALRHopperJumpEnv, ALRHopperJumpRndmPosEnv
 from .hopper_jump.hopper_jump_on_box import ALRHopperJumpOnBoxEnv
 from .hopper_throw.hopper_throw import ALRHopperThrowEnv
 from .hopper_throw.hopper_throw_in_basket import ALRHopperThrowInBasketEnv
-from .walker_2d_jump.walker_2d_jump import ALRWalker2dJumpEnv
\ No newline at end of file
+from .walker_2d_jump.walker_2d_jump import ALRWalker2dJumpEnv
+from .reacher.alr_reacher import ALRReacherEnv
\ No newline at end of file
diff --git a/alr_envs/alr/mujoco/beerpong/beerpong.py b/alr_envs/alr/mujoco/beerpong/beerpong.py
index 661d10c..36998b9 100644
--- a/alr_envs/alr/mujoco/beerpong/beerpong.py
+++ b/alr_envs/alr/mujoco/beerpong/beerpong.py
@@ -3,7 +3,6 @@ import os
 
 import numpy as np
 from gym import utils
-from gym import spaces
 from gym.envs.mujoco import MujocoEnv
 
 from alr_envs.alr.mujoco.beerpong.beerpong_reward_staged import BeerPongReward
@@ -18,7 +17,7 @@ CUP_POS_MAX = np.array([0.32, -1.2])
 
 class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
     def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False,
-                 rndm_goal=False, learn_release_step=True, cup_goal_pos=None):
+                 rndm_goal=False, cup_goal_pos=None):
         cup_goal_pos = np.array(cup_goal_pos if cup_goal_pos is not None else [-0.3, -1.2, 0.840])
         if cup_goal_pos.shape[0]==2:
             cup_goal_pos = np.insert(cup_goal_pos, 2, 0.840)
@@ -43,7 +42,7 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
 
         # self._release_step = 175  # time step of ball release
         # self._release_step = 130  # time step of ball release
-        self._release_step = 100  # time step of ball release
+        self.release_step = 100  # time step of ball release
 
         self.ep_length = 600  # based on 3 seconds with dt = 0.005 int(self.sim_time / self.dt)
         self.cup_table_id = 10
@@ -52,7 +51,6 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
             self.noise_std = 0.01
         else:
             self.noise_std = 0
-        self.learn_release_step = learn_release_step
         reward_function = BeerPongReward
         self.reward_function = reward_function()
 
@@ -63,13 +61,6 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
     def start_pos(self):
         return self._start_pos
 
-    def _set_action_space(self):
-        bounds = self.model.actuator_ctrlrange.copy().astype(np.float32)
-        bounds = np.concatenate((bounds, [[50, self.ep_length*0.333]]), axis=0)
-        low, high = bounds.T
-        self.action_space = spaces.Box(low=low, high=high, dtype=np.float32)
-        return self.action_space
-
     @property
     def start_vel(self):
         return self._start_vel
@@ -109,21 +100,22 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
         return self._get_obs()
 
     def step(self, a):
-        self._release_step = a[-1] if self.learn_release_step else self._release_step
-        self._release_step = np.clip(self._release_step, self.action_space.low[-1], self.action_space.high[-1]) \
-            if self.learn_release_step else self._release_step
+        # if a.shape[0] == 8: # we learn also when to release the ball
+        #     self._release_step = a[-1]
+        #     self._release_step = np.clip(self._release_step, 50, 250)
+        #     self.release_step = 0.5/self.dt
         reward_dist = 0.0
         angular_vel = 0.0
-        applied_action = a[:a.shape[0]-int(self.learn_release_step)]
+        applied_action = a
        reward_ctrl = - np.square(applied_action).sum()
         if self.apply_gravity_comp:
             applied_action += self.sim.data.qfrc_bias[:len(applied_action)].copy() / self.model.actuator_gear[:, 0]
         try:
             self.do_simulation(applied_action, self.frame_skip)
-            if self._steps < self._release_step:
+            if self._steps < self.release_step:
                 self.sim.data.qpos[7::] = self.sim.data.site_xpos[self.ball_site_id, :].copy()
                 self.sim.data.qvel[7::] = self.sim.data.site_xvelp[self.ball_site_id, :].copy()
-            elif self._steps == self._release_step and self.add_noise:
+            elif self._steps == self.release_step and self.add_noise:
                 self.sim.data.qvel[7::] += self.noise_std * np.random.randn(3)
             crash = False
         except mujoco_py.builder.MujocoException:
@@ -160,7 +152,8 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
                      ball_pos=ball_pos,
                      ball_vel=ball_vel,
                      success=success,
-                     is_collided=is_collided, sim_crash=crash)
+                     is_collided=is_collided, sim_crash=crash,
+                     table_contact_first=int(not self.reward_function.ball_ground_contact_first))
         infos.update(reward_infos)
         return ob, reward, done, infos
 
diff --git a/alr_envs/alr/mujoco/beerpong/beerpong_reward_staged.py b/alr_envs/alr/mujoco/beerpong/beerpong_reward_staged.py
index 910763a..bb5dd3f 100644
--- a/alr_envs/alr/mujoco/beerpong/beerpong_reward_staged.py
+++ b/alr_envs/alr/mujoco/beerpong/beerpong_reward_staged.py
@@ -162,12 +162,16 @@ class BeerPongReward:
                 min_dist_coeff, final_dist_coeff, rew_offset = 0, 1, 0
             reward = rew_offset - min_dist_coeff * min_dist ** 2 - final_dist_coeff * final_dist ** 2 - \
                      1e-4 * np.mean(action_cost)
-            if env.learn_release_step and not self.ball_in_cup:
-                too_small = (env._release_step<50)*(env._release_step-50)**2
-                too_big = (env._release_step>200)*0.2*(env._release_step-200)**2
-                reward = reward - too_small -too_big  # 1e-7*np.mean(action_cost)
+            # release step punishment
+            min_time_bound = 0.1
+            max_time_bound = 1.0
+            release_time = env.release_step*env.dt
+            release_time_rew = int(release_time<min_time_bound)*(-30-10*(release_time-min_time_bound)**2) + \
+                               int(release_time>max_time_bound)*(-30-10*(release_time-max_time_bound)**2)
+            reward += release_time_rew
             success = self.ball_in_cup
+            # print('release time :', release_time)
         else:
             reward = - 1e-2 * action_cost
             # reward = - 1e-4 * action_cost
diff --git a/alr_envs/alr/mujoco/beerpong/new_mp_wrapper.py b/alr_envs/alr/mujoco/beerpong/new_mp_wrapper.py
index 267e76e..7c4ee9d 100644
--- a/alr_envs/alr/mujoco/beerpong/new_mp_wrapper.py
+++ b/alr_envs/alr/mujoco/beerpong/new_mp_wrapper.py
@@ -21,24 +21,35 @@ class NewMPWrapper(EpisodicWrapper):
             [False]  # env steps
         ])
 
-    def _step_callback(self, t: int, env_spec_params: Union[np.ndarray, None], step_action: np.ndarray) -> Union[np.ndarray]:
-        if self.env.learn_release_step:
-            return np.concatenate((step_action, np.atleast_1d(env_spec_params)))
-        else:
-            return step_action
+    # def set_mp_action_space(self):
+    #     min_action_bounds, max_action_bounds = self.mp.get_param_bounds()
+    #     if self.mp.learn_tau:
+    #         min_action_bounds[0] = 20*self.env.dt
+    #         max_action_bounds[0] = 260*self.env.dt
+    #     mp_action_space = gym.spaces.Box(low=min_action_bounds.numpy(), high=max_action_bounds.numpy(),
+    #                                      dtype=np.float32)
+    #     return mp_action_space
+
+    # def _step_callback(self, t: int, env_spec_params: Union[np.ndarray, None], step_action: np.ndarray) -> Union[np.ndarray]:
+    #     if self.mp.learn_tau:
+    #         return np.concatenate((step_action, np.atleast_1d(env_spec_params)))
+    #     else:
+    #         return step_action
 
     def _episode_callback(self, action: np.ndarray) -> Tuple[np.ndarray, Union[np.ndarray, None]]:
-        if self.env.learn_release_step:
-            return action[:-1], action[-1]  # mp_params, release step
+        if self.mp.learn_tau:
+            self.env.env.release_step = action[0]/self.env.dt  # Tau value
+            # self.env.env.release_step = np.clip(action[0]/self.env.dt, 20, 260)  # Tau value
+            return action, None
         else:
             return action, None
 
-    def set_action_space(self):
-        if self.env.learn_release_step:
-            min_action_bounds, max_action_bounds = self.mp.get_param_bounds()
-            min_action_bounds = np.concatenate((min_action_bounds.numpy(), [self.env.action_space.low[-1]]))
-            max_action_bounds = np.concatenate((max_action_bounds.numpy(), [self.env.action_space.high[-1]]))
-            self.action_space = gym.spaces.Box(low=min_action_bounds, high=max_action_bounds, dtype=np.float32)
-            return self.action_space
-        else:
-            return super(NewMPWrapper, self).set_action_space()
+    # def set_action_space(self):
+    #     if self.mp.learn_tau:
+    #         min_action_bounds, max_action_bounds = self.mp.get_param_bounds()
+    #         min_action_bounds = np.concatenate((min_action_bounds.numpy(), [self.env.action_space.low[-1]]))
+    #         max_action_bounds = np.concatenate((max_action_bounds.numpy(), [self.env.action_space.high[-1]]))
+    #         self.action_space = gym.spaces.Box(low=min_action_bounds, high=max_action_bounds, dtype=np.float32)
+    #         return self.action_space
+    #     else:
+    #         return super(NewMPWrapper, self).set_action_space()
diff --git a/alr_envs/alr/mujoco/hopper_jump/__init__.py b/alr_envs/alr/mujoco/hopper_jump/__init__.py
index c6db9c2..fbffe48 100644
--- a/alr_envs/alr/mujoco/hopper_jump/__init__.py
+++ b/alr_envs/alr/mujoco/hopper_jump/__init__.py
@@ -1 +1,2 @@
 from .mp_wrapper import MPWrapper, HighCtxtMPWrapper
+from .new_mp_wrapper import NewMPWrapper, NewHighCtxtMPWrapper
diff --git a/alr_envs/alr/mujoco/hopper_jump/hopper_jump.py b/alr_envs/alr/mujoco/hopper_jump/hopper_jump.py
index e76cf4f..9a74bb0 100644
--- a/alr_envs/alr/mujoco/hopper_jump/hopper_jump.py
+++ b/alr_envs/alr/mujoco/hopper_jump/hopper_jump.py
@@ -80,8 +80,6 @@ class ALRHopperJumpEnv(HopperEnv):
 
     # overwrite reset_model to make it deterministic
     def reset_model(self):
-        noise_low = -self._reset_noise_scale
-        noise_high = self._reset_noise_scale
 
         qpos = self.init_qpos  # + self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nq)
         qvel = self.init_qvel  # + self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nv)
@@ -104,14 +102,26 @@ class ALRHopperJumpRndmPosEnv(ALRHopperJumpEnv):
     def reset_model(self):
         self._floor_geom_id = self.model.geom_name2id('floor')
         self._foot_geom_id = self.model.geom_name2id('foot_geom')
-        noise_low = -self._reset_noise_scale
-        noise_high = self._reset_noise_scale
+        noise_low = -np.ones(self.model.nq)*self._reset_noise_scale
+        noise_low[1] = 0
+        noise_low[2] = -0.3
+        noise_low[3] = -0.1
+        noise_low[4] = -1.1
+        noise_low[5] = -0.785
+
+        noise_high = np.ones(self.model.nq)*self._reset_noise_scale
+        noise_high[1] = 0
+        noise_high[2] = 0.3
+        noise_high[3] = 0
+        noise_high[4] = 0
+        noise_high[5] = 0.785
+
         rnd_vec = self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nq)
-        rnd_vec[2] *= 0.05  # the angle around the y axis shouldn't be too high as the agent then falls down quickly and
+        # rnd_vec[2] *= 0.05  # the angle around the y axis shouldn't be too high as the agent then falls down quickly and
         # can not recover
-        rnd_vec[1] = np.clip(rnd_vec[1], 0, 0.3)
+        # rnd_vec[1] = np.clip(rnd_vec[1], 0, 0.3)
         qpos = self.init_qpos + rnd_vec
-        qvel = self.init_qvel #+ self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nv)
+        qvel = self.init_qvel
 
         self.set_state(qpos, qvel)
 
@@ -167,16 +177,19 @@ if __name__ == '__main__':
     env = ALRHopperJumpRndmPosEnv()
     obs = env.reset()
 
-    for i in range(2000):
-        # objective.load_result("/tmp/cma")
-        # test with random actions
-        ac = env.action_space.sample()
-        obs, rew, d, info = env.step(ac)
-        # if i % 10 == 0:
-        #     env.render(mode=render_mode)
-        env.render(mode=render_mode)
-        if d:
-            print('After ', i, ' steps, done: ', d)
-            env.reset()
+    for k in range(10):
+        obs = env.reset()
+        print('observation :', obs[:6])
+        for i in range(200):
+            # objective.load_result("/tmp/cma")
+            # test with random actions
+            ac = env.action_space.sample()
+            obs, rew, d, info = env.step(ac)
+            # if i % 10 == 0:
+            #     env.render(mode=render_mode)
+            env.render(mode=render_mode)
+            if d:
+                print('After ', i, ' steps, done: ', d)
+                env.reset()
 
     env.close()
\ No newline at end of file
diff --git a/alr_envs/alr/mujoco/hopper_jump/new_mp_wrapper.py b/alr_envs/alr/mujoco/hopper_jump/new_mp_wrapper.py
new file mode 100644
index 0000000..9932403
--- /dev/null
+++ b/alr_envs/alr/mujoco/hopper_jump/new_mp_wrapper.py
@@ -0,0 +1,29 @@
+from alr_envs.mp.episodic_wrapper import EpisodicWrapper
+from typing import Union, Tuple
+import numpy as np
+
+
+class NewMPWrapper(EpisodicWrapper):
+    @property
+    def current_pos(self) -> Union[float, int, np.ndarray, Tuple]:
+        return self.env.sim.data.qpos[3:6].copy()
+
+    @property
+    def current_vel(self) -> Union[float, int, np.ndarray, Tuple]:
+        return self.env.sim.data.qvel[3:6].copy()
+
+    def set_active_obs(self):
+        return np.hstack([
+            [False] * (5 + int(not self.env.exclude_current_positions_from_observation)),  # position
+            [False] * 6,  # velocity
+            [True]
+        ])
+
+
+class NewHighCtxtMPWrapper(NewMPWrapper):
+    def set_active_obs(self):
+        return np.hstack([
+            [True] * (5 + int(not self.env.exclude_current_positions_from_observation)),  # position
+            [False] * 6,  # velocity
+            [False]
+        ])
\ No newline at end of file
diff --git a/alr_envs/alr/mujoco/reacher/alr_reacher.py b/alr_envs/alr/mujoco/reacher/alr_reacher.py
index b436fdd..6b36407 100644
--- a/alr_envs/alr/mujoco/reacher/alr_reacher.py
+++ b/alr_envs/alr/mujoco/reacher/alr_reacher.py
@@ -68,35 +68,35 @@ class ALRReacherEnv(MujocoEnv, utils.EzPickle):
     def viewer_setup(self):
         self.viewer.cam.trackbodyid = 0
 
-    # def reset_model(self):
-    #     qpos = self.init_qpos
-    #     if not hasattr(self, "goal"):
-    #         self.goal = np.array([-0.25, 0.25])
-    #         # self.goal = self.init_qpos.copy()[:2] + 0.05
-    #     qpos[-2:] = self.goal
-    #     qvel = self.init_qvel
-    #     qvel[-2:] = 0
-    #     self.set_state(qpos, qvel)
-    #     self._steps = 0
-    #
-    #     return self._get_obs()
-
     def reset_model(self):
-        qpos = self.init_qpos.copy()
-        while True:
-            self.goal = self.np_random.uniform(low=-self.n_links / 10, high=self.n_links / 10, size=2)
-            # self.goal = self.np_random.uniform(low=0, high=self.n_links / 10, size=2)
-            # self.goal = np.random.uniform(low=[-self.n_links / 10, 0], high=[0, self.n_links / 10], size=2)
-            if np.linalg.norm(self.goal) < self.n_links / 10:
-                break
+        qpos = self.init_qpos
+        if not hasattr(self, "goal"):
+            self.goal = np.array([-0.25, 0.25])
+            # self.goal = self.init_qpos.copy()[:2] + 0.05
         qpos[-2:] = self.goal
-        qvel = self.init_qvel.copy()
+        qvel = self.init_qvel
         qvel[-2:] = 0
         self.set_state(qpos, qvel)
         self._steps = 0
 
         return self._get_obs()
 
+    # def reset_model(self):
+    #     qpos = self.init_qpos.copy()
+    #     while True:
+    #         self.goal = self.np_random.uniform(low=-self.n_links / 10, high=self.n_links / 10, size=2)
+    #         # self.goal = self.np_random.uniform(low=0, high=self.n_links / 10, size=2)
+    #         # self.goal = np.random.uniform(low=[-self.n_links / 10, 0], high=[0, self.n_links / 10], size=2)
+    #         if np.linalg.norm(self.goal) < self.n_links / 10:
+    #             break
+    #     qpos[-2:] = self.goal
+    #     qvel = self.init_qvel.copy()
+    #     qvel[-2:] = 0
+    #     self.set_state(qpos, qvel)
+    #     self._steps = 0
+    #
+    #     return self._get_obs()
+
     # def reset_model(self):
     #     qpos = self.np_random.uniform(low=-0.1, high=0.1, size=self.model.nq) + self.init_qpos
     #     while True:
diff --git a/alr_envs/mp/episodic_wrapper.py b/alr_envs/mp/episodic_wrapper.py
index 092f1bc..7426c15 100644
--- a/alr_envs/mp/episodic_wrapper.py
+++ b/alr_envs/mp/episodic_wrapper.py
@@ -50,13 +50,10 @@ class EpisodicWrapper(gym.Env, ABC):
         # rendering
         self.render_mode = render_mode
         self.render_kwargs = {}
-        # self.time_steps = np.linspace(0, self.duration, self.traj_steps + 1)
         self.time_steps = np.linspace(0, self.duration, self.traj_steps)
         self.mp.set_mp_times(self.time_steps)
         # action_bounds = np.inf * np.ones((np.prod(self.mp.num_params)))
-        min_action_bounds, max_action_bounds = mp.get_param_bounds()
-        self.mp_action_space = gym.spaces.Box(low=min_action_bounds.numpy(), high=max_action_bounds.numpy(),
-                                              dtype=np.float32)
+        self.mp_action_space = self.set_mp_action_space()
 
         self.action_space = self.set_action_space()
         self.active_obs = self.set_active_obs()
@@ -65,7 +62,7 @@ class EpisodicWrapper(gym.Env, ABC):
                                                 dtype=self.env.observation_space.dtype)
 
     def get_trajectory(self, action: np.ndarray) -> Tuple:
-        # TODO: this follows the implementation of the mp_pytorch library which includes the paramters tau and delay at
+        # TODO: this follows the implementation of the mp_pytorch library which includes the parameters tau and delay at
         # the beginning of the array.
         ignore_indices = int(self.mp.learn_tau) + int(self.mp.learn_delay)
         scaled_mp_params = action.copy()
@@ -84,6 +81,13 @@ class EpisodicWrapper(gym.Env, ABC):
 
         return trajectory, velocity
 
+    def set_mp_action_space(self):
+        """This function can be used to set up an individual space for the parameters of the mp."""
+        min_action_bounds, max_action_bounds = self.mp.get_param_bounds()
+        mp_action_space = gym.spaces.Box(low=min_action_bounds.numpy(), high=max_action_bounds.numpy(),
+                                         dtype=np.float32)
+        return mp_action_space
+
     def set_action_space(self):
         """
         This function can be used to modify the action space for considering actions which are not learned via motion
@@ -179,6 +183,7 @@ class EpisodicWrapper(gym.Env, ABC):
             step_action = self.controller.get_action(pos_vel[0], pos_vel[1], self.current_pos, self.current_vel)
             step_action = self._step_callback(t, env_spec_params, step_action)  # include possible callback info
             c_action = np.clip(step_action, self.env.action_space.low, self.env.action_space.high)
+            # print('step/clipped action ratio: ', step_action/c_action)
            obs, c_reward, done, info = self.env.step(c_action)
             if self.verbose >= 2:
                 actions[t, :] = c_action
diff --git a/alr_envs/utils/make_env_helpers.py b/alr_envs/utils/make_env_helpers.py
index 0c06906..f22e0d7 100644
--- a/alr_envs/utils/make_env_helpers.py
+++ b/alr_envs/utils/make_env_helpers.py
@@ -156,12 +156,11 @@ def make_mp_from_kwargs(
         ep_wrapper_kwargs['duration'] = dummy_env.spec.max_episode_steps*dummy_env.dt
     if phase_kwargs.get('tau', None) is None:
         phase_kwargs['tau'] = ep_wrapper_kwargs['duration']
-    action_dim = mp_kwargs.pop('num_dof', None)
-    action_dim = action_dim if action_dim is not None else np.prod(dummy_env.action_space.shape).item()
+    mp_kwargs['action_dim'] = mp_kwargs.get('action_dim', np.prod(dummy_env.action_space.shape).item())
     phase_gen = get_phase_generator(**phase_kwargs)
     basis_gen = get_basis_generator(phase_generator=phase_gen, **basis_kwargs)
     controller = get_controller(**controller_kwargs)
-    mp = get_movement_primitive(action_dim=action_dim, basis_generator=basis_gen, **mp_kwargs)
+    mp = get_movement_primitive(basis_generator=basis_gen, **mp_kwargs)
     _env = _make_wrapped_env(env_id=env_id, wrappers=wrappers, mp=mp, controller=controller,
                              ep_wrapper_kwargs=ep_wrapper_kwargs, seed=seed, **kwargs)
     return _env
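
For orientation, a rough usage sketch of one of the re-registered environments under the new episodic-wrapper configuration. It is not part of the patch: `alr_envs.make` and its `seed` argument are assumed from the package README, so treat this as an illustration of the new kwargs layout rather than tested code.

import alr_envs

# `make_mp_env_helper` assembles the stack registered above: NewMPWrapper around
# ALRHopperJump-v0, a ProMP with action_dim=3 and 5 zero_rbf basis functions
# (2 of them zero at the start), and a motor controller with the given p/d gains.
env = alr_envs.make("ALRHopperJumpProMP-v0", seed=1)

obs = env.reset()
# One step of the episodic wrapper consumes a full ProMP parameter vector and
# rolls out the resulting trajectory in the underlying MuJoCo environment.
params = env.action_space.sample()
obs, reward, done, info = env.step(params)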