diff --git a/alr_envs/alr/__init__.py b/alr_envs/alr/__init__.py
index d3e976c..09f533a 100644
--- a/alr_envs/alr/__init__.py
+++ b/alr_envs/alr/__init__.py
@@ -203,6 +203,34 @@ register(
     }
 )
 
+_vs = np.arange(101).tolist() + [1e-5, 5e-5, 1e-4, 5e-4, 1e-3, 5e-3, 1e-2, 5e-2, 1e-1, 5e-1]
+for i in _vs:
+    _env_id = f'ALRReacher{i}-v0'
+    register(
+        id=_env_id,
+        entry_point='alr_envs.alr.mujoco:ALRReacherEnv',
+        max_episode_steps=200,
+        kwargs={
+            "steps_before_reward": 0,
+            "n_links": 5,
+            "balance": False,
+            'ctrl_cost_weight': i
+        }
+    )
+
+    _env_id = f'ALRReacherSparse{i}-v0'
+    register(
+        id=_env_id,
+        entry_point='alr_envs.alr.mujoco:ALRReacherEnv',
+        max_episode_steps=200,
+        kwargs={
+            "steps_before_reward": 200,
+            "n_links": 5,
+            "balance": False,
+            'ctrl_cost_weight': i
+        }
+    )
+
 # CtxtFree are v0, Contextual are v1
 register(
     id='ALRAntJump-v0',
@@ -458,6 +486,18 @@ register(
     }
 )
 
+# Beerpong with episodic reward, but fixed release time step
+register(
+    id='ALRBeerPong-v4',
+    entry_point='alr_envs.alr.mujoco:ALRBeerBongEnvFixedReleaseStep',
+    max_episode_steps=300,
+    kwargs={
+        "rndm_goal": True,
+        "cup_goal_pos": [-0.3, -1.2],
+        "frame_skip": 2
+    }
+)
+
 # Motion Primitive Environments
 
 ## Simple Reacher
@@ -648,6 +688,56 @@ for _v in _versions:
     )
     ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append(_env_id)
 
+
+_vs = np.arange(101).tolist() + [1e-5, 5e-5, 1e-4, 5e-4, 1e-3, 5e-3, 1e-2, 5e-2, 1e-1, 5e-1]
+for i in _vs:
+    _env_id = f'ALRReacher{i}ProMP-v0'
+    register(
+        id=_env_id,
+        entry_point='alr_envs.utils.make_env_helpers:make_promp_env_helper',
+        kwargs={
+            "name": f"alr_envs:{_env_id.replace('ProMP', '')}",
+            "wrappers": [mujoco.reacher.MPWrapper],
+            "mp_kwargs": {
+                "num_dof": 5,
+                "num_basis": 5,
+                "duration": 4,
+                "policy_type": "motor",
+                # "weights_scale": 5,
+                "n_zero_basis": 1,
+                "zero_start": True,
+                "policy_kwargs": {
+                    "p_gains": 1,
+                    "d_gains": 0.1
+                }
+            }
+        }
+    )
+
+    _env_id = f'ALRReacherSparse{i}ProMP-v0'
+    register(
+        id=_env_id,
+        entry_point='alr_envs.utils.make_env_helpers:make_promp_env_helper',
+        kwargs={
+            "name": f"alr_envs:{_env_id.replace('ProMP', '')}",
+            "wrappers": [mujoco.reacher.MPWrapper],
+            "mp_kwargs": {
+                "num_dof": 5,
+                "num_basis": 5,
+                "duration": 4,
+                "policy_type": "motor",
+                # "weights_scale": 5,
+                "n_zero_basis": 1,
+                "zero_start": True,
+                "policy_kwargs": {
+                    "p_gains": 1,
+                    "d_gains": 0.1
+                }
+            }
+        }
+    )
+
+
 # ## Beerpong
 # _versions = ["v0", "v1"]
 # for _v in _versions:
@@ -717,6 +807,42 @@ for _v in _versions:
     )
     ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append(_env_id)
 
+## Beerpong ProMP fixed release
+_env_id = 'BeerpongProMP-v2'
+register(
+    id=_env_id,
+    entry_point='alr_envs.utils.make_env_helpers:make_mp_env_helper',
+    kwargs={
+        "name": "alr_envs:ALRBeerPong-v4",
+        "wrappers": [mujoco.beerpong.NewMPWrapper],
+        "ep_wrapper_kwargs": {
+            "weight_scale": 1
+        },
+        "movement_primitives_kwargs": {
+            'movement_primitives_type': 'promp',
+            'action_dim': 7
+        },
+        "phase_generator_kwargs": {
+            'phase_generator_type': 'linear',
+            'delay': 0,
+            'tau': 0.62,  # initial value
+            'learn_tau': False,
+            'learn_delay': False
+        },
+        "controller_kwargs": {
+            'controller_type': 'motor',
+            "p_gains": np.array([1.5, 5, 2.55, 3, 2., 2, 1.25]),
+            "d_gains": np.array([0.02333333, 0.1, 0.0625, 0.08, 0.03, 0.03, 0.0125]),
+        },
+        "basis_generator_kwargs": {
+            'basis_generator_type': 'zero_rbf',
+            'num_basis': 2,
+            'num_basis_zero_start': 2
+        }
+    }
+)
+ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append(_env_id)
+
 ## Table Tennis
 ctxt_dim = [2, 4]
 for _v, cd in enumerate(ctxt_dim):
diff --git a/alr_envs/alr/mujoco/__init__.py b/alr_envs/alr/mujoco/__init__.py
index 2b19331..2885321 100644
--- a/alr_envs/alr/mujoco/__init__.py
+++ b/alr_envs/alr/mujoco/__init__.py
@@ -2,7 +2,7 @@ from .reacher.balancing import BalancingEnv
 from .ball_in_a_cup.ball_in_a_cup import ALRBallInACupEnv
 from .ball_in_a_cup.biac_pd import ALRBallInACupPDEnv
 from .table_tennis.tt_gym import TTEnvGym
-from .beerpong.beerpong import ALRBeerBongEnv, ALRBeerBongEnvStepBased, ALRBeerBongEnvStepBasedEpisodicReward
+from .beerpong.beerpong import ALRBeerBongEnv, ALRBeerBongEnvStepBased, ALRBeerBongEnvStepBasedEpisodicReward, ALRBeerBongEnvFixedReleaseStep
 from .ant_jump.ant_jump import ALRAntJumpEnv
 from .half_cheetah_jump.half_cheetah_jump import ALRHalfCheetahJumpEnv
 from .hopper_jump.hopper_jump import ALRHopperJumpEnv, ALRHopperJumpRndmPosEnv, ALRHopperXYJumpEnv, ALRHopperXYJumpEnvStepBased
diff --git a/alr_envs/alr/mujoco/beerpong/beerpong.py b/alr_envs/alr/mujoco/beerpong/beerpong.py
index 9028bd0..5040ec7 100644
--- a/alr_envs/alr/mujoco/beerpong/beerpong.py
+++ b/alr_envs/alr/mujoco/beerpong/beerpong.py
@@ -185,6 +185,10 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
     def dt(self):
         return super(ALRBeerBongEnv, self).dt*self.repeat_action
 
+class ALRBeerBongEnvFixedReleaseStep(ALRBeerBongEnv):
+    def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None):
+        super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos)
+        self.release_step = 62  # empirically evaluated for frame_skip=2!
 
 class ALRBeerBongEnvStepBasedEpisodicReward(ALRBeerBongEnv):
     def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None):
@@ -206,6 +210,25 @@ class ALRBeerBongEnvStepBasedEpisodicReward(ALRBeerBongEnv):
             # internal steps and thus, the observation also needs to be set correctly
         return ob, reward, done, infos
 
+
+# class ALRBeerBongEnvStepBasedEpisodicReward(ALRBeerBongEnv):
+#     def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None):
+#         super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos)
+#         self.release_step = 62  # empirically evaluated for frame_skip=2!
+#
+#     def step(self, a):
+#         if self._steps < self.release_step:
+#             return super(ALRBeerBongEnvStepBasedEpisodicReward, self).step(a)
+#         else:
+#             sub_ob, sub_reward, done, sub_infos = super(ALRBeerBongEnvStepBasedEpisodicReward, self).step(np.zeros(a.shape))
+#             reward = sub_reward
+#             infos = sub_infos
+#             ob = sub_ob
+#             ob[-1] = self.release_step + 1  # Since we simulate until the end of the episode, PPO does not see the
+#             # internal steps and thus, the observation also needs to be set correctly
+#         return ob, reward, done, infos
+
+
 class ALRBeerBongEnvStepBased(ALRBeerBongEnv):
     def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None):
         super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos)
@@ -259,7 +282,8 @@ class ALRBeerBongEnvStepBased(ALRBeerBongEnv):
 if __name__ == "__main__":
     # env = ALRBeerBongEnv(rndm_goal=True)
     # env = ALRBeerBongEnvStepBased(frame_skip=2, rndm_goal=True)
-    env = ALRBeerBongEnvStepBasedEpisodicReward(frame_skip=2, rndm_goal=True)
+    # env = ALRBeerBongEnvStepBasedEpisodicReward(frame_skip=2, rndm_goal=True)
+    env = ALRBeerBongEnvFixedReleaseStep(frame_skip=2, rndm_goal=True)
     import time
     env.reset()
     env.render("human")
diff --git a/alr_envs/alr/mujoco/hopper_jump/hopper_jump.py b/alr_envs/alr/mujoco/hopper_jump/hopper_jump.py
index 2759fd4..20ede40 100644
--- a/alr_envs/alr/mujoco/hopper_jump/hopper_jump.py
+++ b/alr_envs/alr/mujoco/hopper_jump/hopper_jump.py
@@ -123,7 +123,6 @@ class ALRHopperJumpEnv(HopperEnv):
 
 class ALRHopperXYJumpEnv(ALRHopperJumpEnv):
     def step(self, action):
-
         self._floor_geom_id = self.model.geom_name2id('floor')
         self._foot_geom_id = self.model.geom_name2id('foot_geom')
 
@@ -173,7 +172,8 @@ class ALRHopperXYJumpEnv(ALRHopperJumpEnv):
             'goal_dist': goal_dist,
             'height_rew': self.max_height,
             'healthy_reward': self.healthy_reward * 2,
-            'healthy': self.is_healthy
+            'healthy': self.is_healthy,
+            'contact_dist': self.contact_dist if self.contact_dist is not None else 0
         }
         return observation, reward, done, info
 
@@ -194,7 +194,6 @@ class ALRHopperXYJumpEnv(ALRHopperJumpEnv):
         rnd_vec = self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nq)
         qpos = self.init_qpos + rnd_vec
         qvel = self.init_qvel
-
         self.set_state(qpos, qvel)
 
         observation = self._get_obs()
@@ -207,9 +206,6 @@ class ALRHopperXYJumpEnv(ALRHopperJumpEnv):
 
     def reset(self):
         super().reset()
-        # self.goal = np.random.uniform(-1.5, 1.5, 1)
-        # self.goal = np.random.uniform(0, 1.5, 1)
-        # self.goal = self.np_random.uniform(0, 1.5, 1)
         self.goal = self.np_random.uniform(0.3, 1.35, 1)[0]
         self.sim.model.body_pos[self.sim.model.body_name2id('goal_site_body')] = np.array([self.goal, 0, 0])
         return self.reset_model()
@@ -219,6 +215,16 @@ class ALRHopperXYJumpEnv(ALRHopperJumpEnv):
                     - np.array([self.goal, 0, 0])
         return np.concatenate((super(ALRHopperXYJumpEnv, self)._get_obs(), goal_diff))
 
+    def set_context(self, context):
+        # context is 4 dimensional
+        qpos = self.init_qpos
+        qvel = self.init_qvel
+        qpos[-3:] = context[:3]
+        self.goal = context[-1]
+        self.set_state(qpos, qvel)
+        self.sim.model.body_pos[self.sim.model.body_name2id('goal_site_body')] = np.array([self.goal, 0, 0])
+        return self._get_obs()
+
 
 class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv):
     def __init__(self,
@@ -246,10 +252,6 @@ class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv):
                          reset_noise_scale, exclude_current_positions_from_observation, max_episode_steps)
 
     def step(self, action):
-        print("")
-        print('height_scale: ', self.height_scale)
-        print('healthy_scale: ', self.healthy_scale)
-        print('dist_scale: ', self.dist_scale)
         self._floor_geom_id = self.model.geom_name2id('floor')
         self._foot_geom_id = self.model.geom_name2id('foot_geom')
 
@@ -268,6 +270,23 @@ class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv):
         reward = -ctrl_cost + healthy_reward + dist_reward
         done = False
         observation = self._get_obs()
+
+
+        ###########################################################
+        # This is only for logging the distance to goal when first having the contact
+        ##########################################################
+        floor_contact = self._contact_checker(self._floor_geom_id,
+                                              self._foot_geom_id) if not self.contact_with_floor else False
+        if not self.init_floor_contact:
+            self.init_floor_contact = floor_contact
+        if self.init_floor_contact and not self.has_left_floor:
+            self.has_left_floor = not floor_contact
+        if not self.contact_with_floor and self.has_left_floor:
+            self.contact_with_floor = floor_contact
+
+        if self.contact_dist is None and self.contact_with_floor:
+            self.contact_dist = np.linalg.norm(self.sim.data.site_xpos[self.model.site_name2id('foot_site')]
+                                               - np.array([self.goal, 0, 0]))
         info = {
             'height': height_after,
             'x_pos': site_pos_after,
@@ -275,8 +294,9 @@ class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv):
             'goal': self.goal,
             'goal_dist': goal_dist,
             'height_rew': self.max_height,
-            'healthy_reward': self.healthy_reward * 2,
-            'healthy': self.is_healthy
+            'healthy_reward': self.healthy_reward * self.healthy_reward,
+            'healthy': self.is_healthy,
+            'contact_dist': self.contact_dist if self.contact_dist is not None else 0
         }
         return observation, reward, done, info
 
@@ -361,7 +381,7 @@ if __name__ == '__main__':
     # env = ALRHopperJumpRndmPosEnv()
     obs = env.reset()
 
-    for k in range(10):
+    for k in range(1000):
         obs = env.reset()
         print('observation :', obs[:])
         for i in range(200):
diff --git a/alr_envs/alr/mujoco/hopper_jump/new_mp_wrapper.py b/alr_envs/alr/mujoco/hopper_jump/new_mp_wrapper.py
index 7a26d62..77a1bf6 100644
--- a/alr_envs/alr/mujoco/hopper_jump/new_mp_wrapper.py
+++ b/alr_envs/alr/mujoco/hopper_jump/new_mp_wrapper.py
@@ -38,4 +38,8 @@ class NewHighCtxtMPWrapper(NewMPWrapper):
             [False] * 6,  # velocity
             [True],  # goal
             [False] * 3  # goal diff
-        ])
\ No newline at end of file
+        ])
+
+    def set_context(self, context):
+        return self.get_observation_from_step(self.env.env.set_context(context))
+
diff --git a/alr_envs/alr/mujoco/reacher/alr_reacher.py b/alr_envs/alr/mujoco/reacher/alr_reacher.py
index 6b36407..c12352a 100644
--- a/alr_envs/alr/mujoco/reacher/alr_reacher.py
+++ b/alr_envs/alr/mujoco/reacher/alr_reacher.py
@@ -8,7 +8,8 @@ import alr_envs.utils.utils as alr_utils
 
 
 class ALRReacherEnv(MujocoEnv, utils.EzPickle):
-    def __init__(self, steps_before_reward=200, n_links=5, balance=False):
+    def __init__(self, steps_before_reward: int = 200, n_links: int = 5, ctrl_cost_weight: int = 1,
+                 balance: bool = False):
         utils.EzPickle.__init__(**locals())
 
         self._steps = 0
@@ -17,6 +18,7 @@ class ALRReacherEnv(MujocoEnv, utils.EzPickle):
 
         self.balance = balance
         self.balance_weight = 1.0
+        self.ctrl_cost_weight = ctrl_cost_weight
 
         self.reward_weight = 1
         if steps_before_reward == 200:
@@ -40,7 +42,7 @@ class ALRReacherEnv(MujocoEnv, utils.EzPickle):
         angular_vel = 0.0
         reward_balance = 0.0
         is_delayed = self.steps_before_reward > 0
-        reward_ctrl = - np.square(a).sum()
+        reward_ctrl = - np.square(a).sum() * self.ctrl_cost_weight
        if self._steps >= self.steps_before_reward:
             vec = self.get_body_com("fingertip") - self.get_body_com("target")
             reward_dist -= self.reward_weight * np.linalg.norm(vec)
@@ -48,9 +50,9 @@ class ALRReacherEnv(MujocoEnv, utils.EzPickle):
             # avoid giving this penalty for normal step based case
             # angular_vel -= 10 * np.linalg.norm(self.sim.data.qvel.flat[:self.n_links])
             angular_vel -= 10 * np.square(self.sim.data.qvel.flat[:self.n_links]).sum()
-        if is_delayed:
-            # Higher control penalty for sparse reward per timestep
-            reward_ctrl *= 10
+        # if is_delayed:
+        #     # Higher control penalty for sparse reward per timestep
+        #     reward_ctrl *= 10
 
         if self.balance:
             reward_balance -= self.balance_weight * np.abs(
@@ -68,35 +70,42 @@ class ALRReacherEnv(MujocoEnv, utils.EzPickle):
     def viewer_setup(self):
         self.viewer.cam.trackbodyid = 0
 
-    def reset_model(self):
-        qpos = self.init_qpos
-        if not hasattr(self, "goal"):
-            self.goal = np.array([-0.25, 0.25])
-            # self.goal = self.init_qpos.copy()[:2] + 0.05
-        qpos[-2:] = self.goal
-        qvel = self.init_qvel
-        qvel[-2:] = 0
-        self.set_state(qpos, qvel)
-        self._steps = 0
-
-        return self._get_obs()
-    # def reset_model(self):
-    #     qpos = self.init_qpos.copy()
-    #     while True:
-    #         self.goal = self.np_random.uniform(low=-self.n_links / 10, high=self.n_links / 10, size=2)
-    #         # self.goal = self.np_random.uniform(low=0, high=self.n_links / 10, size=2)
-    #         # self.goal = np.random.uniform(low=[-self.n_links / 10, 0], high=[0, self.n_links / 10], size=2)
-    #         if np.linalg.norm(self.goal) < self.n_links / 10:
-    #             break
+    # qpos = self.init_qpos
+    # if not hasattr(self, "goal"):
+    #     self.goal = np.array([-0.25, 0.25])
+    #     # self.goal = self.init_qpos.copy()[:2] + 0.05
     # qpos[-2:] = self.goal
     # qvel = self.init_qvel
     # qvel[-2:] = 0
     # self.set_state(qpos, qvel)
     # self._steps = 0
     #
     # return self._get_obs()
+    def reset_model(self):
+        qpos = self.init_qpos.copy()
+        while True:
+            # full space
+            # self.goal = self.np_random.uniform(low=-self.n_links / 10, high=self.n_links / 10, size=2)
+            # I Quadrant
+            # self.goal = self.np_random.uniform(low=0, high=self.n_links / 10, size=2)
+            # II Quadrant
+            # self.goal = np.random.uniform(low=[-self.n_links / 10, 0], high=[0, self.n_links / 10], size=2)
+            # II + III Quadrant
+            # self.goal = np.random.uniform(low=-self.n_links / 10, high=[0, self.n_links / 10], size=2)
+            # I + II Quadrant
+            self.goal = np.random.uniform(low=[-self.n_links / 10, 0], high=self.n_links, size=2)
+            if np.linalg.norm(self.goal) < self.n_links / 10:
+                break
+        qpos[-2:] = self.goal
+        qvel = self.init_qvel.copy()
+        qvel[-2:] = 0
+        self.set_state(qpos, qvel)
+        self._steps = 0
+
+        return self._get_obs()
+
+    # def reset_model(self):
     #     qpos = self.np_random.uniform(low=-0.1, high=0.1, size=self.model.nq) + self.init_qpos
     #     while True:
@@ -140,4 +149,4 @@ if __name__ == '__main__':
         if d:
             env.reset()
 
-    env.close()
+    env.close()
\ No newline at end of file
diff --git a/alr_envs/alr/mujoco/reacher/mp_wrapper.py b/alr_envs/alr/mujoco/reacher/mp_wrapper.py
index abcdc50..3b655d4 100644
--- a/alr_envs/alr/mujoco/reacher/mp_wrapper.py
+++ b/alr_envs/alr/mujoco/reacher/mp_wrapper.py
@@ -40,4 +40,4 @@ class MPWrapper(MPEnvWrapper):
 
     @property
     def dt(self) -> Union[float, int]:
-        return self.env.dt
+        return self.env.dt
\ No newline at end of file
diff --git a/alr_envs/utils/make_env_helpers.py b/alr_envs/utils/make_env_helpers.py
index f22e0d7..301a5aa 100644
--- a/alr_envs/utils/make_env_helpers.py
+++ b/alr_envs/utils/make_env_helpers.py
@@ -98,7 +98,6 @@ def make(env_id: str, seed, **kwargs):
     return env
 
 
-
 def _make_wrapped_env(env_id: str, wrappers: Iterable[Type[gym.Wrapper]], mp: MPInterface, controller: BaseController,
                       ep_wrapper_kwargs: Mapping, seed=1, **kwargs):
     """
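
Usage sketch for the new registrations above; this is a minimal example and not part of the patch. It assumes that importing the top-level alr_envs package executes the register() calls in alr_envs/alr/__init__.py, and that each env id embeds the plain Python formatting of its ctrl_cost_weight value (integers from np.arange(101) give e.g. ALRReacher7-v0, floats give e.g. ALRReacherSparse0.005-v0). The MP variants (ALRReacher...ProMP-v0, BeerpongProMP-v2) are registered through the make_*_env_helper entry points and may need the package's own make helpers instead of a bare gym.make.

import gym

import alr_envs  # noqa: F401  # assumed to trigger the register() calls above on import

# Sparse reacher with one of the swept control-cost weights (id format from f'ALRReacherSparse{i}-v0').
env = gym.make("alr_envs:ALRReacherSparse0.005-v0")
obs = env.reset()
for _ in range(200):
    # Random actions, just to exercise the ctrl_cost_weight term in the reward.
    obs, reward, done, info = env.step(env.action_space.sample())
    if done:
        obs = env.reset()

# Beerpong with episodic reward and a fixed release step (release_step = 62, frame_skip = 2).
bp = gym.make("alr_envs:ALRBeerPong-v4")
bp.reset()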