diff --git a/alr_envs/alr/__init__.py b/alr_envs/alr/__init__.py
index 1101020..9dc31ae 100644
--- a/alr_envs/alr/__init__.py
+++ b/alr_envs/alr/__init__.py
@@ -252,7 +252,8 @@ register(
     max_episode_steps=MAX_EPISODE_STEPS_HOPPERJUMP,
     kwargs={
         "max_episode_steps": MAX_EPISODE_STEPS_HOPPERJUMP,
-        "context": False
+        "context": False,
+        "healthy_reward": 1.0
     }
 )
 register(
@@ -273,6 +274,17 @@ register(
         "max_episode_steps": MAX_EPISODE_STEPS_HOPPERJUMP
     }
 )
+
+register(
+    id='ALRHopperJump-v3',
+    entry_point='alr_envs.alr.mujoco:ALRHopperXYJumpEnv',
+    max_episode_steps=MAX_EPISODE_STEPS_HOPPERJUMP,
+    kwargs={
+        "max_episode_steps": MAX_EPISODE_STEPS_HOPPERJUMP,
+        "context": True,
+        "healthy_reward": 1.0
+    }
+)
 # CtxtFree are v0, Contextual are v1
 register(
     id='ALRHopperJumpOnBox-v0',
@@ -723,6 +735,46 @@ for _v in _versions:
     )
     ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append(_env_id)
 
+## AntJump
+_versions = ["v0", "v1"]
+for _v in _versions:
+    _env_id = f'ALRAntJumpProMP-{_v}'
+    register(
+        id= _env_id,
+        entry_point='alr_envs.utils.make_env_helpers:make_mp_env_helper',
+        kwargs={
+            "name": f"alr_envs:ALRAntJump-{_v}",
+            "wrappers": [mujoco.ant_jump.NewMPWrapper],
+            "ep_wrapper_kwargs": {
+                "weight_scale": 1
+            },
+            "movement_primitives_kwargs": {
+                'movement_primitives_type': 'promp',
+                'action_dim': 8
+            },
+            "phase_generator_kwargs": {
+                'phase_generator_type': 'linear',
+                'delay': 0,
+                'tau': 10, # initial value
+                'learn_tau': False,
+                'learn_delay': False
+            },
+            "controller_kwargs": {
+                'controller_type': 'motor',
+                "p_gains": np.ones(8),
+                "d_gains": 0.1*np.ones(8),
+            },
+            "basis_generator_kwargs": {
+                'basis_generator_type': 'zero_rbf',
+                'num_basis': 5,
+                'num_basis_zero_start': 2
+            }
+        }
+    )
+    ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append(_env_id)
+
+
+
 ## HalfCheetahJump
 _versions = ["v0", "v1"]
 for _v in _versions:
@@ -877,6 +929,42 @@ register(
 )
 ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append("ALRHopperJumpProMP-v2")
 
+
+## HopperJump
+register(
+    id= "ALRHopperJumpProMP-v3",
+    entry_point='alr_envs.utils.make_env_helpers:make_mp_env_helper',
+    kwargs={
+        "name": f"alr_envs:ALRHopperJump-v3",
+        "wrappers": [mujoco.hopper_jump.NewMPWrapper],
+        "ep_wrapper_kwargs": {
+            "weight_scale": 1
+        },
+        "movement_primitives_kwargs": {
+            'movement_primitives_type': 'promp',
+            'action_dim': 3
+        },
+        "phase_generator_kwargs": {
+            'phase_generator_type': 'linear',
+            'delay': 0,
+            'tau': 2, # initial value
+            'learn_tau': False,
+            'learn_delay': False
+        },
+        "controller_kwargs": {
+            'controller_type': 'motor',
+            "p_gains": np.ones(3),
+            "d_gains": 0.1*np.ones(3),
+        },
+        "basis_generator_kwargs": {
+            'basis_generator_type': 'zero_rbf',
+            'num_basis': 5,
+            'num_basis_zero_start': 2
+        }
+    }
+)
+ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append("ALRHopperJumpProMP-v3")
+
 ## HopperJumpOnBox
 _versions = ["v0", "v1"]
 for _v in _versions:
diff --git a/alr_envs/alr/mujoco/__init__.py b/alr_envs/alr/mujoco/__init__.py
index fb86186..5c04b03 100644
--- a/alr_envs/alr/mujoco/__init__.py
+++ b/alr_envs/alr/mujoco/__init__.py
@@ -5,7 +5,7 @@ from .table_tennis.tt_gym import TTEnvGym
 from .beerpong.beerpong import ALRBeerBongEnv
 from .ant_jump.ant_jump import ALRAntJumpEnv
 from .half_cheetah_jump.half_cheetah_jump import ALRHalfCheetahJumpEnv
-from .hopper_jump.hopper_jump import ALRHopperJumpEnv, ALRHopperJumpRndmPosEnv
+from .hopper_jump.hopper_jump import ALRHopperJumpEnv, ALRHopperJumpRndmPosEnv, ALRHopperXYJumpEnv
 from .hopper_jump.hopper_jump_on_box import ALRHopperJumpOnBoxEnv
 from
.hopper_throw.hopper_throw import ALRHopperThrowEnv from .hopper_throw.hopper_throw_in_basket import ALRHopperThrowInBasketEnv diff --git a/alr_envs/alr/mujoco/ant_jump/__init__.py b/alr_envs/alr/mujoco/ant_jump/__init__.py index c5e6d2f..5d15867 100644 --- a/alr_envs/alr/mujoco/ant_jump/__init__.py +++ b/alr_envs/alr/mujoco/ant_jump/__init__.py @@ -1 +1,2 @@ from .mp_wrapper import MPWrapper +from .new_mp_wrapper import NewMPWrapper diff --git a/alr_envs/alr/mujoco/ant_jump/new_mp_wrapper.py b/alr_envs/alr/mujoco/ant_jump/new_mp_wrapper.py new file mode 100644 index 0000000..c33048f --- /dev/null +++ b/alr_envs/alr/mujoco/ant_jump/new_mp_wrapper.py @@ -0,0 +1,21 @@ +from alr_envs.mp.episodic_wrapper import EpisodicWrapper +from typing import Union, Tuple +import numpy as np + + + +class NewMPWrapper(EpisodicWrapper): + + def set_active_obs(self): + return np.hstack([ + [False] * 111, # ant has 111 dimensional observation space !! + [True] # goal height + ]) + + @property + def current_pos(self) -> Union[float, int, np.ndarray]: + return self.env.sim.data.qpos[7:15].copy() + + @property + def current_vel(self) -> Union[float, int, np.ndarray, Tuple]: + return self.env.sim.data.qvel[6:14].copy() diff --git a/alr_envs/alr/mujoco/beerpong/assets/beerpong_wo_cup_big_table.xml b/alr_envs/alr/mujoco/beerpong/assets/beerpong_wo_cup_big_table.xml index 436b36c..78e2c32 100644 --- a/alr_envs/alr/mujoco/beerpong/assets/beerpong_wo_cup_big_table.xml +++ b/alr_envs/alr/mujoco/beerpong/assets/beerpong_wo_cup_big_table.xml @@ -144,7 +144,7 @@ solref="-10000 -100"/> - + diff --git a/alr_envs/alr/mujoco/beerpong/beerpong.py b/alr_envs/alr/mujoco/beerpong/beerpong.py index 36998b9..017bfcd 100644 --- a/alr_envs/alr/mujoco/beerpong/beerpong.py +++ b/alr_envs/alr/mujoco/beerpong/beerpong.py @@ -7,8 +7,12 @@ from gym.envs.mujoco import MujocoEnv from alr_envs.alr.mujoco.beerpong.beerpong_reward_staged import BeerPongReward -CUP_POS_MIN = np.array([-0.32, -2.2]) -CUP_POS_MAX = np.array([0.32, -1.2]) +CUP_POS_MIN = np.array([-1.42, -4.05]) +CUP_POS_MAX = np.array([1.42, -1.25]) + + +# CUP_POS_MIN = np.array([-0.32, -2.2]) +# CUP_POS_MAX = np.array([0.32, -1.2]) # smaller context space -> Easier task # CUP_POS_MIN = np.array([-0.16, -2.2]) @@ -24,8 +28,10 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle): self.cup_goal_pos = np.array(cup_goal_pos) self._steps = 0 + # self.xml_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", + # "beerpong_wo_cup" + ".xml") self.xml_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", - "beerpong_wo_cup" + ".xml") + "beerpong_wo_cup_big_table" + ".xml") self.j_min = np.array([-2.6, -1.985, -2.8, -0.9, -4.55, -1.5707, -2.7]) self.j_max = np.array([2.6, 1.985, 2.8, 3.14159, 1.25, 1.5707, 2.7]) @@ -100,10 +106,6 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle): return self._get_obs() def step(self, a): - # if a.shape[0] == 8: # we learn also when to release the ball - # self._release_step = a[-1] - # self._release_step = np.clip(self._release_step, 50, 250) - # self.release_step = 0.5/self.dt reward_dist = 0.0 angular_vel = 0.0 applied_action = a @@ -170,16 +172,11 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle): [self._steps], ]) - # TODO - @property - def active_obs(self): - return np.hstack([ - [False] * 7, # cos - [False] * 7, # sin - [True] * 2, # xy position of cup - [False] # env steps - ]) +class ALRBeerPongStepEnv(ALRBeerBongEnv): + def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, + rndm_goal=False, 
cup_goal_pos=None): + super(ALRBeerPongStepEnv, self).__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos) if __name__ == "__main__": env = ALRBeerBongEnv(rndm_goal=True) diff --git a/alr_envs/alr/mujoco/beerpong/beerpong_reward_staged.py b/alr_envs/alr/mujoco/beerpong/beerpong_reward_staged.py index bb5dd3f..cb75127 100644 --- a/alr_envs/alr/mujoco/beerpong/beerpong_reward_staged.py +++ b/alr_envs/alr/mujoco/beerpong/beerpong_reward_staged.py @@ -40,6 +40,7 @@ class BeerPongReward: self.cup_z_axes = None self.collision_penalty = 500 self.reset(None) + self.is_initialized = False def reset(self, noisy): self.ball_traj = [] @@ -55,113 +56,60 @@ class BeerPongReward: self.ball_wall_contact = False self.ball_cup_contact = False self.ball_in_cup = False + self.dist_ground_cup = -1 # distance floor to cup if first floor contact self.noisy_bp = noisy self._t_min_final_dist = -1 def compute_reward(self, env, action): - self.ball_id = env.sim.model._body_name2id["ball"] - self.ball_collision_id = env.sim.model._geom_name2id["ball_geom"] - self.goal_id = env.sim.model._site_name2id["cup_goal_table"] - self.goal_final_id = env.sim.model._site_name2id["cup_goal_final_table"] - self.cup_collision_ids = [env.sim.model._geom_name2id[name] for name in self.cup_collision_objects] - self.cup_table_id = env.sim.model._body_name2id["cup_table"] - self.table_collision_id = env.sim.model._geom_name2id["table_contact_geom"] - self.wall_collision_id = env.sim.model._geom_name2id["wall"] - self.cup_table_collision_id = env.sim.model._geom_name2id["cup_base_table_contact"] - self.init_ball_pos_site_id = env.sim.model._site_name2id["init_ball_pos_site"] - self.ground_collision_id = env.sim.model._geom_name2id["ground"] - self.robot_collision_ids = [env.sim.model._geom_name2id[name] for name in self.robot_collision_objects] + + if not self.is_initialized: + self.is_initialized = True + self.ball_id = env.sim.model._body_name2id["ball"] + self.ball_collision_id = env.sim.model._geom_name2id["ball_geom"] + self.goal_id = env.sim.model._site_name2id["cup_goal_table"] + self.goal_final_id = env.sim.model._site_name2id["cup_goal_final_table"] + self.cup_collision_ids = [env.sim.model._geom_name2id[name] for name in self.cup_collision_objects] + self.cup_table_id = env.sim.model._body_name2id["cup_table"] + self.table_collision_id = env.sim.model._geom_name2id["table_contact_geom"] + self.wall_collision_id = env.sim.model._geom_name2id["wall"] + self.cup_table_collision_id = env.sim.model._geom_name2id["cup_base_table_contact"] + self.init_ball_pos_site_id = env.sim.model._site_name2id["init_ball_pos_site"] + self.ground_collision_id = env.sim.model._geom_name2id["ground"] + self.robot_collision_ids = [env.sim.model._geom_name2id[name] for name in self.robot_collision_objects] goal_pos = env.sim.data.site_xpos[self.goal_id] ball_pos = env.sim.data.body_xpos[self.ball_id] ball_vel = env.sim.data.body_xvelp[self.ball_id] goal_final_pos = env.sim.data.site_xpos[self.goal_final_id] + + self._check_contacts(env.sim) self.dists.append(np.linalg.norm(goal_pos - ball_pos)) self.dists_final.append(np.linalg.norm(goal_final_pos - ball_pos)) - + self.dist_ground_cup = np.linalg.norm(ball_pos-goal_pos) \ + if self.ball_ground_contact_first and self.dist_ground_cup == -1 else self.dist_ground_cup action_cost = np.sum(np.square(action)) self.action_costs.append(action_cost) - # ##################### Reward function which forces to bounce once on the table (tanh) ######################## - # if not 
self.ball_table_contact: - # self.ball_table_contact = self._check_collision_single_objects(env.sim, self.ball_collision_id, - # self.table_collision_id) - # - # self._is_collided = self._check_collision_with_itself(env.sim, self.robot_collision_ids) - # if env._steps == env.ep_length - 1 or self._is_collided: - # min_dist = np.min(self.dists) - # final_dist = self.dists_final[-1] - # - # ball_in_cup = self._check_collision_single_objects(env.sim, self.ball_collision_id, - # self.cup_table_collision_id) - # - # # encourage bounce before falling into cup - # if not ball_in_cup: - # if not self.ball_table_contact: - # reward = 0.2 * (1 - np.tanh(0.5*min_dist)) + 0.1 * (1 - np.tanh(0.5*final_dist)) - # else: - # reward = (1 - np.tanh(0.5*min_dist)) + 0.5 * (1 - np.tanh(0.5*final_dist)) - # else: - # if not self.ball_table_contact: - # reward = 2 * (1 - np.tanh(0.5*final_dist)) + 1 * (1 - np.tanh(0.5*min_dist)) + 1 - # else: - # reward = 2 * (1 - np.tanh(0.5*final_dist)) + 1 * (1 - np.tanh(0.5*min_dist)) + 3 - # - # # reward = - 1 * cost - self.collision_penalty * int(self._is_collided) - # success = ball_in_cup - # crash = self._is_collided - # else: - # reward = - 1e-2 * action_cost - # success = False - # crash = False - # ################################################################################################################ - - ##################### Reward function which does not force to bounce once on the table (tanh) ################ - # self._check_contacts(env.sim) - # self._is_collided = self._check_collision_with_itself(env.sim, self.robot_collision_ids) - # if env._steps == env.ep_length - 1 or self._is_collided: - # min_dist = np.min(self.dists) - # final_dist = self.dists_final[-1] - # - # # encourage bounce before falling into cup - # if not self.ball_in_cup: - # if not self.ball_table_contact and not self.ball_cup_contact and not self.ball_wall_contact: - # min_dist_coeff, final_dist_coeff, rew_offset = 0.2, 0.1, 0 - # # reward = 0.2 * (1 - np.tanh(0.5*min_dist)) + 0.1 * (1 - np.tanh(0.5*final_dist)) - # else: - # min_dist_coeff, final_dist_coeff, rew_offset = 1, 0.5, 0 - # # reward = (1 - np.tanh(0.5*min_dist)) + 0.5 * (1 - np.tanh(0.5*final_dist)) - # else: - # min_dist_coeff, final_dist_coeff, rew_offset = 1, 2, 3 - # # reward = 2 * (1 - np.tanh(0.5*final_dist)) + 1 * (1 - np.tanh(0.5*min_dist)) + 3 - # - # reward = final_dist_coeff * (1 - np.tanh(0.5 * final_dist)) + min_dist_coeff * (1 - np.tanh(0.5 * min_dist)) \ - # + rew_offset - # success = self.ball_in_cup - # crash = self._is_collided - # else: - # reward = - 1e-2 * action_cost - # success = False - # crash = False - ################################################################################################################ - # # ##################### Reward function which does not force to bounce once on the table (quad dist) ############ - self._check_contacts(env.sim) + self._is_collided = self._check_collision_with_itself(env.sim, self.robot_collision_ids) + if env._steps == env.ep_length - 1 or self._is_collided: min_dist = np.min(self.dists) final_dist = self.dists_final[-1] - # if self.ball_ground_contact_first: - # min_dist_coeff, final_dist_coeff, rew_offset = 1, 0.5, -6 - # else: - if not self.ball_in_cup: - if not self.ball_table_contact and not self.ball_cup_contact and not self.ball_wall_contact: - min_dist_coeff, final_dist_coeff, rew_offset = 1, 0.5, -4 - else: - min_dist_coeff, final_dist_coeff, rew_offset = 1, 0.5, -2 + if self.ball_ground_contact_first: + min_dist_coeff, 
final_dist_coeff, ground_contact_dist_coeff, rew_offset = 1, 0.5, 2, -4 # relative rew offset when first bounding on ground + # min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 1, 0.5, 0, -6 # absolute rew offset when first bouncing on ground else: - min_dist_coeff, final_dist_coeff, rew_offset = 0, 1, 0 + if not self.ball_in_cup: + if not self.ball_table_contact and not self.ball_cup_contact and not self.ball_wall_contact: + min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 1, 0.5, 0, -4 + else: + min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 1, 0.5, 0, -2 + else: + min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 0, 1, 0 ,0 + # dist_ground_cup = 1 * self.dist_ground_cup reward = rew_offset - min_dist_coeff * min_dist ** 2 - final_dist_coeff * final_dist ** 2 - \ - 1e-4 * np.mean(action_cost) + 1e-4 * np.mean(action_cost) - ground_contact_dist_coeff*self.dist_ground_cup ** 2 # 1e-7*np.mean(action_cost) # release step punishment min_time_bound = 0.1 @@ -172,43 +120,13 @@ class BeerPongReward: reward += release_time_rew success = self.ball_in_cup # print('release time :', release_time) + # print('dist_ground_cup :', dist_ground_cup) else: reward = - 1e-2 * action_cost # reward = - 1e-4 * action_cost # reward = 0 success = False # ################################################################################################################ - - # # # ##################### Reward function which does not force to bounce once on the table (quad dist) ############ - # self._check_contacts(env.sim) - # self._is_collided = self._check_collision_with_itself(env.sim, self.robot_collision_ids) - # if env._steps == env.ep_length - 1 or self._is_collided: - # min_dist = np.min(self.dists) - # final_dist = self.dists_final[-1] - # - # if not self.ball_in_cup: - # if not self.ball_table_contact and not self.ball_cup_contact and not self.ball_wall_contact: - # min_dist_coeff, final_dist_coeff, rew_offset = 1, 0.5, -6 - # else: - # if self.ball_ground_contact_first: - # min_dist_coeff, final_dist_coeff, rew_offset = 1, 0.5, -4 - # else: - # min_dist_coeff, final_dist_coeff, rew_offset = 1, 0.5, -2 - # else: - # if self.ball_ground_contact_first: - # min_dist_coeff, final_dist_coeff, rew_offset = 0, 1, -1 - # else: - # min_dist_coeff, final_dist_coeff, rew_offset = 0, 1, 0 - # reward = rew_offset - min_dist_coeff * min_dist ** 2 - final_dist_coeff * final_dist ** 2 - \ - # 1e-7 * np.mean(action_cost) - # # 1e-4*np.mean(action_cost) - # success = self.ball_in_cup - # else: - # # reward = - 1e-2 * action_cost - # # reward = - 1e-4 * action_cost - # reward = 0 - # success = False - # ################################################################################################################ infos = {} infos["success"] = success infos["is_collided"] = self._is_collided diff --git a/alr_envs/alr/mujoco/beerpong/new_mp_wrapper.py b/alr_envs/alr/mujoco/beerpong/new_mp_wrapper.py index 7c4ee9d..9683687 100644 --- a/alr_envs/alr/mujoco/beerpong/new_mp_wrapper.py +++ b/alr_envs/alr/mujoco/beerpong/new_mp_wrapper.py @@ -44,6 +44,12 @@ class NewMPWrapper(EpisodicWrapper): else: return action, None + def set_context(self, context): + xyz = np.zeros(3) + xyz[:2] = context + xyz[-1] = 0.840 + self.env.env.model.body_pos[self.env.env.cup_table_id] = xyz + return self.get_observation_from_step(self.env.env._get_obs()) # def set_action_space(self): # if self.mp.learn_tau: # min_action_bounds, max_action_bounds = 
self.mp.get_param_bounds() diff --git a/alr_envs/alr/mujoco/hopper_jump/assets/hopper.xml b/alr_envs/alr/mujoco/hopper_jump/assets/hopper_jump.xml similarity index 92% rename from alr_envs/alr/mujoco/hopper_jump/assets/hopper.xml rename to alr_envs/alr/mujoco/hopper_jump/assets/hopper_jump.xml index f18bc46..3348bab 100644 --- a/alr_envs/alr/mujoco/hopper_jump/assets/hopper.xml +++ b/alr_envs/alr/mujoco/hopper_jump/assets/hopper_jump.xml @@ -25,12 +25,16 @@ + + + + diff --git a/alr_envs/alr/mujoco/hopper_jump/hopper_jump.py b/alr_envs/alr/mujoco/hopper_jump/hopper_jump.py index 9a74bb0..979e224 100644 --- a/alr_envs/alr/mujoco/hopper_jump/hopper_jump.py +++ b/alr_envs/alr/mujoco/hopper_jump/hopper_jump.py @@ -1,5 +1,6 @@ from gym.envs.mujoco.hopper_v3 import HopperEnv import numpy as np +import os MAX_EPISODE_STEPS_HOPPERJUMP = 250 @@ -14,13 +15,13 @@ class ALRHopperJumpEnv(HopperEnv): """ def __init__(self, - xml_file='hopper.xml', + xml_file='hopper_jump.xml', forward_reward_weight=1.0, ctrl_cost_weight=1e-3, healthy_reward=0.0, penalty=0.0, context=True, - terminate_when_unhealthy=True, + terminate_when_unhealthy=False, healthy_state_range=(-100.0, 100.0), healthy_z_range=(0.5, float('inf')), healthy_angle_range=(-float('inf'), float('inf')), @@ -34,6 +35,13 @@ class ALRHopperJumpEnv(HopperEnv): self.goal = 0 self.context = context self.exclude_current_positions_from_observation = exclude_current_positions_from_observation + self._floor_geom_id = None + self._foot_geom_id = None + self.contact_with_floor = False + self.init_floor_contact = False + self.has_left_floor = False + self.contact_dist = None + xml_file = os.path.join(os.path.dirname(__file__), "assets", xml_file) super().__init__(xml_file, forward_reward_weight, ctrl_cost_weight, healthy_reward, terminate_when_unhealthy, healthy_state_range, healthy_z_range, healthy_angle_range, reset_noise_scale, exclude_current_positions_from_observation) @@ -43,28 +51,36 @@ class ALRHopperJumpEnv(HopperEnv): self.current_step += 1 self.do_simulation(action, self.frame_skip) height_after = self.get_body_com("torso")[2] + site_pos_after = self.sim.data.site_xpos[self.model.site_name2id('foot_site')].copy() self.max_height = max(height_after, self.max_height) ctrl_cost = self.control_cost(action) costs = ctrl_cost done = False - + rewards = 0 if self.current_step >= self.max_episode_steps: hight_goal_distance = -10*np.linalg.norm(self.max_height - self.goal) if self.context else self.max_height - healthy_reward = 0 if self.context else self.healthy_reward * self.current_step + healthy_reward = 0 if self.context else self.healthy_reward * 2 # self.current_step height_reward = self._forward_reward_weight * hight_goal_distance # maybe move reward calculation into if structure and define two different _forward_reward_weight variables for context and episodic seperatley rewards = height_reward + healthy_reward - else: - # penalty for wrong start direction of first two joints; not needed, could be removed - rewards = ((action[:2] > 0) * self.penalty).sum() if self.current_step < 10 else 0 + # else: + # # penalty for wrong start direction of first two joints; not needed, could be removed + # rewards = ((action[:2] > 0) * self.penalty).sum() if self.current_step < 10 else 0 observation = self._get_obs() reward = rewards - costs + # info = { + # 'height' : height_after, + # 'max_height': self.max_height, + # 'goal' : self.goal + # } info = { - 'height' : height_after, + 'height': height_after, + 'x_pos': site_pos_after, 'max_height': self.max_height, - 
'goal' : self.goal + 'height_rew': self.max_height, + 'healthy_reward': self.healthy_reward * 2 } return observation, reward, done, info @@ -73,7 +89,7 @@ class ALRHopperJumpEnv(HopperEnv): return np.append(super()._get_obs(), self.goal) def reset(self): - self.goal = np.random.uniform(1.4, 2.3, 1) # 1.3 2.3 + self.goal = np.random.uniform(1.4, 2.16, 1) # 1.3 2.3 self.max_height = 0 self.current_step = 0 return super().reset() @@ -87,14 +103,124 @@ class ALRHopperJumpEnv(HopperEnv): self.set_state(qpos, qvel) observation = self._get_obs() + self.has_left_floor = False + self.contact_with_floor = False + self.init_floor_contact = False + self.contact_dist = None return observation + def _contact_checker(self, id_1, id_2): + for coni in range(0, self.sim.data.ncon): + con = self.sim.data.contact[coni] + collision = con.geom1 == id_1 and con.geom2 == id_2 + collision_trans = con.geom1 == id_2 and con.geom2 == id_1 + if collision or collision_trans: + return True + return False + + +class ALRHopperXYJumpEnv(ALRHopperJumpEnv): + # The goal here is the desired x-Position of the Torso + + def step(self, action): + + self._floor_geom_id = self.model.geom_name2id('floor') + self._foot_geom_id = self.model.geom_name2id('foot_geom') + + self.current_step += 1 + self.do_simulation(action, self.frame_skip) + height_after = self.get_body_com("torso")[2] + site_pos_after = self.sim.data.site_xpos[self.model.site_name2id('foot_site')].copy() + self.max_height = max(height_after, self.max_height) + + # floor_contact = self._contact_checker(self._floor_geom_id, self._foot_geom_id) if not self.contact_with_floor else False + # self.init_floor_contact = floor_contact if not self.init_floor_contact else self.init_floor_contact + # self.has_left_floor = not floor_contact if self.init_floor_contact and not self.has_left_floor else self.has_left_floor + # self.contact_with_floor = floor_contact if not self.contact_with_floor and self.has_left_floor else self.contact_with_floor + + floor_contact = self._contact_checker(self._floor_geom_id, + self._foot_geom_id) if not self.contact_with_floor else False + if not self.init_floor_contact: + self.init_floor_contact = floor_contact + if self.init_floor_contact and not self.has_left_floor: + self.has_left_floor = not floor_contact + if not self.contact_with_floor and self.has_left_floor: + self.contact_with_floor = floor_contact + + if self.contact_dist is None and self.contact_with_floor: + self.contact_dist = np.linalg.norm(self.sim.data.site_xpos[self.model.site_name2id('foot_site')] + - np.array([self.goal, 0, 0], dtype=object))[0] + + ctrl_cost = self.control_cost(action) + costs = ctrl_cost + done = False + goal_dist = 0 + rewards = 0 + if self.current_step >= self.max_episode_steps: + # healthy_reward = 0 if self.context else self.healthy_reward * self.current_step + healthy_reward = self.healthy_reward * 2 #* self.current_step + goal_dist = np.linalg.norm(site_pos_after - np.array([self.goal, 0, 0], dtype=object))[0] + contact_dist = self.contact_dist if self.contact_dist is not None else 5 + dist_reward = self._forward_reward_weight * (-3*goal_dist + 10*self.max_height - 2*contact_dist) + rewards = dist_reward + healthy_reward + + # else: + ## penalty for wrong start direction of first two joints; not needed, could be removed + # rewards = ((action[:2] > 0) * self.penalty).sum() if self.current_step < 10 else 0 + + observation = self._get_obs() + reward = rewards - costs + info = { + 'height': height_after, + 'x_pos': site_pos_after, + 'max_height': 
self.max_height, + 'goal': self.goal, + 'dist_rew': goal_dist, + 'height_rew': self.max_height, + 'healthy_reward': self.healthy_reward * 2 + } + + return observation, reward, done, info + + def reset_model(self): + self.init_qpos[1] = 1.5 + self._floor_geom_id = self.model.geom_name2id('floor') + self._foot_geom_id = self.model.geom_name2id('foot_geom') + noise_low = -np.zeros(self.model.nq) + noise_low[3] = -0.5 + noise_low[4] = -0.2 + noise_low[5] = 0 + + noise_high = np.zeros(self.model.nq) + noise_high[3] = 0 + noise_high[4] = 0 + noise_high[5] = 0.785 + + rnd_vec = self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nq) + qpos = self.init_qpos + rnd_vec + qvel = self.init_qvel + + self.set_state(qpos, qvel) + + observation = self._get_obs() + self.has_left_floor = False + self.contact_with_floor = False + self.init_floor_contact = False + self.contact_dist = None + + return observation + + def reset(self): + super().reset() + # self.goal = np.random.uniform(-1.5, 1.5, 1) + self.goal = np.random.uniform(0, 1.5, 1) + # self.goal = np.array([1.5]) + self.sim.model.body_pos[self.sim.model.body_name2id('goal_site_body')] = np.array([self.goal, 0, 0], dtype=object) + return self.reset_model() + class ALRHopperJumpRndmPosEnv(ALRHopperJumpEnv): def __init__(self, max_episode_steps=250): - self.contact_with_floor = False - self._floor_geom_id = None - self._foot_geom_id = None super(ALRHopperJumpRndmPosEnv, self).__init__(exclude_current_positions_from_observation=False, reset_noise_scale=5e-1, max_episode_steps=max_episode_steps) @@ -104,17 +230,17 @@ class ALRHopperJumpRndmPosEnv(ALRHopperJumpEnv): self._foot_geom_id = self.model.geom_name2id('foot_geom') noise_low = -np.ones(self.model.nq)*self._reset_noise_scale noise_low[1] = 0 - noise_low[2] = -0.3 - noise_low[3] = -0.1 - noise_low[4] = -1.1 - noise_low[5] = -0.785 + noise_low[2] = 0 + noise_low[3] = -0.2 + noise_low[4] = -0.2 + noise_low[5] = -0.1 noise_high = np.ones(self.model.nq)*self._reset_noise_scale noise_high[1] = 0 - noise_high[2] = 0.3 + noise_high[2] = 0 noise_high[3] = 0 noise_high[4] = 0 - noise_high[5] = 0.785 + noise_high[5] = 0.1 rnd_vec = self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nq) # rnd_vec[2] *= 0.05 # the angle around the y axis shouldn't be too high as the agent then falls down quickly and @@ -162,24 +288,18 @@ class ALRHopperJumpRndmPosEnv(ALRHopperJumpEnv): return observation, reward, done, info - def _contact_checker(self, id_1, id_2): - for coni in range(0, self.sim.data.ncon): - con = self.sim.data.contact[coni] - collision = con.geom1 == id_1 and con.geom2 == id_2 - collision_trans = con.geom1 == id_2 and con.geom2 == id_1 - if collision or collision_trans: - return True - return False + if __name__ == '__main__': render_mode = "human" # "human" or "partial" or "final" # env = ALRHopperJumpEnv() - env = ALRHopperJumpRndmPosEnv() + env = ALRHopperXYJumpEnv() + # env = ALRHopperJumpRndmPosEnv() obs = env.reset() for k in range(10): obs = env.reset() - print('observation :', obs[:6]) + print('observation :', obs[:]) for i in range(200): # objective.load_result("/tmp/cma") # test with random actions diff --git a/alr_envs/alr/mujoco/hopper_jump/new_mp_wrapper.py b/alr_envs/alr/mujoco/hopper_jump/new_mp_wrapper.py index 9932403..31be6be 100644 --- a/alr_envs/alr/mujoco/hopper_jump/new_mp_wrapper.py +++ b/alr_envs/alr/mujoco/hopper_jump/new_mp_wrapper.py @@ -12,9 +12,19 @@ class NewMPWrapper(EpisodicWrapper): def current_vel(self) -> Union[float, int, np.ndarray, 
Tuple]:
         return self.env.sim.data.qvel[3:6].copy()
 
+    # # random goal
+    # def set_active_obs(self):
+    #     return np.hstack([
+    #         [False] * (5 + int(not self.env.exclude_current_positions_from_observation)), # position
+    #         [False] * 6, # velocity
+    #         [True]
+    #     ])
+
+    # Random x goal + random init pos
     def set_active_obs(self):
         return np.hstack([
-            [False] * (5 + int(not self.env.exclude_current_positions_from_observation)), # position
+            [False] * (2 + int(not self.env.exclude_current_positions_from_observation)), # position
+            [True] * 3, # set to true if randomize initial pos
             [False] * 6, # velocity
             [True]
         ])
diff --git a/alr_envs/mp/episodic_wrapper.py b/alr_envs/mp/episodic_wrapper.py
index 7426c15..21fb035 100644
--- a/alr_envs/mp/episodic_wrapper.py
+++ b/alr_envs/mp/episodic_wrapper.py
@@ -205,7 +205,7 @@ class EpisodicWrapper(gym.Env, ABC):
                 infos['step_actions'] = actions[:t + 1]
                 infos['step_observations'] = observations[:t + 1]
                 infos['step_rewards'] = rewards[:t + 1]
-                infos['trajectory_length'] = t + 1
+                infos['trajectory_length'] = t + 1
                 done = True
         return self.get_observation_from_step(obs), trajectory_return, done, infos
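
# ------------------------------------------------------------------------------------------------
# Usage sketch (not part of the patch): how the environments registered above could be created.
# This assumes `alr_envs` is importable so the register() calls run on import; the ids and the
# episodic-wrapper behaviour (one step() consumes a full parameter vector and returns the episode
# return) are taken from this diff, everything else is illustrative.
import gym
import alr_envs  # noqa: F401  # importing triggers the register() calls above

# step-based contextual x-goal jump task
env = gym.make('alr_envs:ALRHopperJump-v3')
obs = env.reset()
for _ in range(200):
    obs, reward, done, info = env.step(env.action_space.sample())
    if done:
        break

# episodic ProMP variant: the action is a ProMP weight vector, step() rolls out the whole episode
mp_env = gym.make('alr_envs:ALRHopperJumpProMP-v3')
mp_env.reset()
obs, episode_return, done, infos = mp_env.step(mp_env.action_space.sample())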
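
# ------------------------------------------------------------------------------------------------
# Note on the AntJump NewMPWrapper above: current_pos / current_vel slice the 8 actuated leg
# joints out of MuJoCo's generalized coordinates. Minimal sketch of that indexing, assuming the
# standard Ant model (free torso joint plus 8 hinge joints); the arrays here are stand-ins.
import numpy as np

FREE_JOINT_NQ, FREE_JOINT_NV = 7, 6  # torso pose: 3 position + 4 quaternion / 3 linear + 3 angular
qpos = np.zeros(15)                  # ant: 7 free-joint + 8 hinge positions
qvel = np.zeros(14)                  # ant: 6 free-joint + 8 hinge velocities
joint_pos = qpos[FREE_JOINT_NQ:FREE_JOINT_NQ + 8]  # == qpos[7:15], as in NewMPWrapper.current_pos
joint_vel = qvel[FREE_JOINT_NV:FREE_JOINT_NV + 8]  # == qvel[6:14], as in NewMPWrapper.current_vel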
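
# ------------------------------------------------------------------------------------------------
# Reference sketch of the episode-end branch of the updated BeerPongReward. Only the distance
# terms are restated here (the release-time bonus in the diff is omitted); the coefficient tuples
# and the new quadratic ground-contact penalty match the code above, helper names are local to
# this sketch.
def beerpong_final_reward(min_dist, final_dist, dist_ground_cup, mean_action_cost,
                          ball_in_cup, ball_ground_contact_first, any_other_contact):
    """any_other_contact: ball touched table, cup or wall at least once during the episode."""
    if ball_ground_contact_first:
        # first bounce on the ground: relative offset plus an extra penalty on the ball-to-cup
        # distance measured at that first ground contact
        min_c, final_c, ground_c, offset = 1, 0.5, 2, -4
    elif not ball_in_cup:
        min_c, final_c, ground_c, offset = (1, 0.5, 0, -2) if any_other_contact else (1, 0.5, 0, -4)
    else:
        min_c, final_c, ground_c, offset = 0, 1, 0, 0
    return (offset
            - min_c * min_dist ** 2
            - final_c * final_dist ** 2
            - 1e-4 * mean_action_cost
            - ground_c * dist_ground_cup ** 2)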
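
# ------------------------------------------------------------------------------------------------
# Reference sketch of the episode-end reward of ALRHopperXYJumpEnv, restated as a standalone
# function for readability. The coefficients (-3, 10, -2), the doubled healthy reward and the
# contact-distance fallback of 5 come from the diff; the function name and arguments are local
# to this sketch.
def xy_jump_final_reward(goal_dist, max_height, contact_dist, healthy_reward, ctrl_cost,
                         forward_reward_weight=1.0):
    """Reward returned on the last step of an episode.

    goal_dist    -- distance between the foot site and the goal x-position at episode end
    max_height   -- maximum torso height reached during the episode
    contact_dist -- foot-to-goal distance at the first touch-down after leaving the floor,
                    or None if the foot never lands again (penalized with the fallback 5)
    """
    contact_dist = 5 if contact_dist is None else contact_dist
    dist_reward = forward_reward_weight * (-3 * goal_dist + 10 * max_height - 2 * contact_dist)
    return dist_reward + 2 * healthy_reward - ctrl_cost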
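
# ------------------------------------------------------------------------------------------------
# Sketch of what the new hopper set_active_obs() mask above exposes to the movement-primitive
# conditioning. Assumes the standard Hopper observation layout (5 position entries when the
# x-position is excluded, then 6 velocities) plus the goal appended by _get_obs(); the stand-in
# observation below is purely illustrative.
import numpy as np

exclude_x = True  # exclude_current_positions_from_observation
mask = np.hstack([
    [False] * (2 + int(not exclude_x)),  # root z and torso angle (+ root x if it is not excluded)
    [True] * 3,                          # thigh, leg and foot joint angles (randomized at reset)
    [False] * 6,                         # joint velocities
    [True],                              # goal appended by _get_obs()
])

obs = np.arange(mask.size, dtype=float)  # stand-in observation of matching length
context = obs[mask]                      # what the episodic wrapper keeps as active observation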