diff --git a/alr_envs/alr/__init__.py b/alr_envs/alr/__init__.py
index 978e85c..bf2ae74 100644
--- a/alr_envs/alr/__init__.py
+++ b/alr_envs/alr/__init__.py
@@ -253,7 +253,7 @@ register(
     kwargs={
         "max_episode_steps": MAX_EPISODE_STEPS_HOPPERJUMP,
         "context": False,
-        "healthy_rewasrd": 1.0
+        "healthy_reward": 1.0
     }
 )
 register(
@@ -405,7 +405,7 @@ register(id='TableTennis2DCtxt-v1',
         kwargs={'ctxt_dim': 2, 'fixed_goal': True})

 register(id='TableTennis4DCtxt-v0',
-         entry_point='alr_envs.alr.mujocco:TTEnvGym',
+         entry_point='alr_envs.alr.mujoco:TTEnvGym',
         max_episode_steps=MAX_EPISODE_STEPS,
         kwargs={'ctxt_dim': 4})

@@ -915,7 +915,7 @@ for _v in _versions:
             "basis_generator_kwargs": {
                 'basis_generator_type': 'zero_rbf',
                 'num_basis': 5,
-                'num_basis_zero_start': 2
+                'num_basis_zero_start': 1
             }
         }
     )
diff --git a/alr_envs/alr/mujoco/beerpong/beerpong.py b/alr_envs/alr/mujoco/beerpong/beerpong.py
index 0678da6..c72220b 100644
--- a/alr_envs/alr/mujoco/beerpong/beerpong.py
+++ b/alr_envs/alr/mujoco/beerpong/beerpong.py
@@ -187,35 +187,65 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):


 class ALRBeerBongEnvStepBased(ALRBeerBongEnv):
+    def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None):
+        super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos)
+        self.release_step = 62  # empirically evaluated for frame_skip=2!

-    def _set_action_space(self):
-        bounds = super(ALRBeerBongEnvStepBased, self)._set_action_space()
-        min_bound = np.concatenate(([-1], bounds.low), dtype=bounds.dtype)
-        max_bound = np.concatenate(([1], bounds.high), dtype=bounds.dtype)
-        self.action_space = spaces.Box(low=min_bound, high=max_bound, dtype=bounds.dtype)
-        return self.action_space
+    # def _set_action_space(self):
+    #     bounds = super(ALRBeerBongEnvStepBased, self)._set_action_space()
+    #     min_bound = np.concatenate(([-1], bounds.low), dtype=bounds.dtype)
+    #     max_bound = np.concatenate(([1], bounds.high), dtype=bounds.dtype)
+    #     self.action_space = spaces.Box(low=min_bound, high=max_bound, dtype=bounds.dtype)
+    #     return self.action_space
+
+    # def step(self, a):
+    #     self.release_step = self._steps if a[0]>=0 and self.release_step >= self._steps else self.release_step
+    #     return super(ALRBeerBongEnvStepBased, self).step(a[1:])
+    #
+    # def reset(self):
+    #     ob = super(ALRBeerBongEnvStepBased, self).reset()
+    #     self.release_step = self.ep_length + 1
+    #     return ob

     def step(self, a):
-        self.release_step = self._steps if a[0]>=0 and self.release_step >= self._steps else self.release_step
-        return super(ALRBeerBongEnvStepBased, self).step(a[1:])
+        if self._steps < self.release_step:
+            return super(ALRBeerBongEnvStepBased, self).step(a)
+        else:
+            reward = 0
+            done = False
+            while not done:
+                sub_ob, sub_reward, done, sub_infos = super(ALRBeerBongEnvStepBased, self).step(np.zeros(a.shape))
+                if not done or sub_infos['sim_crash']:
+                    reward += sub_reward
+                else:
+                    ball_pos = self.sim.data.body_xpos[self.sim.model._body_name2id["ball"]].copy()
+                    cup_goal_dist_final = np.linalg.norm(ball_pos - self.sim.data.site_xpos[
+                        self.sim.model._site_name2id["cup_goal_final_table"]].copy())
+                    cup_goal_dist_top = np.linalg.norm(ball_pos - self.sim.data.site_xpos[
+                        self.sim.model._site_name2id["cup_goal_table"]].copy())
+                    if sub_infos['success']:
+                        dist_rew = -cup_goal_dist_final**2
+                    else:
+                        dist_rew = -0.5*cup_goal_dist_final**2 - cup_goal_dist_top**2
+                    reward = reward - sub_infos['action_cost'] + dist_rew
+            infos = sub_infos
+            ob = sub_ob
+            return ob, reward, done, infos
+
-    def reset(self):
-        ob = super(ALRBeerBongEnvStepBased, self).reset()
-        self.release_step = self.ep_length + 1
-        return ob

 if __name__ == "__main__":
     # env = ALRBeerBongEnv(rndm_goal=True)
-    env = ALRBeerBongEnvStepBased(rndm_goal=True)
+    env = ALRBeerBongEnvStepBased(frame_skip=2, rndm_goal=True)
     import time
     env.reset()
     env.render("human")
     for i in range(1500):
-        # ac = 10 * env.action_space.sample()[0:7]
-        ac = np.zeros(8)
-        ac[0] = -1
-        if env._steps > 150:
-            ac[0] = 1
+        ac = 10 * env.action_space.sample()
+        # ac = np.zeros(7)
+        # ac[0] = -1
+        # if env._steps > 150:
+        #     ac[0] = 1
         obs, rew, d, info = env.step(ac)
         env.render("human")
         print(env.dt)
diff --git a/alr_envs/alr/mujoco/beerpong/beerpong_reward_staged.py b/alr_envs/alr/mujoco/beerpong/beerpong_reward_staged.py
index 3ef8b7b..4629461 100644
--- a/alr_envs/alr/mujoco/beerpong/beerpong_reward_staged.py
+++ b/alr_envs/alr/mujoco/beerpong/beerpong_reward_staged.py
@@ -90,7 +90,7 @@ class BeerPongReward:
         self.dist_ground_cup = np.linalg.norm(ball_pos-goal_pos) \
             if self.ball_ground_contact_first and self.dist_ground_cup == -1 else self.dist_ground_cup
         action_cost = np.sum(np.square(action))
-        self.action_costs.append(action_cost)
+        self.action_costs.append(np.copy(action_cost))
         # # ##################### Reward function which does not force to bounce once on the table (quad dist) ############

         self._is_collided = self._check_collision_with_itself(env.sim, self.robot_collision_ids)
@@ -110,8 +110,9 @@ class BeerPongReward:
             else:
                 min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 0, 1, 0 ,0
                 # dist_ground_cup = 1 * self.dist_ground_cup
+            action_cost = 1e-4 * np.mean(action_cost)
             reward = rew_offset - min_dist_coeff * min_dist ** 2 - final_dist_coeff * final_dist ** 2 - \
-                     1e-4 * np.mean(action_cost) - ground_contact_dist_coeff*self.dist_ground_cup ** 2
+                     action_cost - ground_contact_dist_coeff*self.dist_ground_cup ** 2
             # 1e-7*np.mean(action_cost)
             # release step punishment
             min_time_bound = 0.1
@@ -124,7 +125,8 @@ class BeerPongReward:
             #     print('release time :', release_time)
             #     print('dist_ground_cup :', dist_ground_cup)
         else:
-            reward = - 1e-2 * action_cost
+            action_cost = 1e-2 * action_cost
+            reward = - action_cost
             # reward = - 1e-4 * action_cost
             # reward = 0
             success = False
diff --git a/alr_envs/alr/mujoco/hopper_jump/hopper_jump.py b/alr_envs/alr/mujoco/hopper_jump/hopper_jump.py
index 6bbbb5b..afedf4e 100644
--- a/alr_envs/alr/mujoco/hopper_jump/hopper_jump.py
+++ b/alr_envs/alr/mujoco/hopper_jump/hopper_jump.py
@@ -80,7 +80,8 @@ class ALRHopperJumpEnv(HopperEnv):
             'x_pos': site_pos_after,
             'max_height': self.max_height,
             'height_rew': self.max_height,
-            'healthy_reward': self.healthy_reward * 2
+            'healthy_reward': self.healthy_reward * 2,
+            'healthy': self.is_healthy
         }
         return observation, reward, done, info

@@ -171,7 +172,8 @@ class ALRHopperXYJumpEnv(ALRHopperJumpEnv):
             'goal': self.goal,
             'goal_dist': goal_dist,
             'height_rew': self.max_height,
-            'healthy_reward': self.healthy_reward * 2
+            'healthy_reward': self.healthy_reward * 2,
+            'healthy': self.is_healthy
         }
         return observation, reward, done, info

@@ -207,13 +209,38 @@ class ALRHopperXYJumpEnv(ALRHopperJumpEnv):
         super().reset()
         # self.goal = np.random.uniform(-1.5, 1.5, 1)
         # self.goal = np.random.uniform(0, 1.5, 1)
-        self.goal = self.np_random.uniform(0, 1.5, 1)
-        # self.goal = np.array([1.5])
+        # self.goal = self.np_random.uniform(0, 1.5, 1)
+        self.goal = self.np_random.uniform(0.3, 1.35, 1)
         self.sim.model.body_pos[self.sim.model.body_name2id('goal_site_body')] = np.array([self.goal, 0, 0], dtype=object)
         return self.reset_model()


 class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv):
+
+    def __init__(self,
+                 xml_file='hopper_jump.xml',
+                 forward_reward_weight=1.0,
+                 ctrl_cost_weight=1e-3,
+                 healthy_reward=0.0,
+                 penalty=0.0,
+                 context=True,
+                 terminate_when_unhealthy=False,
+                 healthy_state_range=(-100.0, 100.0),
+                 healthy_z_range=(0.5, float('inf')),
+                 healthy_angle_range=(-float('inf'), float('inf')),
+                 reset_noise_scale=5e-3,
+                 exclude_current_positions_from_observation=True,
+                 max_episode_steps=250,
+                 height_scale = 10,
+                 dist_scale = 3,
+                 healthy_scale = 2):
+        self.height_scale = height_scale
+        self.dist_scale = dist_scale
+        self.healthy_scale = healthy_scale
+        super().__init__(xml_file, forward_reward_weight, ctrl_cost_weight, healthy_reward, penalty, context,
+                         terminate_when_unhealthy, healthy_state_range, healthy_z_range, healthy_angle_range,
+                         reset_noise_scale, exclude_current_positions_from_observation, max_episode_steps)
+
     def step(self, action):

         self._floor_geom_id = self.model.geom_name2id('floor')
@@ -226,10 +253,10 @@ class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv):
         self.max_height = max(height_after, self.max_height)

         ctrl_cost = self.control_cost(action)
-        healthy_reward = self.healthy_reward * 2
-        height_reward = 10*height_after
+        healthy_reward = self.healthy_reward * self.healthy_scale
+        height_reward = self.height_scale*height_after
         goal_dist = np.atleast_1d(np.linalg.norm(site_pos_after - np.array([self.goal, 0, 0], dtype=object)))[0]
-        goal_dist_reward = -3*goal_dist
+        goal_dist_reward = -self.dist_scale*goal_dist
         dist_reward = self._forward_reward_weight * (goal_dist_reward + height_reward)
         reward = -ctrl_cost + healthy_reward + dist_reward
         done = False
@@ -240,12 +267,10 @@ class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv):
             'max_height': self.max_height,
             'goal': self.goal,
             'goal_dist': goal_dist,
-            'dist_rew': dist_reward,
-            'height_rew': height_reward,
-            'healthy_reward': healthy_reward,
-            'ctrl_reward': -ctrl_cost
+            'height_rew': self.max_height,
+            'healthy_reward': self.healthy_reward * 2,
+            'healthy': self.is_healthy
         }
         return observation, reward, done, info


 class ALRHopperJumpRndmPosEnv(ALRHopperJumpEnv):
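
Usage sketch (not part of the patch): a minimal rollout against the new step-based BeerPong class, assuming a working alr_envs / mujoco_py installation; the import path and constructor arguments follow the beerpong.py changes above, the rollout loop itself is only illustrative.

    from alr_envs.alr.mujoco.beerpong.beerpong import ALRBeerBongEnvStepBased

    # release_step = 62 is tuned for frame_skip=2 (see the comment in __init__ above)
    env = ALRBeerBongEnvStepBased(frame_skip=2, rndm_goal=True)
    obs = env.reset()
    done = False
    while not done:
        ac = env.action_space.sample()  # raw joint action; the ball release is handled internally via release_step
        # once self._steps reaches release_step, step() rolls the simulation out to the episode
        # end with zero actions and returns the aggregated reward for that final transition
        obs, reward, done, info = env.step(ac)
    env.close()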