From a0af74358562b69ddf8c7de6f7d276d56433bdfd Mon Sep 17 00:00:00 2001
From: Maximilian Huettenrauch
Date: Mon, 6 Dec 2021 13:43:45 +0100
Subject: [PATCH] updated table tennis and beerpong for promp usage

---
 alr_envs/alr/__init__.py                      | 64 +++++++++++++------
 alr_envs/alr/classic_control/README.MD        |  5 +-
 alr_envs/alr/mujoco/__init__.py               |  2 +-
 alr_envs/alr/mujoco/beerpong/beerpong.py      | 10 +--
 .../alr/mujoco/beerpong/beerpong_reward.py    |  2 +
 .../mujoco/beerpong/beerpong_reward_staged.py | 55 +++++++---------
 .../alr/mujoco/beerpong/beerpong_simple.py    | 11 ++--
 alr_envs/alr/mujoco/table_tennis/tt_gym.py    | 44 +++++++------
 8 files changed, 107 insertions(+), 86 deletions(-)

diff --git a/alr_envs/alr/__init__.py b/alr_envs/alr/__init__.py
index f835328..e2ba068 100644
--- a/alr_envs/alr/__init__.py
+++ b/alr_envs/alr/__init__.py
@@ -198,14 +198,19 @@ register(
 
 ## Table Tennis
 register(id='TableTennis2DCtxt-v0',
-         entry_point='alr_envs.alr.mujoco:TT_Env_Gym',
+         entry_point='alr_envs.alr.mujoco:TTEnvGym',
          max_episode_steps=MAX_EPISODE_STEPS,
-         kwargs={'ctxt_dim':2})
+         kwargs={'ctxt_dim': 2})
+
+register(id='TableTennis2DCtxt-v1',
+         entry_point='alr_envs.alr.mujoco:TTEnvGym',
+         max_episode_steps=1750,
+         kwargs={'ctxt_dim': 2, 'fixed_goal': True})
 
 register(id='TableTennis4DCtxt-v0',
-         entry_point='alr_envs.alr.mujoco:TT_Env_Gym',
+         entry_point='alr_envs.alr.mujoco:TTEnvGym',
          max_episode_steps=MAX_EPISODE_STEPS,
-         kwargs={'ctxt_dim':4})
+         kwargs={'ctxt_dim': 4})
 
 ## BeerPong
 difficulties = ["simple", "intermediate", "hard", "hardest"]
@@ -369,13 +374,10 @@ register(
         "mp_kwargs": {
             "num_dof": 7,
             "num_basis": 2,
-            "n_zero_bases": 2,
-            "duration": 0.5,
-            "post_traj_time": 2.5,
-            # "width": 0.01,
-            # "off": 0.01,
+            "duration": 1,
+            "post_traj_time": 2,
             "policy_type": "motor",
-            "weights_scale": 0.08,
+            "weights_scale": 0.2,
             "zero_start": True,
             "zero_goal": False,
             "policy_kwargs": {
@@ -388,22 +390,46 @@ register(
 ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append("BeerpongProMP-v0")
 
 ## Table Tennis
+ctxt_dim = [2, 4]
+for _v, cd in enumerate(ctxt_dim):
+    _env_id = f'TableTennisProMP-v{_v}'
+    register(
+        id=_env_id,
+        entry_point='alr_envs.utils.make_env_helpers:make_promp_env_helper',
+        kwargs={
+            "name": "alr_envs:TableTennis{}DCtxt-v0".format(cd),
+            "wrappers": [mujoco.table_tennis.MPWrapper],
+            "mp_kwargs": {
+                "num_dof": 7,
+                "num_basis": 2,
+                "duration": 1.25,
+                "post_traj_time": 4.5,
+                "policy_type": "motor",
+                "weights_scale": 1.0,
+                "zero_start": True,
+                "zero_goal": False,
+                "policy_kwargs": {
+                    "p_gains": 0.5*np.array([1.0, 4.0, 2.0, 4.0, 1.0, 4.0, 1.0]),
+                    "d_gains": 0.5*np.array([0.1, 0.4, 0.2, 0.4, 0.1, 0.4, 0.1])
+                }
+            }
+        }
+    )
+    ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append(_env_id)
+
 register(
-    id='TableTennisProMP-v0',
+    id='TableTennisProMP-v2',
     entry_point='alr_envs.utils.make_env_helpers:make_promp_env_helper',
     kwargs={
-        "name": "alr_envs:TableTennis4DCtxt-v0",
+        "name": "alr_envs:TableTennis2DCtxt-v1",
         "wrappers": [mujoco.table_tennis.MPWrapper],
         "mp_kwargs": {
            "num_dof": 7,
            "num_basis": 2,
-           "n_zero_bases": 2,
-           "duration": 1.25,
-           "post_traj_time": 4.5,
-           # "width": 0.01,
-           # "off": 0.01,
+           "duration": 1.,
+           "post_traj_time": 2.5,
            "policy_type": "motor",
-           "weights_scale": 1.0,
+           "weights_scale": 0.2,
            "zero_start": True,
            "zero_goal": False,
            "policy_kwargs": {
@@ -413,4 +439,4 @@ register(
             }
         }
     }
 )
-ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append("TableTennisProMP-v0")
+ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append("TableTennisProMP-v2")
diff --git a/alr_envs/alr/classic_control/README.MD b/alr_envs/alr/classic_control/README.MD
index 0bd2f92..bd1b68b 100644
--- a/alr_envs/alr/classic_control/README.MD
+++ b/alr_envs/alr/classic_control/README.MD
@@ -13,9 +13,6 @@
 |---|---|---|---|---|
 |`ViaPointReacherDMP-v0`| A DMP provides a trajectory for the `ViaPointReacher-v0` task. | 200 | 25
 |`HoleReacherFixedGoalDMP-v0`| A DMP provides a trajectory for the `HoleReacher-v0` task with a fixed goal attractor. | 200 | 25
-|`HoleReacherDMP-v0`| A DMP provides a trajectory for the `HoleReacher-v0` task. The goal attractor needs to be learned. | 200 | 30
-|`ALRBallInACupSimpleDMP-v0`| A DMP provides a trajectory for the `ALRBallInACupSimple-v0` task where only 3 joints are actuated. | 4000 | 15
-|`ALRBallInACupDMP-v0`| A DMP provides a trajectory for the `ALRBallInACup-v0` task. | 4000 | 35
-|`ALRBallInACupGoalDMP-v0`| A DMP provides a trajectory for the `ALRBallInACupGoal-v0` task. | 4000 | 35 | 3
+|`HoleReacherDMP-v0`| A DMP provides a trajectory for the `HoleReacher-v0` task. The goal attractor needs to be learned. | 200 | 30
 
 [//]: |`HoleReacherProMPP-v0`|
\ No newline at end of file
diff --git a/alr_envs/alr/mujoco/__init__.py b/alr_envs/alr/mujoco/__init__.py
index 30e1e7c..cdb3cde 100644
--- a/alr_envs/alr/mujoco/__init__.py
+++ b/alr_envs/alr/mujoco/__init__.py
@@ -2,5 +2,5 @@ from .reacher.alr_reacher import ALRReacherEnv
 from .reacher.balancing import BalancingEnv
 from .ball_in_a_cup.ball_in_a_cup import ALRBallInACupEnv
 from .ball_in_a_cup.biac_pd import ALRBallInACupPDEnv
-from .table_tennis.tt_gym import TT_Env_Gym
+from .table_tennis.tt_gym import TTEnvGym
 from .beerpong.beerpong import ALRBeerBongEnv
\ No newline at end of file
diff --git a/alr_envs/alr/mujoco/beerpong/beerpong.py b/alr_envs/alr/mujoco/beerpong/beerpong.py
index a10e54a..a86f0a7 100644
--- a/alr_envs/alr/mujoco/beerpong/beerpong.py
+++ b/alr_envs/alr/mujoco/beerpong/beerpong.py
@@ -27,10 +27,10 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
         self.ball_site_id = 0
         self.ball_id = 11
 
-        self._release_step = 100  # time step of ball release
+        self._release_step = 175  # time step of ball release
 
-        self.sim_time = 4  # seconds
-        self.ep_length = 600  # based on 5 seconds with dt = 0.005 int(self.sim_time / self.dt)
+        self.sim_time = 3  # seconds
+        self.ep_length = 600  # based on 3 seconds with dt = 0.005 int(self.sim_time / self.dt)
         self.cup_table_id = 10
 
         if noisy:
@@ -143,7 +143,7 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
                     q_vel=self.sim.data.qvel[0:7].ravel().copy(),
                     ball_pos=ball_pos,
                     ball_vel=ball_vel,
-                    is_success=success,
+                    success=success,
                     is_collided=is_collided, sim_crash=crash)
 
     def check_traj_in_joint_limits(self):
@@ -171,7 +171,7 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
 
 
 if __name__ == "__main__":
-    env = ALRBeerBongEnv(reward_type="no_context", difficulty='hardest')
+    env = ALRBeerBongEnv(reward_type="staged", difficulty='hardest')
 
     # env.configure(ctxt)
     env.reset()
diff --git a/alr_envs/alr/mujoco/beerpong/beerpong_reward.py b/alr_envs/alr/mujoco/beerpong/beerpong_reward.py
index 3896e82..dc39ca8 100644
--- a/alr_envs/alr/mujoco/beerpong/beerpong_reward.py
+++ b/alr_envs/alr/mujoco/beerpong/beerpong_reward.py
@@ -71,6 +71,7 @@ class BeerPongReward:
 
         goal_pos = env.sim.data.site_xpos[self.goal_id]
         ball_pos = env.sim.data.body_xpos[self.ball_id]
+        ball_vel = env.sim.data.body_xvelp[self.ball_id]
         goal_final_pos = env.sim.data.site_xpos[self.goal_final_id]
         self.dists.append(np.linalg.norm(goal_pos - ball_pos))
         self.dists_final.append(np.linalg.norm(goal_final_pos - ball_pos))
@@ -131,6 +132,7 @@ class BeerPongReward:
             infos["success"] = success
             infos["is_collided"] = self._is_collided
             infos["ball_pos"] = ball_pos.copy()
+            infos["ball_vel"] = ball_vel.copy()
             infos["action_cost"] = 5e-4 * action_cost
 
         return reward, infos
diff --git a/alr_envs/alr/mujoco/beerpong/beerpong_reward_staged.py b/alr_envs/alr/mujoco/beerpong/beerpong_reward_staged.py
index 9d1d878..d64f179 100644
--- a/alr_envs/alr/mujoco/beerpong/beerpong_reward_staged.py
+++ b/alr_envs/alr/mujoco/beerpong/beerpong_reward_staged.py
@@ -81,32 +81,36 @@ class BeerPongReward:
         action_cost = np.sum(np.square(action))
         self.action_costs.append(action_cost)
 
+        if not self.ball_table_contact:
+            self.ball_table_contact = self._check_collision_single_objects(env.sim, self.ball_collision_id,
+                                                                           self.table_collision_id)
+
         self._is_collided = self._check_collision_with_itself(env.sim, self.robot_collision_ids)
         if env._steps == env.ep_length - 1 or self._is_collided:
             min_dist = np.min(self.dists)
-            ball_table_bounce = self._check_collision_single_objects(env.sim, self.ball_collision_id,
-                                                                     self.table_collision_id)
-            ball_cup_table_cont = self._check_collision_with_set_of_objects(env.sim, self.ball_collision_id,
-                                                                            self.cup_collision_ids)
-            ball_wall_cont = self._check_collision_single_objects(env.sim, self.ball_collision_id,
-                                                                  self.wall_collision_id)
+            final_dist = self.dists_final[-1]
+
             ball_in_cup = self._check_collision_single_objects(env.sim, self.ball_collision_id,
                                                                self.cup_table_collision_id)
-            if not ball_in_cup:
-                cost_offset = 2
-                if not ball_cup_table_cont and not ball_table_bounce and not ball_wall_cont:
-                    cost_offset += 2
-                cost = cost_offset + min_dist ** 2 + 0.5 * self.dists_final[-1] ** 2 + 1e-7 * action_cost
-            else:
-                cost = self.dists_final[-1] ** 2 + 1.5 * action_cost * 1e-7
-            reward = - 1 * cost - self.collision_penalty * int(self._is_collided)
+
+            # encourage bounce before falling into cup
+            if not ball_in_cup:
+                if not self.ball_table_contact:
+                    reward = 0.2 * (1 - np.tanh(min_dist ** 2)) + 0.1 * (1 - np.tanh(final_dist ** 2))
+                else:
+                    reward = (1 - np.tanh(min_dist ** 2)) + 0.5 * (1 - np.tanh(final_dist ** 2))
+            else:
+                if not self.ball_table_contact:
+                    reward = 2 * (1 - np.tanh(final_dist ** 2)) + 1 * (1 - np.tanh(min_dist ** 2)) + 1
+                else:
+                    reward = 2 * (1 - np.tanh(final_dist ** 2)) + 1 * (1 - np.tanh(min_dist ** 2)) + 3
+
+            # reward = - 1 * cost - self.collision_penalty * int(self._is_collided)
             success = ball_in_cup
             crash = self._is_collided
         else:
-            reward = - 1e-7 * action_cost
-            cost = 0
+            reward = - 1e-4 * action_cost
             success = False
             crash = False
@@ -115,26 +119,11 @@
         infos["success"] = success
         infos["is_collided"] = self._is_collided
         infos["ball_pos"] = ball_pos.copy()
         infos["ball_vel"] = ball_vel.copy()
-        infos["action_cost"] = 5e-4 * action_cost
-        infos["task_cost"] = cost
+        infos["action_cost"] = action_cost
+        infos["task_reward"] = reward
 
         return reward, infos
 
-    def get_cost_offset(self):
-        if self.ball_ground_contact:
-            return 200
-
-        if not self.ball_table_contact:
-            return 100
-
-        if not self.ball_in_cup:
-            return 50
-
-        if self.ball_in_cup and self.ball_cup_contact and not self.noisy_bp:
-            return 10
-
-        return 0
-
     def _check_collision_single_objects(self, sim, id_1, id_2):
         for coni in range(0, sim.data.ncon):
             con = sim.data.contact[coni]
diff --git a/alr_envs/alr/mujoco/beerpong/beerpong_simple.py b/alr_envs/alr/mujoco/beerpong/beerpong_simple.py
index 73da83d..1708d38 100644
--- a/alr_envs/alr/mujoco/beerpong/beerpong_simple.py
+++ b/alr_envs/alr/mujoco/beerpong/beerpong_simple.py
@@ -6,8 +6,6 @@ from gym.envs.mujoco import MujocoEnv
 
 class ALRBeerpongEnv(MujocoEnv, utils.EzPickle):
     def __init__(self, n_substeps=4, apply_gravity_comp=True, reward_function=None):
-        utils.EzPickle.__init__(**locals())
-
         self._steps = 0
         self.xml_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                      "assets",
@@ -28,15 +26,13 @@ class ALRBeerpongEnv(MujocoEnv, utils.EzPickle):
 
         self.context = None
 
-        MujocoEnv.__init__(self, model_path=self.xml_path, frame_skip=n_substeps)
-
         # alr_mujoco_env.AlrMujocoEnv.__init__(self,
         #                                      self.xml_path,
         #                                      apply_gravity_comp=apply_gravity_comp,
         #                                      n_substeps=n_substeps)
 
         self.sim_time = 8  # seconds
-        self.sim_steps = int(self.sim_time / self.dt)
+        # self.sim_steps = int(self.sim_time / self.dt)
         if reward_function is None:
             from alr_envs.alr.mujoco.beerpong.beerpong_reward_simple import BeerpongReward
             reward_function = BeerpongReward
@@ -46,6 +42,9 @@ class ALRBeerpongEnv(MujocoEnv, utils.EzPickle):
         self.cup_table_id = self.sim.model._body_name2id["cup_table"]
         # self.bounce_table_id = self.sim.model._body_name2id["bounce_table"]
 
+        MujocoEnv.__init__(self, model_path=self.xml_path, frame_skip=n_substeps)
+        utils.EzPickle.__init__(self)
+
     @property
     def current_pos(self):
         return self.sim.data.qpos[0:7].copy()
@@ -90,7 +89,7 @@ class ALRBeerpongEnv(MujocoEnv, utils.EzPickle):
         reward_ctrl = - np.square(a).sum()
         action_cost = np.sum(np.square(a))
 
-        crash = self.do_simulation(a)
+        crash = self.do_simulation(a, self.frame_skip)
         joint_cons_viol = self.check_traj_in_joint_limits()
 
         self._q_pos.append(self.sim.data.qpos[0:7].ravel().copy())
diff --git a/alr_envs/alr/mujoco/table_tennis/tt_gym.py b/alr_envs/alr/mujoco/table_tennis/tt_gym.py
index 635d49d..f42c4c7 100644
--- a/alr_envs/alr/mujoco/table_tennis/tt_gym.py
+++ b/alr_envs/alr/mujoco/table_tennis/tt_gym.py
@@ -10,7 +10,7 @@ from alr_envs.alr.mujoco.table_tennis.tt_reward import TT_Reward
 
 #TODO: Check for simulation stability. Make sure the code runs even for sim crash
 
-MAX_EPISODE_STEPS = 1375
+MAX_EPISODE_STEPS = 2875
 BALL_NAME_CONTACT = "target_ball_contact"
 BALL_NAME = "target_ball"
 TABLE_NAME = 'table_tennis_table'
@@ -22,15 +22,20 @@ RACKET_NAME = 'bat'
 CONTEXT_RANGE_BOUNDS_2DIM = np.array([[-1.2, -0.6], [-0.2, 0.0]])
 CONTEXT_RANGE_BOUNDS_4DIM = np.array([[-1.35, -0.75, -1.25, -0.75], [-0.1, 0.75, -0.1, 0.75]])
 
 
-class TT_Env_Gym(MujocoEnv, utils.EzPickle):
-    def __init__(self, ctxt_dim=2):
+class TTEnvGym(MujocoEnv, utils.EzPickle):
+
+    def __init__(self, ctxt_dim=2, fixed_goal=False):
         model_path = os.path.join(os.path.dirname(__file__), "xml", 'table_tennis_env.xml')
 
         self.ctxt_dim = ctxt_dim
+        self.fixed_goal = fixed_goal
         if ctxt_dim == 2:
             self.context_range_bounds = CONTEXT_RANGE_BOUNDS_2DIM
-            self.goal = np.zeros(3)  # 2 x,y + 1z
+            if self.fixed_goal:
+                self.goal = np.array([-1, -0.1, 0])
+            else:
+                self.goal = np.zeros(3)  # 2 x,y + 1z
         elif ctxt_dim == 4:
             self.context_range_bounds = CONTEXT_RANGE_BOUNDS_4DIM
             self.goal = np.zeros(3)
@@ -47,10 +52,10 @@ class TT_Env_Gym(MujocoEnv, utils.EzPickle):
         self.reward_func = TT_Reward(self.ctxt_dim)
 
         self.ball_landing_pos = None
-        self.hited_ball = False
+        self.hit_ball = False
         self.ball_contact_after_hit = False
         self._ids_set = False
-        super(TT_Env_Gym, self).__init__(model_path=model_path, frame_skip=1)
+        super(TTEnvGym, self).__init__(model_path=model_path, frame_skip=1)
         self.ball_id = self.sim.model._body_name2id[BALL_NAME]  # find the proper -> not protected func.
         self.ball_contact_id = self.sim.model._geom_name2id[BALL_NAME_CONTACT]
         self.table_contact_id = self.sim.model._geom_name2id[TABLE_NAME]
@@ -77,15 +82,18 @@ class TT_Env_Gym(MujocoEnv, utils.EzPickle):
         return obs
 
     def sample_context(self):
-        return np.random.uniform(self.context_range_bounds[0], self.context_range_bounds[1], size=self.ctxt_dim)
+        return self.np_random.uniform(self.context_range_bounds[0], self.context_range_bounds[1], size=self.ctxt_dim)
 
     def reset_model(self):
         self.set_state(self.init_qpos_tt, self.init_qvel_tt)  # reset to initial sim state
         self.time_steps = 0
         self.ball_landing_pos = None
-        self.hited_ball = False
+        self.hit_ball = False
         self.ball_contact_after_hit = False
-        self.goal = self.sample_context()[:2]
+        if self.fixed_goal:
+            self.goal = self.goal[:2]
+        else:
+            self.goal = self.sample_context()[:2]
         if self.ctxt_dim == 2:
             initial_ball_state = ball_init(random=False)  # fixed velocity, fixed position
         elif self.ctxt_dim == 4:
@@ -122,12 +130,12 @@ class TT_Env_Gym(MujocoEnv, utils.EzPickle):
         if not self._ids_set:
             self._set_ids()
         done = False
-        episode_end = False if self.time_steps+1