diff --git a/alr_envs/alr/mujoco/beerpong/beerpong.py b/alr_envs/alr/mujoco/beerpong/beerpong.py
index 51b3ed3..9adab60 100644
--- a/alr_envs/alr/mujoco/beerpong/beerpong.py
+++ b/alr_envs/alr/mujoco/beerpong/beerpong.py
@@ -5,7 +5,27 @@
 import numpy as np
 from gym import utils
 from gym.envs.mujoco import MujocoEnv
-from alr_envs.alr.mujoco.beerpong.beerpong_reward_staged import BeerPongReward
+from alr_envs.alr.mujoco.beerpong.deprecated.beerpong_reward_staged import BeerPongReward
+
+# XML Variables
+ROBOT_COLLISION_OBJ = ["wrist_palm_link_convex_geom",
+                       "wrist_pitch_link_convex_decomposition_p1_geom",
+                       "wrist_pitch_link_convex_decomposition_p2_geom",
+                       "wrist_pitch_link_convex_decomposition_p3_geom",
+                       "wrist_yaw_link_convex_decomposition_p1_geom",
+                       "wrist_yaw_link_convex_decomposition_p2_geom",
+                       "forearm_link_convex_decomposition_p1_geom",
+                       "forearm_link_convex_decomposition_p2_geom",
+                       "upper_arm_link_convex_decomposition_p1_geom",
+                       "upper_arm_link_convex_decomposition_p2_geom",
+                       "shoulder_link_convex_decomposition_p1_geom",
+                       "shoulder_link_convex_decomposition_p2_geom",
+                       "shoulder_link_convex_decomposition_p3_geom",
+                       "base_link_convex_geom", "table_contact_geom"]
+
+CUP_COLLISION_OBJ = ["cup_geom_table3", "cup_geom_table4", "cup_geom_table5", "cup_geom_table6",
+                     "cup_geom_table7", "cup_geom_table8", "cup_geom_table9", "cup_geom_table10",
+                     "cup_geom_table15", "cup_geom_table16", "cup_geom_table17", "cup_geom1_table8"]
 
 
 class BeerPongEnv(MujocoEnv, utils.EzPickle):
@@ -28,11 +48,22 @@ class BeerPongEnv(MujocoEnv, utils.EzPickle):
         self.release_step = 100  # time step of ball release
         self.ep_length = 600 // frame_skip
 
-        self.reward_function = BeerPongReward()
         self.repeat_action = frame_skip
         self.model = None
         self.site_id = lambda x: self.model.site_name2id(x)
         self.body_id = lambda x: self.model.body_name2id(x)
+        self.geom_id = lambda x: self.model.geom_name2id(x)
+
+        # for reward calculation
+        self.dists = []
+        self.dists_final = []
+        self.action_costs = []
+        self.ball_ground_contact_first = False
+        self.ball_table_contact = False
+        self.ball_wall_contact = False
+        self.ball_cup_contact = False
+        self.ball_in_cup = False
+        self.dist_ground_cup = -1  # distance floor to cup if first floor contact
 
         MujocoEnv.__init__(self, self.xml_path, frame_skip=1)
         utils.EzPickle.__init__(self)
@@ -46,7 +77,15 @@ class BeerPongEnv(MujocoEnv, utils.EzPickle):
         return self._start_vel
 
     def reset(self):
-        self.reward_function.reset()
+        self.dists = []
+        self.dists_final = []
+        self.action_costs = []
+        self.ball_ground_contact_first = False
+        self.ball_table_contact = False
+        self.ball_wall_contact = False
+        self.ball_cup_contact = False
+        self.ball_in_cup = False
+        self.dist_ground_cup = -1  # distance floor to cup if first floor contact
         return super().reset()
 
     def reset_model(self):
@@ -76,7 +115,6 @@ class BeerPongEnv(MujocoEnv, utils.EzPickle):
         applied_action = a + self.sim.data.qfrc_bias[:len(a)].copy() / self.model.actuator_gear[:, 0]
         try:
             self.do_simulation(applied_action, self.frame_skip)
-            self.reward_function.initialize(self)
             # self.reward_function.check_contacts(self.sim)  # I assume this is not important?
             if self._steps < self.release_step:
                 self.sim.data.qpos[7::] = self.sim.data.site_xpos[self.site_id("init_ball_pos"), :].copy()
@@ -88,7 +126,7 @@ class BeerPongEnv(MujocoEnv, utils.EzPickle):
         ob = self._get_obs()
 
         if not crash:
-            reward, reward_infos = self.reward_function.compute_reward(self, applied_action)
+            reward, reward_infos = self._get_reward(applied_action)
             is_collided = reward_infos['is_collided']
             done = is_collided or self._steps == self.ep_length - 1
             self._steps += 1
@@ -119,13 +157,98 @@ class BeerPongEnv(MujocoEnv, utils.EzPickle):
             cup_goal_diff_final,
             cup_goal_diff_top,
             self.sim.model.body_pos[self.body_id("cup_table")][:2].copy(),
-            [self._steps],
+            # [self._steps],  # Use TimeAwareObservation Wrapper instead ....
         ])
 
     @property
     def dt(self):
         return super(BeerPongEnv, self).dt * self.repeat_action
 
+    def _get_reward(self, action):
+        goal_pos = self.data.get_site_xpos("cup_goal_table")
+        ball_pos = self.data.get_body_xpos("ball")
+        ball_vel = self.data.get_body_xvelp("ball")
+        goal_final_pos = self.data.get_site_xpos("cup_goal_final_table")
+
+        self._check_contacts()
+        self.dists.append(np.linalg.norm(goal_pos - ball_pos))
+        self.dists_final.append(np.linalg.norm(goal_final_pos - ball_pos))
+        self.dist_ground_cup = np.linalg.norm(ball_pos - goal_pos) \
+            if self.ball_ground_contact_first and self.dist_ground_cup == -1 else self.dist_ground_cup
+        action_cost = np.sum(np.square(action))
+        self.action_costs.append(np.copy(action_cost))
+        # ##################### Reward function which does not force to bounce once on the table (quad dist) #########
+
+        # Is this needed?
+        # self._is_collided = self._check_collision_with_itself([self.geom_id(name) for name in CUP_COLLISION_OBJ])
+
+        if self._steps == self.ep_length - 1:  # or self._is_collided:
+            min_dist = np.min(self.dists)
+            final_dist = self.dists_final[-1]
+            if self.ball_ground_contact_first:
+                min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 1, 0.5, 2, -4
+            else:
+                if not self.ball_in_cup:
+                    if not self.ball_table_contact and not self.ball_cup_contact and not self.ball_wall_contact:
+                        min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 1, 0.5, 0, -4
+                    else:
+                        min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 1, 0.5, 0, -2
+                else:
+                    min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 0, 1, 0, 0
+            action_cost = 1e-4 * np.mean(action_cost)
+            reward = rew_offset - min_dist_coeff * min_dist ** 2 - final_dist_coeff * final_dist ** 2 - \
+                     action_cost - ground_contact_dist_coeff * self.dist_ground_cup ** 2
+            # release step punishment
+            min_time_bound = 0.1
+            max_time_bound = 1.0
+            release_time = self.release_step * self.dt
+            release_time_rew = int(release_time < min_time_bound) * (-30 - 10 * (release_time - min_time_bound) ** 2) + \
+                               int(release_time > max_time_bound) * (-30 - 10 * (release_time - max_time_bound) ** 2)
+            reward += release_time_rew
+            success = self.ball_in_cup
+        else:
+            action_cost = 1e-2 * action_cost
+            reward = - action_cost
+            success = False
+        # ##############################################################################################################
+        infos = {"success": success, "ball_pos": ball_pos.copy(),
+                 "ball_vel": ball_vel.copy(), "action_cost": action_cost, "task_reward": reward,
+                 "table_contact_first": int(not self.ball_ground_contact_first),
+                 "is_collided": False}  # TODO: Check if is_collided is needed
+        return reward, infos
+
+    def _check_contacts(self):
+        if not self.ball_table_contact:
+            self.ball_table_contact = self._check_collision({self.geom_id("ball_geom")},
+                                                            {self.geom_id("table_contact_geom")})
+        if not self.ball_cup_contact:
+            self.ball_cup_contact = self._check_collision({self.geom_id("ball_geom")},
+                                                          {self.geom_id(name) for name in CUP_COLLISION_OBJ})
+        if not self.ball_wall_contact:
+            self.ball_wall_contact = self._check_collision({self.geom_id("ball_geom")},
+                                                           {self.geom_id("wall")})
+        if not self.ball_in_cup:
+            self.ball_in_cup = self._check_collision({self.geom_id("ball_geom")},
+                                                     {self.geom_id("cup_base_table_contact")})
+        if not self.ball_ground_contact_first:
+            if not self.ball_table_contact and not self.ball_cup_contact and not self.ball_wall_contact \
+                    and not self.ball_in_cup:
+                self.ball_ground_contact_first = self._check_collision({self.geom_id("ball_geom")},
+                                                                       {self.geom_id("ground")})
+
+    # Checks if id_set_1 has a collision with id_set_2
+    def _check_collision(self, id_set_1, id_set_2):
+        """
+        If id_set_2 is set to None, it will check for a collision with itself (id_set_1).
+        """
+        collision_id_set = id_set_2 - id_set_1 if id_set_2 is not None else id_set_1
+        for coni in range(self.sim.data.ncon):
+            con = self.sim.data.contact[coni]
+            if ((con.geom1 in id_set_1 and con.geom2 in collision_id_set) or
+                    (con.geom2 in id_set_1 and con.geom1 in collision_id_set)):
+                return True
+        return False
+
 
 class BeerPongEnvFixedReleaseStep(BeerPongEnv):
     def __init__(self, frame_skip=2):
@@ -190,6 +313,7 @@ class BeerPongEnvStepBasedEpisodicReward(BeerPongEnv):
 
 if __name__ == "__main__":
     env = BeerPongEnv(frame_skip=2)
+    env.seed(0)
     # env = ALRBeerBongEnvStepBased(frame_skip=2)
     # env = ALRBeerBongEnvStepBasedEpisodicReward(frame_skip=2)
     # env = ALRBeerBongEnvFixedReleaseStep(frame_skip=2)
@@ -197,14 +321,14 @@ if __name__ == "__main__":
     env.reset()
     env.render("human")
-    for i in range(1500):
-        ac = 10 * env.action_space.sample()
+    for i in range(600):
+        # ac = 10 * env.action_space.sample()
+        ac = 0.05 * np.ones(7)
         obs, rew, d, info = env.step(ac)
         env.render("human")
-        print(env.dt)
-        print(rew)
         if d:
+            print('reward:', rew)
             print('RESETTING')
             env.reset()
             time.sleep(1)
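
Note on the dropped `[self._steps]` observation entry: the in-code comment points to gym's `TimeAwareObservation` wrapper. Below is a minimal usage sketch, not part of this commit; the wrapper name comes from the comment above, the import path of `BeerPongEnv` is taken from this file, and the exact value the wrapper appends can differ between gym versions.

    # Sketch (assumption): re-attach a time feature to the observation via gym's
    # TimeAwareObservation wrapper instead of appending self._steps in _get_obs().
    from gym.wrappers import TimeAwareObservation
    from alr_envs.alr.mujoco.beerpong.beerpong import BeerPongEnv

    env = TimeAwareObservation(BeerPongEnv(frame_skip=2))
    obs = env.reset()  # observation now carries the elapsed step count as its last entry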