fixing beerpong rewards

Onur 2022-04-07 18:49:44 +02:00
parent 04b6b314cf
commit eb7dd3a18f
3 changed files with 94 additions and 56 deletions

View File

@@ -430,29 +430,3 @@ for _v, cd in enumerate(ctxt_dim):
         }
     )
     ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append(_env_id)
-
-# register(
-#     id='TableTennisProMP-v2',
-#     entry_point='alr_envs.utils.make_env_helpers:make_promp_env_helper',
-#     kwargs={
-#         "name": "alr_envs:TableTennis2DCtxt-v1",
-#         "wrappers": [mujoco.table_tennis.MPWrapper],
-#         "mp_kwargs": {
-#             "num_dof": 7,
-#             "num_basis": 2,
-#             "duration": 1.25,
-#             "post_traj_time": 4.5,
-#             #"width": 0.01,
-#             #"off": 0.01,
-#             "policy_type": "motor",
-#             "weights_scale": 1.0,
-#             "zero_start": True,
-#             "zero_goal": False,
-#             "policy_kwargs": {
-#                 "p_gains": 0.5*np.array([1.0, 4.0, 2.0, 4.0, 1.0, 4.0, 1.0]),
-#                 "d_gains": 0.5*np.array([0.1, 0.4, 0.2, 0.4, 0.1, 0.4, 0.1])
-#             }
-#         }
-#     }
-# )
-# ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append("TableTennisProMP-v2")

View File

@@ -7,21 +7,16 @@ from gym.envs.mujoco import MujocoEnv
 from alr_envs.alr.mujoco.beerpong.beerpong_reward_staged import BeerPongReward
 
-# CUP_POS_MIN = np.array([-0.32, -2.2])
-# CUP_POS_MAX = np.array([0.32, -1.2])
-CUP_POS_MIN = np.array([-1.42, -4.05])
-CUP_POS_MAX = np.array([1.42, -1.25])
+CUP_POS_MIN = np.array([-0.32, -2.2])
+CUP_POS_MAX = np.array([0.32, -1.2])
 
 
 class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
     def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False,
                  rndm_goal=False, cup_goal_pos=None):
-        if cup_goal_pos is None:
-            cup_goal_pos = [-0.3, -1.2, 0.840]
-        elif len(cup_goal_pos)==2:
-            cup_goal_pos = np.array(cup_goal_pos)
-            cup_goal_pos = np.insert(cup_goal_pos, 2, 0.80)
+        cup_goal_pos = np.array(cup_goal_pos if cup_goal_pos is not None else [-0.3, -1.2, 0.840])
+        if cup_goal_pos.shape[0]==2:
+            cup_goal_pos = np.insert(cup_goal_pos, 2, 0.840)
         self.cup_goal_pos = np.array(cup_goal_pos)
 
         self._steps = 0
@@ -52,7 +47,6 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
         else:
             self.noise_std = 0
-
         reward_function = BeerPongReward
         self.reward_function = reward_function()
@@ -94,7 +88,7 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
         start_pos[7::] = self.sim.data.site_xpos[self.ball_site_id, :].copy()
         self.set_state(start_pos, init_vel)
         if self.rndm_goal:
-            xy = np.random.uniform(CUP_POS_MIN, CUP_POS_MAX)
+            xy = self.np_random.uniform(CUP_POS_MIN, CUP_POS_MAX)
             xyz = np.zeros(3)
             xyz[:2] = xy
             xyz[-1] = 0.840
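The two changed blocks above together determine where the cup goal comes from: the constructor now normalizes any user-supplied cup_goal_pos to a 3D point with the table height as z, and the reset path samples the goal from the environment's seeded np_random instead of the global np.random. A minimal standalone sketch of that logic (the function names and the rng argument are illustrative, not part of the env API):

import numpy as np

CUP_POS_MIN = np.array([-0.32, -2.2])   # x/y sampling bounds from the diff
CUP_POS_MAX = np.array([0.32, -1.2])
TABLE_Z = 0.840                         # cup base height used in reset()

def normalize_cup_goal(cup_goal_pos=None):
    # mirrors the new __init__ logic: accept None, (x, y) or (x, y, z)
    cup_goal_pos = np.array(cup_goal_pos if cup_goal_pos is not None else [-0.3, -1.2, TABLE_Z])
    if cup_goal_pos.shape[0] == 2:
        cup_goal_pos = np.insert(cup_goal_pos, 2, TABLE_Z)
    return cup_goal_pos

def sample_cup_goal(rng):
    # mirrors the rndm_goal branch in reset(): x/y from the seeded generator, fixed z
    xyz = np.zeros(3)
    xyz[:2] = rng.uniform(CUP_POS_MIN, CUP_POS_MAX)
    xyz[-1] = TABLE_Z
    return xyz

print(normalize_cup_goal([-0.2, -1.5]))           # [-0.2  -1.5   0.84]
print(sample_cup_goal(np.random.default_rng(0)))  # reproducible given a seed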

View File

@@ -54,6 +54,7 @@ class BeerPongReward:
         self.ball_table_contact = False
         self.ball_wall_contact = False
         self.ball_cup_contact = False
+        self.ball_in_cup = False
         self.noisy_bp = noisy
         self._t_min_final_dist = -1
@@ -80,39 +81,94 @@ class BeerPongReward:
         action_cost = np.sum(np.square(action))
         self.action_costs.append(action_cost)
 
-        if not self.ball_table_contact:
-            self.ball_table_contact = self._check_collision_single_objects(env.sim, self.ball_collision_id,
-                                                                            self.table_collision_id)
+        # ##################### Reward function which forces to bounce once on the table (tanh) ########################
+        # if not self.ball_table_contact:
+        #     self.ball_table_contact = self._check_collision_single_objects(env.sim, self.ball_collision_id,
+        #                                                                     self.table_collision_id)
+        # self._is_collided = self._check_collision_with_itself(env.sim, self.robot_collision_ids)
+        # if env._steps == env.ep_length - 1 or self._is_collided:
+        #     min_dist = np.min(self.dists)
+        #     final_dist = self.dists_final[-1]
+        #
+        #     ball_in_cup = self._check_collision_single_objects(env.sim, self.ball_collision_id,
+        #                                                        self.cup_table_collision_id)
+        #
+        #     # encourage bounce before falling into cup
+        #     if not ball_in_cup:
+        #         if not self.ball_table_contact:
+        #             reward = 0.2 * (1 - np.tanh(0.5*min_dist)) + 0.1 * (1 - np.tanh(0.5*final_dist))
+        #         else:
+        #             reward = (1 - np.tanh(0.5*min_dist)) + 0.5 * (1 - np.tanh(0.5*final_dist))
+        #     else:
+        #         if not self.ball_table_contact:
+        #             reward = 2 * (1 - np.tanh(0.5*final_dist)) + 1 * (1 - np.tanh(0.5*min_dist)) + 1
+        #         else:
+        #             reward = 2 * (1 - np.tanh(0.5*final_dist)) + 1 * (1 - np.tanh(0.5*min_dist)) + 3
+        #
+        #     # reward = - 1 * cost - self.collision_penalty * int(self._is_collided)
+        #     success = ball_in_cup
+        #     crash = self._is_collided
+        # else:
+        #     reward = - 1e-2 * action_cost
+        #     success = False
+        #     crash = False
+        # ################################################################################################################
+        # ##################### Reward function which does not force to bounce once on the table (tanh) ################
+        # self._check_contacts(env.sim)
+        # self._is_collided = self._check_collision_with_itself(env.sim, self.robot_collision_ids)
+        # if env._steps == env.ep_length - 1 or self._is_collided:
+        #     min_dist = np.min(self.dists)
+        #     final_dist = self.dists_final[-1]
+        #
+        #     # encourage bounce before falling into cup
+        #     if not self.ball_in_cup:
+        #         if not self.ball_table_contact and not self.ball_cup_contact and not self.ball_wall_contact:
+        #             min_dist_coeff, final_dist_coeff, rew_offset = 0.2, 0.1, 0
+        #             # reward = 0.2 * (1 - np.tanh(0.5*min_dist)) + 0.1 * (1 - np.tanh(0.5*final_dist))
+        #         else:
+        #             min_dist_coeff, final_dist_coeff, rew_offset = 1, 0.5, 0
+        #             # reward = (1 - np.tanh(0.5*min_dist)) + 0.5 * (1 - np.tanh(0.5*final_dist))
+        #     else:
+        #         min_dist_coeff, final_dist_coeff, rew_offset = 1, 2, 3
+        #         # reward = 2 * (1 - np.tanh(0.5*final_dist)) + 1 * (1 - np.tanh(0.5*min_dist)) + 3
+        #
+        #     reward = final_dist_coeff * (1 - np.tanh(0.5 * final_dist)) + min_dist_coeff * (1 - np.tanh(0.5 * min_dist)) \
+        #              + rew_offset
+        #     success = self.ball_in_cup
+        #     crash = self._is_collided
+        # else:
+        #     reward = - 1e-2 * action_cost
+        #     success = False
+        #     crash = False
+        # ################################################################################################################
+        # ##################### Reward function which does not force to bounce once on the table (quad dist) ############
+        self._check_contacts(env.sim)
         self._is_collided = self._check_collision_with_itself(env.sim, self.robot_collision_ids)
         if env._steps == env.ep_length - 1 or self._is_collided:
             min_dist = np.min(self.dists)
             final_dist = self.dists_final[-1]
 
-            ball_in_cup = self._check_collision_single_objects(env.sim, self.ball_collision_id,
-                                                               self.cup_table_collision_id)
-
             # encourage bounce before falling into cup
-            if not ball_in_cup:
-                if not self.ball_table_contact:
-                    reward = 0.2 * (1 - np.tanh(0.5*min_dist)) + 0.1 * (1 - np.tanh(0.5*final_dist))
-                else:
-                    reward = (1 - np.tanh(0.5*min_dist)) + 0.5 * (1 - np.tanh(0.5*final_dist))
-            else:
-                if not self.ball_table_contact:
-                    reward = 2 * (1 - np.tanh(0.5*final_dist)) + 1 * (1 - np.tanh(0.5*min_dist)) + 1
-                else:
-                    reward = 2 * (1 - np.tanh(0.5*final_dist)) + 1 * (1 - np.tanh(0.5*min_dist)) + 3
-
-            # reward = - 1 * cost - self.collision_penalty * int(self._is_collided)
-            success = ball_in_cup
+            if not self.ball_in_cup:
+                if not self.ball_table_contact and not self.ball_cup_contact and not self.ball_wall_contact:
+                    min_dist_coeff, final_dist_coeff, rew_offset = 1, 0.5, -4
+                else:
+                    min_dist_coeff, final_dist_coeff, rew_offset = 1, 0.5, -2
+            else:
+                min_dist_coeff, final_dist_coeff, rew_offset = 0, 1, 0
+
+            reward = rew_offset - min_dist_coeff * min_dist**2 - final_dist_coeff * final_dist**2 - \
+                     1e-4*np.mean(action_cost)
+            success = self.ball_in_cup
             crash = self._is_collided
         else:
             reward = - 1e-2 * action_cost
             success = False
             crash = False
+        # ################################################################################################################
 
         infos = {}
         infos["success"] = success
@@ -124,6 +180,20 @@ class BeerPongReward:
 
         return reward, infos
 
+    def _check_contacts(self, sim):
+        if not self.ball_table_contact:
+            self.ball_table_contact = self._check_collision_single_objects(sim, self.ball_collision_id,
+                                                                            self.table_collision_id)
+        if not self.ball_cup_contact:
+            self.ball_cup_contact = self._check_collision_with_set_of_objects(sim, self.ball_collision_id,
+                                                                              self.cup_collision_ids)
+        if not self.ball_wall_contact:
+            self.ball_wall_contact = self._check_collision_single_objects(sim, self.ball_collision_id,
+                                                                           self.wall_collision_id)
+        if not self.ball_in_cup:
+            self.ball_in_cup = self._check_collision_single_objects(sim, self.ball_collision_id,
+                                                                    self.cup_table_collision_id)
+
     def _check_collision_single_objects(self, sim, id_1, id_2):
         for coni in range(0, sim.data.ncon):
             con = sim.data.contact[coni]
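For reference, the active "quad dist" branch added above reads as a small standalone function: the contact flags collected by _check_contacts select a coefficient set, and the reward is a negative quadratic in the minimal and final ball-cup distances plus a small action penalty. A hedged sketch under those assumptions (the function name and scalar arguments are illustrative; in the env, min_dist and final_dist come from self.dists and self.dists_final):

import numpy as np

def staged_quad_reward(min_dist, final_dist, action_cost,
                       ball_in_cup, ball_table_contact, ball_cup_contact, ball_wall_contact):
    # end-of-episode reward following the new "quad dist" branch in the diff
    if not ball_in_cup:
        if not (ball_table_contact or ball_cup_contact or ball_wall_contact):
            # ball touched neither table, cup nor wall: largest constant penalty
            min_dist_coeff, final_dist_coeff, rew_offset = 1, 0.5, -4
        else:
            min_dist_coeff, final_dist_coeff, rew_offset = 1, 0.5, -2
    else:
        # ball ended up in the cup: only the final distance is penalized
        min_dist_coeff, final_dist_coeff, rew_offset = 0, 1, 0
    return (rew_offset
            - min_dist_coeff * min_dist ** 2
            - final_dist_coeff * final_dist ** 2
            - 1e-4 * np.mean(action_cost))

# a miss that at least touched the cup vs. a clean hit
print(staged_quad_reward(0.3, 0.5, 1.0, False, False, True, False))  # ~ -2.2151
print(staged_quad_reward(0.0, 0.05, 1.0, True, True, False, False))  # ~ -0.0026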