fixing beerpong rewards

parent 04b6b314cf
commit eb7dd3a18f
@@ -429,30 +429,4 @@ for _v, cd in enumerate(ctxt_dim):
             }
         }
     )
-    ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append(_env_id)
-
-# register(
-#     id='TableTennisProMP-v2',
-#     entry_point='alr_envs.utils.make_env_helpers:make_promp_env_helper',
-#     kwargs={
-#         "name": "alr_envs:TableTennis2DCtxt-v1",
-#         "wrappers": [mujoco.table_tennis.MPWrapper],
-#         "mp_kwargs": {
-#             "num_dof": 7,
-#             "num_basis": 2,
-#             "duration": 1.25,
-#             "post_traj_time": 4.5,
-#             #"width": 0.01,
-#             #"off": 0.01,
-#             "policy_type": "motor",
-#             "weights_scale": 1.0,
-#             "zero_start": True,
-#             "zero_goal": False,
-#             "policy_kwargs": {
-#                 "p_gains": 0.5*np.array([1.0, 4.0, 2.0, 4.0, 1.0, 4.0, 1.0]),
-#                 "d_gains": 0.5*np.array([0.1, 0.4, 0.2, 0.4, 0.1, 0.4, 0.1])
-#             }
-#         }
-#     }
-# )
-# ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append("TableTennisProMP-v2")
     ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append(_env_id)
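The hunk above drops a stale duplicate registration block. The pattern it keeps is the usual gym one: build an env id per context dimension, register it, and record the id in the ProMP lookup table. A minimal self-contained sketch of that pattern (the id scheme and entry point here are illustrative stand-ins, not the repo's real ones):

from gym.envs.registration import register

# Illustrative stand-ins for the repo's registry and context dimensions.
ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS = {"DMP": [], "ProMP": []}
ctxt_dim = [2, 4]

for _v, cd in enumerate(ctxt_dim):
    _env_id = f'MyEnvProMP-c{cd}-v{_v}'  # hypothetical id scheme
    register(
        id=_env_id,
        entry_point='my_pkg.envs:MyEnv',  # hypothetical entry point, resolved lazily by gym.make
        kwargs={'context_dim': cd},
    )
    # Record the id so callers can enumerate every ProMP variant.
    ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS['ProMP'].append(_env_id)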
@@ -7,21 +7,16 @@ from gym.envs.mujoco import MujocoEnv

 from alr_envs.alr.mujoco.beerpong.beerpong_reward_staged import BeerPongReward


-# CUP_POS_MIN = np.array([-0.32, -2.2])
-# CUP_POS_MAX = np.array([0.32, -1.2])
-
-CUP_POS_MIN = np.array([-1.42, -4.05])
-CUP_POS_MAX = np.array([1.42, -1.25])
+CUP_POS_MIN = np.array([-0.32, -2.2])
+CUP_POS_MAX = np.array([0.32, -1.2])


 class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
     def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False,
                  rndm_goal=False, cup_goal_pos=None):
-        if cup_goal_pos is None:
-            cup_goal_pos = [-0.3, -1.2, 0.840]
-        elif len(cup_goal_pos)==2:
-            cup_goal_pos = np.array(cup_goal_pos)
-            cup_goal_pos = np.insert(cup_goal_pos, 2, 0.80)
+        cup_goal_pos = np.array(cup_goal_pos if cup_goal_pos is not None else [-0.3, -1.2, 0.840])
+        if cup_goal_pos.shape[0]==2:
+            cup_goal_pos = np.insert(cup_goal_pos, 2, 0.840)
         self.cup_goal_pos = np.array(cup_goal_pos)

         self._steps = 0
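Besides simplifying the branching, the constructor change fixes a padding bug: the old 2D branch appended z = 0.80 instead of the 0.840 cup height used everywhere else. A standalone sketch of the new normalization, runnable outside the env (names and values follow the diff; the function wrapper is just for illustration):

import numpy as np

def normalize_cup_goal(cup_goal_pos=None):
    # Default goal from the diff; z = 0.840 is the cup height on the table.
    cup_goal_pos = np.array(cup_goal_pos if cup_goal_pos is not None else [-0.3, -1.2, 0.840])
    # A 2D (x, y) goal gets the height appended as its z coordinate.
    if cup_goal_pos.shape[0] == 2:
        cup_goal_pos = np.insert(cup_goal_pos, 2, 0.840)
    return cup_goal_pos

assert normalize_cup_goal().shape == (3,)
assert normalize_cup_goal([0.1, -1.5])[2] == 0.840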
@@ -52,7 +47,6 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
         else:
             self.noise_std = 0

-
         reward_function = BeerPongReward
         self.reward_function = reward_function()

@@ -94,7 +88,7 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
         start_pos[7::] = self.sim.data.site_xpos[self.ball_site_id, :].copy()
         self.set_state(start_pos, init_vel)
         if self.rndm_goal:
-            xy = np.random.uniform(CUP_POS_MIN, CUP_POS_MAX)
+            xy = self.np_random.uniform(CUP_POS_MIN, CUP_POS_MAX)
             xyz = np.zeros(3)
             xyz[:2] = xy
             xyz[-1] = 0.840
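Swapping np.random.uniform for self.np_random.uniform matters for reproducibility: gym environments carry a per-instance np_random generator that env.seed() controls, whereas the global np.random stream ignores the env's seed. A minimal sketch of the difference using gym's seeding helper directly (outside the env class):

import numpy as np
from gym.utils import seeding

CUP_POS_MIN = np.array([-0.32, -2.2])
CUP_POS_MAX = np.array([0.32, -1.2])

# Per-instance generators, as env.seed() would set up.
rng_a, _ = seeding.np_random(42)
rng_b, _ = seeding.np_random(42)

# Same seed, same sampled goal -- a guarantee the global
# np.random stream cannot give once other code touches it.
assert np.allclose(rng_a.uniform(CUP_POS_MIN, CUP_POS_MAX),
                   rng_b.uniform(CUP_POS_MIN, CUP_POS_MAX))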
@@ -54,6 +54,7 @@ class BeerPongReward:
         self.ball_table_contact = False
         self.ball_wall_contact = False
         self.ball_cup_contact = False
+        self.ball_in_cup = False
         self.noisy_bp = noisy
         self._t_min_final_dist = -1

@@ -80,39 +81,94 @@ class BeerPongReward:

         action_cost = np.sum(np.square(action))
         self.action_costs.append(action_cost)
-        if not self.ball_table_contact:
-            self.ball_table_contact = self._check_collision_single_objects(env.sim, self.ball_collision_id,
-                                                                           self.table_collision_id)
+        # ##################### Reward function which forces to bounce once on the table (tanh) ########################
+        # if not self.ball_table_contact:
+        #     self.ball_table_contact = self._check_collision_single_objects(env.sim, self.ball_collision_id,
+        #                                                                    self.table_collision_id)
+        # self._is_collided = self._check_collision_with_itself(env.sim, self.robot_collision_ids)
+        # if env._steps == env.ep_length - 1 or self._is_collided:
+        #     min_dist = np.min(self.dists)
+        #     final_dist = self.dists_final[-1]
+        #
+        #     ball_in_cup = self._check_collision_single_objects(env.sim, self.ball_collision_id,
+        #                                                        self.cup_table_collision_id)
+        #
+        #     # encourage bounce before falling into cup
+        #     if not ball_in_cup:
+        #         if not self.ball_table_contact:
+        #             reward = 0.2 * (1 - np.tanh(0.5*min_dist)) + 0.1 * (1 - np.tanh(0.5*final_dist))
+        #         else:
+        #             reward = (1 - np.tanh(0.5*min_dist)) + 0.5 * (1 - np.tanh(0.5*final_dist))
+        #     else:
+        #         if not self.ball_table_contact:
+        #             reward = 2 * (1 - np.tanh(0.5*final_dist)) + 1 * (1 - np.tanh(0.5*min_dist)) + 1
+        #         else:
+        #             reward = 2 * (1 - np.tanh(0.5*final_dist)) + 1 * (1 - np.tanh(0.5*min_dist)) + 3
+        #
+        #     # reward = - 1 * cost - self.collision_penalty * int(self._is_collided)
+        #     success = ball_in_cup
+        #     crash = self._is_collided
+        # else:
+        #     reward = - 1e-2 * action_cost
+        #     success = False
+        #     crash = False
+        # ################################################################################################################
+
+        # ##################### Reward function which does not force to bounce once on the table (tanh) ################
+        # self._check_contacts(env.sim)
+        # self._is_collided = self._check_collision_with_itself(env.sim, self.robot_collision_ids)
+        # if env._steps == env.ep_length - 1 or self._is_collided:
+        #     min_dist = np.min(self.dists)
+        #     final_dist = self.dists_final[-1]
+        #
+        #     # encourage bounce before falling into cup
+        #     if not self.ball_in_cup:
+        #         if not self.ball_table_contact and not self.ball_cup_contact and not self.ball_wall_contact:
+        #             min_dist_coeff, final_dist_coeff, rew_offset = 0.2, 0.1, 0
+        #             # reward = 0.2 * (1 - np.tanh(0.5*min_dist)) + 0.1 * (1 - np.tanh(0.5*final_dist))
+        #         else:
+        #             min_dist_coeff, final_dist_coeff, rew_offset = 1, 0.5, 0
+        #             # reward = (1 - np.tanh(0.5*min_dist)) + 0.5 * (1 - np.tanh(0.5*final_dist))
+        #     else:
+        #         min_dist_coeff, final_dist_coeff, rew_offset = 1, 2, 3
+        #         # reward = 2 * (1 - np.tanh(0.5*final_dist)) + 1 * (1 - np.tanh(0.5*min_dist)) + 3
+        #
+        #     reward = final_dist_coeff * (1 - np.tanh(0.5 * final_dist)) + min_dist_coeff * (1 - np.tanh(0.5 * min_dist)) \
+        #              + rew_offset
+        #     success = self.ball_in_cup
+        #     crash = self._is_collided
+        # else:
+        #     reward = - 1e-2 * action_cost
+        #     success = False
+        #     crash = False
+        # ################################################################################################################
+
+        # ##################### Reward function which does not force to bounce once on the table (quad dist) ############
+        self._check_contacts(env.sim)
         self._is_collided = self._check_collision_with_itself(env.sim, self.robot_collision_ids)
         if env._steps == env.ep_length - 1 or self._is_collided:
+
             min_dist = np.min(self.dists)
             final_dist = self.dists_final[-1]

-            ball_in_cup = self._check_collision_single_objects(env.sim, self.ball_collision_id,
-                                                               self.cup_table_collision_id)
-
-            # encourage bounce before falling into cup
-            if not ball_in_cup:
-                if not self.ball_table_contact:
-                    reward = 0.2 * (1 - np.tanh(0.5*min_dist)) + 0.1 * (1 - np.tanh(0.5*final_dist))
+            if not self.ball_in_cup:
+                if not self.ball_table_contact and not self.ball_cup_contact and not self.ball_wall_contact:
+                    min_dist_coeff, final_dist_coeff, rew_offset = 1, 0.5, -4
                 else:
-                    reward = (1 - np.tanh(0.5*min_dist)) + 0.5 * (1 - np.tanh(0.5*final_dist))
+                    min_dist_coeff, final_dist_coeff, rew_offset = 1, 0.5, -2
             else:
-                if not self.ball_table_contact:
-                    reward = 2 * (1 - np.tanh(0.5*final_dist)) + 1 * (1 - np.tanh(0.5*min_dist)) + 1
-                else:
-                    reward = 2 * (1 - np.tanh(0.5*final_dist)) + 1 * (1 - np.tanh(0.5*min_dist)) + 3
+                min_dist_coeff, final_dist_coeff, rew_offset = 0, 1, 0

-            # reward = - 1 * cost - self.collision_penalty * int(self._is_collided)
-            success = ball_in_cup
+            reward = rew_offset - min_dist_coeff * min_dist**2 - final_dist_coeff * final_dist**2 - \
+                     1e-4*np.mean(action_cost)
+            success = self.ball_in_cup
             crash = self._is_collided
         else:
             reward = - 1e-2 * action_cost
             success = False
             crash = False
+        # ################################################################################################################

         infos = {}
         infos["success"] = success
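The active branch above replaces the tanh shaping with a staged quadratic penalty: the offset encodes which contacts occurred (a miss with no contact at all is worst), and both distance terms are squared. A self-contained sketch of just that scoring rule, with the contact flags collapsed into plain arguments (constants are taken from the diff; the function wrapper is illustrative):

import numpy as np

def staged_quad_reward(min_dist, final_dist, action_cost, ball_in_cup, any_contact):
    # Coefficient stages mirror the diff's three cases.
    if not ball_in_cup:
        if not any_contact:
            # Never touched table, cup, or wall: largest offset penalty.
            min_dist_coeff, final_dist_coeff, rew_offset = 1, 0.5, -4
        else:
            min_dist_coeff, final_dist_coeff, rew_offset = 1, 0.5, -2
    else:
        # Success: only the (squared) final distance still matters.
        min_dist_coeff, final_dist_coeff, rew_offset = 0, 1, 0
    return (rew_offset
            - min_dist_coeff * min_dist ** 2
            - final_dist_coeff * final_dist ** 2
            - 1e-4 * np.mean(action_cost))

# A clean hit beats a near miss, which beats a wild miss:
assert staged_quad_reward(0.0, 0.05, 1.0, True, True) > \
       staged_quad_reward(0.1, 0.2, 1.0, False, True) > \
       staged_quad_reward(0.5, 1.0, 1.0, False, False)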
@@ -124,6 +180,20 @@ class BeerPongReward:

         return reward, infos

+    def _check_contacts(self, sim):
+        if not self.ball_table_contact:
+            self.ball_table_contact = self._check_collision_single_objects(sim, self.ball_collision_id,
+                                                                           self.table_collision_id)
+        if not self.ball_cup_contact:
+            self.ball_cup_contact = self._check_collision_with_set_of_objects(sim, self.ball_collision_id,
+                                                                              self.cup_collision_ids)
+        if not self.ball_wall_contact:
+            self.ball_wall_contact = self._check_collision_single_objects(sim, self.ball_collision_id,
+                                                                          self.wall_collision_id)
+        if not self.ball_in_cup:
+            self.ball_in_cup = self._check_collision_single_objects(sim, self.ball_collision_id,
+                                                                    self.cup_table_collision_id)
+
     def _check_collision_single_objects(self, sim, id_1, id_2):
         for coni in range(0, sim.data.ncon):
             con = sim.data.contact[coni]
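The new _check_contacts helper latches each flag the first time its contact fires, delegating to the pairwise scan whose first lines appear as context above. The hunk cuts off inside that scan; a hedged completion of the standard mujoco_py pattern it starts (the geom-pair test is the usual one, assumed rather than shown in the diff):

def check_collision_single_objects(sim, id_1, id_2):
    # mujoco_py exposes the active contacts in sim.data.contact[:sim.data.ncon].
    for coni in range(0, sim.data.ncon):
        con = sim.data.contact[coni]
        # A contact matches if its two geoms are exactly the requested
        # pair, in either order.
        if (con.geom1 == id_1 and con.geom2 == id_2) or \
           (con.geom1 == id_2 and con.geom2 == id_1):
            return True
    return False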