fixing beerpong rewards
This commit is contained in:
parent
04b6b314cf
commit
eb7dd3a18f
@ -430,29 +430,3 @@ for _v, cd in enumerate(ctxt_dim):
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append(_env_id)
|
ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append(_env_id)
|
||||||
|
|
||||||
# register(
|
|
||||||
# id='TableTennisProMP-v2',
|
|
||||||
# entry_point='alr_envs.utils.make_env_helpers:make_promp_env_helper',
|
|
||||||
# kwargs={
|
|
||||||
# "name": "alr_envs:TableTennis2DCtxt-v1",
|
|
||||||
# "wrappers": [mujoco.table_tennis.MPWrapper],
|
|
||||||
# "mp_kwargs": {
|
|
||||||
# "num_dof": 7,
|
|
||||||
# "num_basis": 2,
|
|
||||||
# "duration": 1.25,
|
|
||||||
# "post_traj_time": 4.5,
|
|
||||||
# #"width": 0.01,
|
|
||||||
# #"off": 0.01,
|
|
||||||
# "policy_type": "motor",
|
|
||||||
# "weights_scale": 1.0,
|
|
||||||
# "zero_start": True,
|
|
||||||
# "zero_goal": False,
|
|
||||||
# "policy_kwargs": {
|
|
||||||
# "p_gains": 0.5*np.array([1.0, 4.0, 2.0, 4.0, 1.0, 4.0, 1.0]),
|
|
||||||
# "d_gains": 0.5*np.array([0.1, 0.4, 0.2, 0.4, 0.1, 0.4, 0.1])
|
|
||||||
# }
|
|
||||||
# }
|
|
||||||
# }
|
|
||||||
# )
|
|
||||||
# ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append("TableTennisProMP-v2")
|
|
||||||
|
@ -7,21 +7,16 @@ from gym.envs.mujoco import MujocoEnv
|
|||||||
from alr_envs.alr.mujoco.beerpong.beerpong_reward_staged import BeerPongReward
|
from alr_envs.alr.mujoco.beerpong.beerpong_reward_staged import BeerPongReward
|
||||||
|
|
||||||
|
|
||||||
# CUP_POS_MIN = np.array([-0.32, -2.2])
|
CUP_POS_MIN = np.array([-0.32, -2.2])
|
||||||
# CUP_POS_MAX = np.array([0.32, -1.2])
|
CUP_POS_MAX = np.array([0.32, -1.2])
|
||||||
|
|
||||||
CUP_POS_MIN = np.array([-1.42, -4.05])
|
|
||||||
CUP_POS_MAX = np.array([1.42, -1.25])
|
|
||||||
|
|
||||||
|
|
||||||
class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
|
class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
|
||||||
def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False,
|
def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False,
|
||||||
rndm_goal=False, cup_goal_pos=None):
|
rndm_goal=False, cup_goal_pos=None):
|
||||||
if cup_goal_pos is None:
|
cup_goal_pos = np.array(cup_goal_pos if cup_goal_pos is not None else [-0.3, -1.2, 0.840])
|
||||||
cup_goal_pos = [-0.3, -1.2, 0.840]
|
if cup_goal_pos.shape[0]==2:
|
||||||
elif len(cup_goal_pos)==2:
|
cup_goal_pos = np.insert(cup_goal_pos, 2, 0.840)
|
||||||
cup_goal_pos = np.array(cup_goal_pos)
|
|
||||||
cup_goal_pos = np.insert(cup_goal_pos, 2, 0.80)
|
|
||||||
self.cup_goal_pos = np.array(cup_goal_pos)
|
self.cup_goal_pos = np.array(cup_goal_pos)
|
||||||
|
|
||||||
self._steps = 0
|
self._steps = 0
|
||||||
@ -52,7 +47,6 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
|
|||||||
else:
|
else:
|
||||||
self.noise_std = 0
|
self.noise_std = 0
|
||||||
|
|
||||||
|
|
||||||
reward_function = BeerPongReward
|
reward_function = BeerPongReward
|
||||||
self.reward_function = reward_function()
|
self.reward_function = reward_function()
|
||||||
|
|
||||||
@ -94,7 +88,7 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
|
|||||||
start_pos[7::] = self.sim.data.site_xpos[self.ball_site_id, :].copy()
|
start_pos[7::] = self.sim.data.site_xpos[self.ball_site_id, :].copy()
|
||||||
self.set_state(start_pos, init_vel)
|
self.set_state(start_pos, init_vel)
|
||||||
if self.rndm_goal:
|
if self.rndm_goal:
|
||||||
xy = np.random.uniform(CUP_POS_MIN, CUP_POS_MAX)
|
xy = self.np_random.uniform(CUP_POS_MIN, CUP_POS_MAX)
|
||||||
xyz = np.zeros(3)
|
xyz = np.zeros(3)
|
||||||
xyz[:2] = xy
|
xyz[:2] = xy
|
||||||
xyz[-1] = 0.840
|
xyz[-1] = 0.840
|
||||||
|
@ -54,6 +54,7 @@ class BeerPongReward:
|
|||||||
self.ball_table_contact = False
|
self.ball_table_contact = False
|
||||||
self.ball_wall_contact = False
|
self.ball_wall_contact = False
|
||||||
self.ball_cup_contact = False
|
self.ball_cup_contact = False
|
||||||
|
self.ball_in_cup = False
|
||||||
self.noisy_bp = noisy
|
self.noisy_bp = noisy
|
||||||
self._t_min_final_dist = -1
|
self._t_min_final_dist = -1
|
||||||
|
|
||||||
@ -80,39 +81,94 @@ class BeerPongReward:
|
|||||||
|
|
||||||
action_cost = np.sum(np.square(action))
|
action_cost = np.sum(np.square(action))
|
||||||
self.action_costs.append(action_cost)
|
self.action_costs.append(action_cost)
|
||||||
|
# ##################### Reward function which forces the ball to bounce once on the table (tanh) ###############
|
||||||
|
# if not self.ball_table_contact:
|
||||||
|
# self.ball_table_contact = self._check_collision_single_objects(env.sim, self.ball_collision_id,
|
||||||
|
# self.table_collision_id)
|
||||||
|
|
||||||
if not self.ball_table_contact:
|
# self._is_collided = self._check_collision_with_itself(env.sim, self.robot_collision_ids)
|
||||||
self.ball_table_contact = self._check_collision_single_objects(env.sim, self.ball_collision_id,
|
# if env._steps == env.ep_length - 1 or self._is_collided:
|
||||||
self.table_collision_id)
|
# min_dist = np.min(self.dists)
|
||||||
|
# final_dist = self.dists_final[-1]
|
||||||
|
#
|
||||||
|
# ball_in_cup = self._check_collision_single_objects(env.sim, self.ball_collision_id,
|
||||||
|
# self.cup_table_collision_id)
|
||||||
|
#
|
||||||
|
# # encourage bounce before falling into cup
|
||||||
|
# if not ball_in_cup:
|
||||||
|
# if not self.ball_table_contact:
|
||||||
|
# reward = 0.2 * (1 - np.tanh(0.5*min_dist)) + 0.1 * (1 - np.tanh(0.5*final_dist))
|
||||||
|
# else:
|
||||||
|
# reward = (1 - np.tanh(0.5*min_dist)) + 0.5 * (1 - np.tanh(0.5*final_dist))
|
||||||
|
# else:
|
||||||
|
# if not self.ball_table_contact:
|
||||||
|
# reward = 2 * (1 - np.tanh(0.5*final_dist)) + 1 * (1 - np.tanh(0.5*min_dist)) + 1
|
||||||
|
# else:
|
||||||
|
# reward = 2 * (1 - np.tanh(0.5*final_dist)) + 1 * (1 - np.tanh(0.5*min_dist)) + 3
|
||||||
|
#
|
||||||
|
# # reward = - 1 * cost - self.collision_penalty * int(self._is_collided)
|
||||||
|
# success = ball_in_cup
|
||||||
|
# crash = self._is_collided
|
||||||
|
# else:
|
||||||
|
# reward = - 1e-2 * action_cost
|
||||||
|
# success = False
|
||||||
|
# crash = False
|
||||||
|
# ################################################################################################################
|
||||||
|
|
||||||
|
# ##################### Reward function which does not force the ball to bounce once on the table (tanh) #######
|
||||||
|
# self._check_contacts(env.sim)
|
||||||
|
# self._is_collided = self._check_collision_with_itself(env.sim, self.robot_collision_ids)
|
||||||
|
# if env._steps == env.ep_length - 1 or self._is_collided:
|
||||||
|
# min_dist = np.min(self.dists)
|
||||||
|
# final_dist = self.dists_final[-1]
|
||||||
|
#
|
||||||
|
# # encourage bounce before falling into cup
|
||||||
|
# if not self.ball_in_cup:
|
||||||
|
# if not self.ball_table_contact and not self.ball_cup_contact and not self.ball_wall_contact:
|
||||||
|
# min_dist_coeff, final_dist_coeff, rew_offset = 0.2, 0.1, 0
|
||||||
|
# # reward = 0.2 * (1 - np.tanh(0.5*min_dist)) + 0.1 * (1 - np.tanh(0.5*final_dist))
|
||||||
|
# else:
|
||||||
|
# min_dist_coeff, final_dist_coeff, rew_offset = 1, 0.5, 0
|
||||||
|
# # reward = (1 - np.tanh(0.5*min_dist)) + 0.5 * (1 - np.tanh(0.5*final_dist))
|
||||||
|
# else:
|
||||||
|
# min_dist_coeff, final_dist_coeff, rew_offset = 1, 2, 3
|
||||||
|
# # reward = 2 * (1 - np.tanh(0.5*final_dist)) + 1 * (1 - np.tanh(0.5*min_dist)) + 3
|
||||||
|
#
|
||||||
|
# reward = final_dist_coeff * (1 - np.tanh(0.5 * final_dist)) + min_dist_coeff * (1 - np.tanh(0.5 * min_dist)) \
|
||||||
|
# + rew_offset
|
||||||
|
# success = self.ball_in_cup
|
||||||
|
# crash = self._is_collided
|
||||||
|
# else:
|
||||||
|
# reward = - 1e-2 * action_cost
|
||||||
|
# success = False
|
||||||
|
# crash = False
|
||||||
|
# ################################################################################################################
|
||||||
|
|
||||||
|
# ##################### Reward function which does not force the ball to bounce once on the table (quad dist) ###
|
||||||
|
self._check_contacts(env.sim)
|
||||||
self._is_collided = self._check_collision_with_itself(env.sim, self.robot_collision_ids)
|
self._is_collided = self._check_collision_with_itself(env.sim, self.robot_collision_ids)
|
||||||
if env._steps == env.ep_length - 1 or self._is_collided:
|
if env._steps == env.ep_length - 1 or self._is_collided:
|
||||||
|
|
||||||
min_dist = np.min(self.dists)
|
min_dist = np.min(self.dists)
|
||||||
final_dist = self.dists_final[-1]
|
final_dist = self.dists_final[-1]
|
||||||
|
|
||||||
ball_in_cup = self._check_collision_single_objects(env.sim, self.ball_collision_id,
|
|
||||||
self.cup_table_collision_id)
|
|
||||||
|
|
||||||
# encourage bounce before falling into cup
|
# encourage bounce before falling into cup
|
||||||
if not ball_in_cup:
|
if not self.ball_in_cup:
|
||||||
if not self.ball_table_contact:
|
if not self.ball_table_contact and not self.ball_cup_contact and not self.ball_wall_contact:
|
||||||
reward = 0.2 * (1 - np.tanh(0.5*min_dist)) + 0.1 * (1 - np.tanh(0.5*final_dist))
|
min_dist_coeff, final_dist_coeff, rew_offset = 1, 0.5, -4
|
||||||
else:
|
else:
|
||||||
reward = (1 - np.tanh(0.5*min_dist)) + 0.5 * (1 - np.tanh(0.5*final_dist))
|
min_dist_coeff, final_dist_coeff, rew_offset = 1, 0.5, -2
|
||||||
else:
|
else:
|
||||||
if not self.ball_table_contact:
|
min_dist_coeff, final_dist_coeff, rew_offset = 0, 1, 0
|
||||||
reward = 2 * (1 - np.tanh(0.5*final_dist)) + 1 * (1 - np.tanh(0.5*min_dist)) + 1
|
|
||||||
else:
|
|
||||||
reward = 2 * (1 - np.tanh(0.5*final_dist)) + 1 * (1 - np.tanh(0.5*min_dist)) + 3
|
|
||||||
|
|
||||||
# reward = - 1 * cost - self.collision_penalty * int(self._is_collided)
|
reward = rew_offset - min_dist_coeff * min_dist**2 - final_dist_coeff * final_dist**2 - \
|
||||||
success = ball_in_cup
|
1e-4*np.mean(action_cost)
|
||||||
|
success = self.ball_in_cup
|
||||||
crash = self._is_collided
|
crash = self._is_collided
|
||||||
else:
|
else:
|
||||||
reward = - 1e-2 * action_cost
|
reward = - 1e-2 * action_cost
|
||||||
success = False
|
success = False
|
||||||
crash = False
|
crash = False
|
||||||
|
# ################################################################################################################
|
||||||
|
|
||||||
infos = {}
|
infos = {}
|
||||||
infos["success"] = success
|
infos["success"] = success
|
||||||
@ -124,6 +180,20 @@ class BeerPongReward:
|
|||||||
|
|
||||||
return reward, infos
|
return reward, infos
|
||||||
|
|
||||||
|
def _check_contacts(self, sim):
|
||||||
|
if not self.ball_table_contact:
|
||||||
|
self.ball_table_contact = self._check_collision_single_objects(sim, self.ball_collision_id,
|
||||||
|
self.table_collision_id)
|
||||||
|
if not self.ball_cup_contact:
|
||||||
|
self.ball_cup_contact = self._check_collision_with_set_of_objects(sim, self.ball_collision_id,
|
||||||
|
self.cup_collision_ids)
|
||||||
|
if not self.ball_wall_contact:
|
||||||
|
self.ball_wall_contact = self._check_collision_single_objects(sim, self.ball_collision_id,
|
||||||
|
self.wall_collision_id)
|
||||||
|
if not self.ball_in_cup:
|
||||||
|
self.ball_in_cup = self._check_collision_single_objects(sim, self.ball_collision_id,
|
||||||
|
self.cup_table_collision_id)
|
||||||
|
|
||||||
def _check_collision_single_objects(self, sim, id_1, id_2):
|
def _check_collision_single_objects(self, sim, id_1, id_2):
|
||||||
for coni in range(0, sim.data.ncon):
|
for coni in range(0, sim.data.ncon):
|
||||||
con = sim.data.contact[coni]
|
con = sim.data.contact[coni]
|
||||||
|
Loading…
Reference in New Issue
Block a user