smaller ctxt range hopper jump + step-based rew for bp
parent 24604e60be
commit 8b8be4b582
@@ -253,7 +253,7 @@ register(
     kwargs={
         "max_episode_steps": MAX_EPISODE_STEPS_HOPPERJUMP,
         "context": False,
-        "healthy_rewasrd": 1.0
+        "healthy_reward": 1.0
     }
 )
 register(
@@ -405,7 +405,7 @@ register(id='TableTennis2DCtxt-v1',
          kwargs={'ctxt_dim': 2, 'fixed_goal': True})

 register(id='TableTennis4DCtxt-v0',
-         entry_point='alr_envs.alr.mujoco:TTEnvGym',
+         entry_point='alr_envs.alr.mujocco:TTEnvGym',
          max_episode_steps=MAX_EPISODE_STEPS,
          kwargs={'ctxt_dim': 4})

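For context, entries registered this way are created through Gym's registry; a minimal usage sketch (not part of this commit, assuming the register() calls above run when the package is imported, as is usual for this kind of registry module):

import gym
import alr_envs  # importing the package is assumed to run the register() calls above

env = gym.make('TableTennis4DCtxt-v0')
obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())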
@@ -915,7 +915,7 @@ for _v in _versions:
            "basis_generator_kwargs": {
                'basis_generator_type': 'zero_rbf',
                'num_basis': 5,
-               'num_basis_zero_start': 2
+               'num_basis_zero_start': 1
            }
        }
    )
@@ -187,35 +187,65 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):


 class ALRBeerBongEnvStepBased(ALRBeerBongEnv):
     def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None):
         super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos)
         self.release_step = 62  # empirically evaluated for frame_skip=2!

-    def _set_action_space(self):
-        bounds = super(ALRBeerBongEnvStepBased, self)._set_action_space()
-        min_bound = np.concatenate(([-1], bounds.low), dtype=bounds.dtype)
-        max_bound = np.concatenate(([1], bounds.high), dtype=bounds.dtype)
-        self.action_space = spaces.Box(low=min_bound, high=max_bound, dtype=bounds.dtype)
-        return self.action_space
+    # def _set_action_space(self):
+    #     bounds = super(ALRBeerBongEnvStepBased, self)._set_action_space()
+    #     min_bound = np.concatenate(([-1], bounds.low), dtype=bounds.dtype)
+    #     max_bound = np.concatenate(([1], bounds.high), dtype=bounds.dtype)
+    #     self.action_space = spaces.Box(low=min_bound, high=max_bound, dtype=bounds.dtype)
+    #     return self.action_space

+    # def step(self, a):
+    #     self.release_step = self._steps if a[0]>=0 and self.release_step >= self._steps else self.release_step
+    #     return super(ALRBeerBongEnvStepBased, self).step(a[1:])
+    #
+    # def reset(self):
+    #     ob = super(ALRBeerBongEnvStepBased, self).reset()
+    #     self.release_step = self.ep_length + 1
+    #     return ob

     def step(self, a):
-        self.release_step = self._steps if a[0]>=0 and self.release_step >= self._steps else self.release_step
-        return super(ALRBeerBongEnvStepBased, self).step(a[1:])
+        if self._steps < self.release_step:
+            return super(ALRBeerBongEnvStepBased, self).step(a)
+        else:
+            reward = 0
+            done = False
+            while not done:
+                sub_ob, sub_reward, done, sub_infos = super(ALRBeerBongEnvStepBased, self).step(np.zeros(a.shape))
+                if not done or sub_infos['sim_crash']:
+                    reward += sub_reward
+                else:
+                    ball_pos = self.sim.data.body_xpos[self.sim.model._body_name2id["ball"]].copy()
+                    cup_goal_dist_final = np.linalg.norm(ball_pos - self.sim.data.site_xpos[
+                        self.sim.model._site_name2id["cup_goal_final_table"]].copy())
+                    cup_goal_dist_top = np.linalg.norm(ball_pos - self.sim.data.site_xpos[
+                        self.sim.model._site_name2id["cup_goal_table"]].copy())
+                    if sub_infos['success']:
+                        dist_rew = -cup_goal_dist_final**2
+                    else:
+                        dist_rew = -0.5*cup_goal_dist_final**2 - cup_goal_dist_top**2
+                    reward = reward - sub_infos['action_cost'] + dist_rew
+            infos = sub_infos
+            ob = sub_ob
+            return ob, reward, done, infos


-    def reset(self):
-        ob = super(ALRBeerBongEnvStepBased, self).reset()
-        self.release_step = self.ep_length + 1
-        return ob

 if __name__ == "__main__":
     # env = ALRBeerBongEnv(rndm_goal=True)
-    env = ALRBeerBongEnvStepBased(rndm_goal=True)
+    env = ALRBeerBongEnvStepBased(frame_skip=2, rndm_goal=True)
     import time
     env.reset()
     env.render("human")
     for i in range(1500):
         # ac = 10 * env.action_space.sample()[0:7]
-        ac = np.zeros(8)
-        ac[0] = -1
-        if env._steps > 150:
-            ac[0] = 1
+        ac = 10 * env.action_space.sample()
+        # ac = np.zeros(7)
+        # ac[0] = -1
+        # if env._steps > 150:
+        #     ac[0] = 1
         obs, rew, d, info = env.step(ac)
         env.render("human")
         print(env.dt)
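A brief reading of the new step() (illustration only, not part of the commit): before self.release_step the environment behaves like ALRBeerBongEnv; from the release step on, a single step() call rolls the simulation forward with zero actions until the episode ends and returns one aggregated reward built from the ball-to-cup distances. A minimal driver sketch under that assumption:

env = ALRBeerBongEnvStepBased(frame_skip=2, rndm_goal=True)
obs = env.reset()
done = False
total_reward = 0.0
while not done:
    ac = env.action_space.sample()           # joint action (7-dimensional here, since the release-flag action-space override is commented out)
    obs, reward, done, info = env.step(ac)   # the post-release call loops internally until done
    total_reward += reward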
@@ -90,7 +90,7 @@ class BeerPongReward:
         self.dist_ground_cup = np.linalg.norm(ball_pos-goal_pos) \
             if self.ball_ground_contact_first and self.dist_ground_cup == -1 else self.dist_ground_cup
         action_cost = np.sum(np.square(action))
-        self.action_costs.append(action_cost)
+        self.action_costs.append(np.copy(action_cost))
         # # ##################### Reward function which does not force to bounce once on the table (quad dist) ############

         self._is_collided = self._check_collision_with_itself(env.sim, self.robot_collision_ids)
@@ -110,8 +110,9 @@ class BeerPongReward:
         else:
             min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 0, 1, 0 ,0
             # dist_ground_cup = 1 * self.dist_ground_cup
+            action_cost = 1e-4 * np.mean(action_cost)
             reward = rew_offset - min_dist_coeff * min_dist ** 2 - final_dist_coeff * final_dist ** 2 - \
-                     1e-4 * np.mean(action_cost) - ground_contact_dist_coeff*self.dist_ground_cup ** 2
+                     action_cost - ground_contact_dist_coeff*self.dist_ground_cup ** 2
             # 1e-7*np.mean(action_cost)
             # release step punishment
             min_time_bound = 0.1
@@ -124,7 +125,8 @@ class BeerPongReward:
             # print('release time :', release_time)
             # print('dist_ground_cup :', dist_ground_cup)
         else:
-            reward = - 1e-2 * action_cost
+            action_cost = 1e-2 * action_cost
+            reward = - action_cost
             # reward = - 1e-4 * action_cost
             # reward = 0
             success = False
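Both BeerPongReward hunks are numerically neutral: the scaling factor (1e-4 in the episode-end branch, 1e-2 otherwise) is now folded into action_cost before the reward is assembled, so, as a worked check, with np.mean(action_cost) == 2.0 the action term is still -1e-4 * 2.0 = -2e-4. A plausible (but unstated) motivation is that the scaled value is then available in action_cost for downstream reporting.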
@@ -80,7 +80,8 @@ class ALRHopperJumpEnv(HopperEnv):
             'x_pos': site_pos_after,
             'max_height': self.max_height,
             'height_rew': self.max_height,
-            'healthy_reward': self.healthy_reward * 2
+            'healthy_reward': self.healthy_reward * 2,
+            'healthy': self.is_healthy
         }

         return observation, reward, done, info
@@ -171,7 +172,8 @@ class ALRHopperXYJumpEnv(ALRHopperJumpEnv):
             'goal': self.goal,
             'goal_dist': goal_dist,
             'height_rew': self.max_height,
-            'healthy_reward': self.healthy_reward * 2
+            'healthy_reward': self.healthy_reward * 2,
+            'healthy': self.is_healthy
         }
         return observation, reward, done, info

@@ -207,13 +209,38 @@ class ALRHopperXYJumpEnv(ALRHopperJumpEnv):
         super().reset()
         # self.goal = np.random.uniform(-1.5, 1.5, 1)
         # self.goal = np.random.uniform(0, 1.5, 1)
-        self.goal = self.np_random.uniform(0, 1.5, 1)
-        # self.goal = np.array([1.5])
+        # self.goal = self.np_random.uniform(0, 1.5, 1)
+        self.goal = self.np_random.uniform(0.3, 1.35, 1)
         self.sim.model.body_pos[self.sim.model.body_name2id('goal_site_body')] = np.array([self.goal, 0, 0], dtype=object)
         return self.reset_model()


 class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv):

+    def __init__(self,
+                 xml_file='hopper_jump.xml',
+                 forward_reward_weight=1.0,
+                 ctrl_cost_weight=1e-3,
+                 healthy_reward=0.0,
+                 penalty=0.0,
+                 context=True,
+                 terminate_when_unhealthy=False,
+                 healthy_state_range=(-100.0, 100.0),
+                 healthy_z_range=(0.5, float('inf')),
+                 healthy_angle_range=(-float('inf'), float('inf')),
+                 reset_noise_scale=5e-3,
+                 exclude_current_positions_from_observation=True,
+                 max_episode_steps=250,
+                 height_scale = 10,
+                 dist_scale = 3,
+                 healthy_scale = 2):
+        self.height_scale = height_scale
+        self.dist_scale = dist_scale
+        self.healthy_scale = healthy_scale
+        super().__init__(xml_file, forward_reward_weight, ctrl_cost_weight, healthy_reward, penalty, context,
+                         terminate_when_unhealthy, healthy_state_range, healthy_z_range, healthy_angle_range,
+                         reset_noise_scale, exclude_current_positions_from_observation, max_episode_steps)

     def step(self, action):

         self._floor_geom_id = self.model.geom_name2id('floor')
@@ -226,10 +253,10 @@ class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv):
         self.max_height = max(height_after, self.max_height)
         ctrl_cost = self.control_cost(action)

-        healthy_reward = self.healthy_reward * 2
-        height_reward = 10*height_after
+        healthy_reward = self.healthy_reward * self.healthy_scale
+        height_reward = self.height_scale*height_after
         goal_dist = np.atleast_1d(np.linalg.norm(site_pos_after - np.array([self.goal, 0, 0], dtype=object)))[0]
-        goal_dist_reward = -3*goal_dist
+        goal_dist_reward = -self.dist_scale*goal_dist
         dist_reward = self._forward_reward_weight * (goal_dist_reward + height_reward)
         reward = -ctrl_cost + healthy_reward + dist_reward
         done = False
@@ -240,12 +267,10 @@ class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv):
             'max_height': self.max_height,
             'goal': self.goal,
             'goal_dist': goal_dist,
-            'dist_rew': dist_reward,
-            'height_rew': height_reward,
-            'healthy_reward': healthy_reward,
-            'ctrl_reward': -ctrl_cost
+            'height_rew': self.max_height,
+            'healthy_reward': self.healthy_reward * 2,
+            'healthy': self.is_healthy
         }

         return observation, reward, done, info


 class ALRHopperJumpRndmPosEnv(ALRHopperJumpEnv):
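For reference (an illustration, not part of the diff; the helper name is made up), the per-step reward that ALRHopperXYJumpEnvStepBased now builds from its new scale parameters combines a control cost, a scaled healthy bonus, and a distance term from jump height and distance to the goal site. A minimal sketch of that formula with the new default scales:

def step_based_hopper_reward(ctrl_cost, base_healthy_reward, height_after, goal_dist,
                             forward_reward_weight=1.0, height_scale=10, dist_scale=3, healthy_scale=2):
    # Mirrors the reward assembled in ALRHopperXYJumpEnvStepBased.step()
    healthy_reward = base_healthy_reward * healthy_scale
    height_reward = height_scale * height_after
    goal_dist_reward = -dist_scale * goal_dist
    dist_reward = forward_reward_weight * (goal_dist_reward + height_reward)
    return -ctrl_cost + healthy_reward + dist_reward

# Example: a 0.8 m jump height at 0.4 m from the goal with zero healthy bonus
# gives 1.0 * (-3 * 0.4 + 10 * 0.8) - ctrl_cost = 6.8 - ctrl_cost.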