smaller ctxt range hopper jump + step-based rew for bp

This commit is contained in:
Onur 2022-06-04 15:27:20 +02:00
parent 24604e60be
commit 8b8be4b582
4 changed files with 93 additions and 36 deletions

View File

@ -253,7 +253,7 @@ register(
kwargs={ kwargs={
"max_episode_steps": MAX_EPISODE_STEPS_HOPPERJUMP, "max_episode_steps": MAX_EPISODE_STEPS_HOPPERJUMP,
"context": False, "context": False,
"healthy_rewasrd": 1.0 "healthy_reward": 1.0
} }
) )
register( register(
@ -405,7 +405,7 @@ register(id='TableTennis2DCtxt-v1',
kwargs={'ctxt_dim': 2, 'fixed_goal': True}) kwargs={'ctxt_dim': 2, 'fixed_goal': True})
register(id='TableTennis4DCtxt-v0', register(id='TableTennis4DCtxt-v0',
entry_point='alr_envs.alr.mujoco:TTEnvGym', entry_point='alr_envs.alr.mujocco:TTEnvGym',
max_episode_steps=MAX_EPISODE_STEPS, max_episode_steps=MAX_EPISODE_STEPS,
kwargs={'ctxt_dim': 4}) kwargs={'ctxt_dim': 4})
@ -915,7 +915,7 @@ for _v in _versions:
"basis_generator_kwargs": { "basis_generator_kwargs": {
'basis_generator_type': 'zero_rbf', 'basis_generator_type': 'zero_rbf',
'num_basis': 5, 'num_basis': 5,
'num_basis_zero_start': 2 'num_basis_zero_start': 1
} }
} }
) )

View File

@ -187,35 +187,65 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
class ALRBeerBongEnvStepBased(ALRBeerBongEnv): class ALRBeerBongEnvStepBased(ALRBeerBongEnv):
def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None):
super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos)
self.release_step = 62 # empirically evaluated for frame_skip=2!
def _set_action_space(self): # def _set_action_space(self):
bounds = super(ALRBeerBongEnvStepBased, self)._set_action_space() # bounds = super(ALRBeerBongEnvStepBased, self)._set_action_space()
min_bound = np.concatenate(([-1], bounds.low), dtype=bounds.dtype) # min_bound = np.concatenate(([-1], bounds.low), dtype=bounds.dtype)
max_bound = np.concatenate(([1], bounds.high), dtype=bounds.dtype) # max_bound = np.concatenate(([1], bounds.high), dtype=bounds.dtype)
self.action_space = spaces.Box(low=min_bound, high=max_bound, dtype=bounds.dtype) # self.action_space = spaces.Box(low=min_bound, high=max_bound, dtype=bounds.dtype)
return self.action_space # return self.action_space
# def step(self, a):
# self.release_step = self._steps if a[0]>=0 and self.release_step >= self._steps else self.release_step
# return super(ALRBeerBongEnvStepBased, self).step(a[1:])
#
# def reset(self):
# ob = super(ALRBeerBongEnvStepBased, self).reset()
# self.release_step = self.ep_length + 1
# return ob
def step(self, a): def step(self, a):
self.release_step = self._steps if a[0]>=0 and self.release_step >= self._steps else self.release_step if self._steps < self.release_step:
return super(ALRBeerBongEnvStepBased, self).step(a[1:]) return super(ALRBeerBongEnvStepBased, self).step(a)
else:
reward = 0
done = False
while not done:
sub_ob, sub_reward, done, sub_infos = super(ALRBeerBongEnvStepBased, self).step(np.zeros(a.shape))
if not done or sub_infos['sim_crash']:
reward += sub_reward
else:
ball_pos = self.sim.data.body_xpos[self.sim.model._body_name2id["ball"]].copy()
cup_goal_dist_final = np.linalg.norm(ball_pos - self.sim.data.site_xpos[
self.sim.model._site_name2id["cup_goal_final_table"]].copy())
cup_goal_dist_top = np.linalg.norm(ball_pos - self.sim.data.site_xpos[
self.sim.model._site_name2id["cup_goal_table"]].copy())
if sub_infos['success']:
dist_rew = -cup_goal_dist_final**2
else:
dist_rew = -0.5*cup_goal_dist_final**2 - cup_goal_dist_top**2
reward = reward - sub_infos['action_cost'] + dist_rew
infos = sub_infos
ob = sub_ob
return ob, reward, done, infos
def reset(self):
ob = super(ALRBeerBongEnvStepBased, self).reset()
self.release_step = self.ep_length + 1
return ob
if __name__ == "__main__": if __name__ == "__main__":
# env = ALRBeerBongEnv(rndm_goal=True) # env = ALRBeerBongEnv(rndm_goal=True)
env = ALRBeerBongEnvStepBased(rndm_goal=True) env = ALRBeerBongEnvStepBased(frame_skip=2, rndm_goal=True)
import time import time
env.reset() env.reset()
env.render("human") env.render("human")
for i in range(1500): for i in range(1500):
# ac = 10 * env.action_space.sample()[0:7] ac = 10 * env.action_space.sample()
ac = np.zeros(8) # ac = np.zeros(7)
ac[0] = -1 # ac[0] = -1
if env._steps > 150: # if env._steps > 150:
ac[0] = 1 # ac[0] = 1
obs, rew, d, info = env.step(ac) obs, rew, d, info = env.step(ac)
env.render("human") env.render("human")
print(env.dt) print(env.dt)

View File

@ -90,7 +90,7 @@ class BeerPongReward:
self.dist_ground_cup = np.linalg.norm(ball_pos-goal_pos) \ self.dist_ground_cup = np.linalg.norm(ball_pos-goal_pos) \
if self.ball_ground_contact_first and self.dist_ground_cup == -1 else self.dist_ground_cup if self.ball_ground_contact_first and self.dist_ground_cup == -1 else self.dist_ground_cup
action_cost = np.sum(np.square(action)) action_cost = np.sum(np.square(action))
self.action_costs.append(action_cost) self.action_costs.append(np.copy(action_cost))
# # ##################### Reward function which does not force to bounce once on the table (quad dist) ############ # # ##################### Reward function which does not force to bounce once on the table (quad dist) ############
self._is_collided = self._check_collision_with_itself(env.sim, self.robot_collision_ids) self._is_collided = self._check_collision_with_itself(env.sim, self.robot_collision_ids)
@ -110,8 +110,9 @@ class BeerPongReward:
else: else:
min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 0, 1, 0 ,0 min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 0, 1, 0 ,0
# dist_ground_cup = 1 * self.dist_ground_cup # dist_ground_cup = 1 * self.dist_ground_cup
action_cost = 1e-4 * np.mean(action_cost)
reward = rew_offset - min_dist_coeff * min_dist ** 2 - final_dist_coeff * final_dist ** 2 - \ reward = rew_offset - min_dist_coeff * min_dist ** 2 - final_dist_coeff * final_dist ** 2 - \
1e-4 * np.mean(action_cost) - ground_contact_dist_coeff*self.dist_ground_cup ** 2 action_cost - ground_contact_dist_coeff*self.dist_ground_cup ** 2
# 1e-7*np.mean(action_cost) # 1e-7*np.mean(action_cost)
# release step punishment # release step punishment
min_time_bound = 0.1 min_time_bound = 0.1
@ -124,7 +125,8 @@ class BeerPongReward:
# print('release time :', release_time) # print('release time :', release_time)
# print('dist_ground_cup :', dist_ground_cup) # print('dist_ground_cup :', dist_ground_cup)
else: else:
reward = - 1e-2 * action_cost action_cost = 1e-2 * action_cost
reward = - action_cost
# reward = - 1e-4 * action_cost # reward = - 1e-4 * action_cost
# reward = 0 # reward = 0
success = False success = False

View File

@ -80,7 +80,8 @@ class ALRHopperJumpEnv(HopperEnv):
'x_pos': site_pos_after, 'x_pos': site_pos_after,
'max_height': self.max_height, 'max_height': self.max_height,
'height_rew': self.max_height, 'height_rew': self.max_height,
'healthy_reward': self.healthy_reward * 2 'healthy_reward': self.healthy_reward * 2,
'healthy': self.is_healthy
} }
return observation, reward, done, info return observation, reward, done, info
@ -171,7 +172,8 @@ class ALRHopperXYJumpEnv(ALRHopperJumpEnv):
'goal': self.goal, 'goal': self.goal,
'goal_dist': goal_dist, 'goal_dist': goal_dist,
'height_rew': self.max_height, 'height_rew': self.max_height,
'healthy_reward': self.healthy_reward * 2 'healthy_reward': self.healthy_reward * 2,
'healthy': self.is_healthy
} }
return observation, reward, done, info return observation, reward, done, info
@ -207,13 +209,38 @@ class ALRHopperXYJumpEnv(ALRHopperJumpEnv):
super().reset() super().reset()
# self.goal = np.random.uniform(-1.5, 1.5, 1) # self.goal = np.random.uniform(-1.5, 1.5, 1)
# self.goal = np.random.uniform(0, 1.5, 1) # self.goal = np.random.uniform(0, 1.5, 1)
self.goal = self.np_random.uniform(0, 1.5, 1) # self.goal = self.np_random.uniform(0, 1.5, 1)
# self.goal = np.array([1.5]) self.goal = self.np_random.uniform(0.3, 1.35, 1)
self.sim.model.body_pos[self.sim.model.body_name2id('goal_site_body')] = np.array([self.goal, 0, 0], dtype=object) self.sim.model.body_pos[self.sim.model.body_name2id('goal_site_body')] = np.array([self.goal, 0, 0], dtype=object)
return self.reset_model() return self.reset_model()
class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv): class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv):
def __init__(self,
xml_file='hopper_jump.xml',
forward_reward_weight=1.0,
ctrl_cost_weight=1e-3,
healthy_reward=0.0,
penalty=0.0,
context=True,
terminate_when_unhealthy=False,
healthy_state_range=(-100.0, 100.0),
healthy_z_range=(0.5, float('inf')),
healthy_angle_range=(-float('inf'), float('inf')),
reset_noise_scale=5e-3,
exclude_current_positions_from_observation=True,
max_episode_steps=250,
height_scale = 10,
dist_scale = 3,
healthy_scale = 2):
self.height_scale = height_scale
self.dist_scale = dist_scale
self.healthy_scale = healthy_scale
super().__init__(xml_file, forward_reward_weight, ctrl_cost_weight, healthy_reward, penalty, context,
terminate_when_unhealthy, healthy_state_range, healthy_z_range, healthy_angle_range,
reset_noise_scale, exclude_current_positions_from_observation, max_episode_steps)
def step(self, action): def step(self, action):
self._floor_geom_id = self.model.geom_name2id('floor') self._floor_geom_id = self.model.geom_name2id('floor')
@ -226,10 +253,10 @@ class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv):
self.max_height = max(height_after, self.max_height) self.max_height = max(height_after, self.max_height)
ctrl_cost = self.control_cost(action) ctrl_cost = self.control_cost(action)
healthy_reward = self.healthy_reward * 2 healthy_reward = self.healthy_reward * self.healthy_scale
height_reward = 10*height_after height_reward = self.height_scale*height_after
goal_dist = np.atleast_1d(np.linalg.norm(site_pos_after - np.array([self.goal, 0, 0], dtype=object)))[0] goal_dist = np.atleast_1d(np.linalg.norm(site_pos_after - np.array([self.goal, 0, 0], dtype=object)))[0]
goal_dist_reward = -3*goal_dist goal_dist_reward = -self.dist_scale*goal_dist
dist_reward = self._forward_reward_weight * (goal_dist_reward + height_reward) dist_reward = self._forward_reward_weight * (goal_dist_reward + height_reward)
reward = -ctrl_cost + healthy_reward + dist_reward reward = -ctrl_cost + healthy_reward + dist_reward
done = False done = False
@ -240,12 +267,10 @@ class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv):
'max_height': self.max_height, 'max_height': self.max_height,
'goal': self.goal, 'goal': self.goal,
'goal_dist': goal_dist, 'goal_dist': goal_dist,
'dist_rew': dist_reward, 'height_rew': self.max_height,
'height_rew': height_reward, 'healthy_reward': self.healthy_reward * 2,
'healthy_reward': healthy_reward, 'healthy': self.is_healthy
'ctrl_reward': -ctrl_cost
} }
return observation, reward, done, info return observation, reward, done, info
class ALRHopperJumpRndmPosEnv(ALRHopperJumpEnv): class ALRHopperJumpRndmPosEnv(ALRHopperJumpEnv):