Started cleaning up init. DMP envs are still not transferred; wrappers for various environments are still missing.
parent 9ad6fbe712
commit 8fe6a83271
File diff suppressed because it is too large
@@ -178,8 +178,6 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
            [self._steps],
        ])

    def compute_reward(self):

    @property
    def dt(self):
        return super(ALRBeerBongEnv, self).dt * self.repeat_action
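
Note on the dt property above: gym's MujocoEnv.dt is the model timestep times frame_skip, so this override stretches the effective control timestep by repeat_action. A rough sketch of the arithmetic with purely illustrative numbers (the actual model timestep and repeat_action are not shown in this hunk and are assumptions here):

    # Illustrative only: real values come from the MuJoCo XML and the env setup.
    model_timestep = 0.002   # assumed MuJoCo option timestep
    frame_skip = 2           # assumed frame skip
    repeat_action = 2        # assumed; set elsewhere in ALRBeerBongEnv
    effective_dt = model_timestep * frame_skip * repeat_action  # what the dt property returns
    control_hz = 1.0 / effective_dt                             # 125 Hz with the numbers above
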
@@ -213,37 +211,37 @@ class ALRBeerBongEnvStepBasedEpisodicReward(ALRBeerBongEnv):
        return ob, reward, done, infos


class ALRBeerBongEnvStepBased(ALRBeerBongEnv):
    def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None):
        super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos)
        self.release_step = 62  # empirically evaluated for frame_skip=2!

    def step(self, a):
        if self._steps < self.release_step:
            return super(ALRBeerBongEnvStepBased, self).step(a)
        else:
            reward = 0
            done = False
            while not done:
                sub_ob, sub_reward, done, sub_infos = super(ALRBeerBongEnvStepBased, self).step(np.zeros(a.shape))
                if not done or sub_infos['sim_crash']:
                    reward += sub_reward
                else:
                    ball_pos = self.sim.data.body_xpos[self.sim.model._body_name2id["ball"]].copy()
                    cup_goal_dist_final = np.linalg.norm(ball_pos - self.sim.data.site_xpos[
                        self.sim.model._site_name2id["cup_goal_final_table"]].copy())
                    cup_goal_dist_top = np.linalg.norm(ball_pos - self.sim.data.site_xpos[
                        self.sim.model._site_name2id["cup_goal_table"]].copy())
                    if sub_infos['success']:
                        dist_rew = -cup_goal_dist_final ** 2
                    else:
                        dist_rew = -0.5 * cup_goal_dist_final ** 2 - cup_goal_dist_top ** 2
                    reward = reward - sub_infos['action_cost'] + dist_rew
            infos = sub_infos
            ob = sub_ob
            ob[-1] = self.release_step + 1  # Since we simulate until the end of the episode, PPO does not see the
            # internal steps and thus, the observation also needs to be set correctly
            return ob, reward, done, infos

# class ALRBeerBongEnvStepBased(ALRBeerBongEnv):
#     def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None):
#         super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos)
#         self.release_step = 62  # empirically evaluated for frame_skip=2!
#
#     def step(self, a):
#         if self._steps < self.release_step:
#             return super(ALRBeerBongEnvStepBased, self).step(a)
#         else:
#             reward = 0
#             done = False
#             while not done:
#                 sub_ob, sub_reward, done, sub_infos = super(ALRBeerBongEnvStepBased, self).step(np.zeros(a.shape))
#                 if not done or sub_infos['sim_crash']:
#                     reward += sub_reward
#                 else:
#                     ball_pos = self.sim.data.body_xpos[self.sim.model._body_name2id["ball"]].copy()
#                     cup_goal_dist_final = np.linalg.norm(ball_pos - self.sim.data.site_xpos[
#                         self.sim.model._site_name2id["cup_goal_final_table"]].copy())
#                     cup_goal_dist_top = np.linalg.norm(ball_pos - self.sim.data.site_xpos[
#                         self.sim.model._site_name2id["cup_goal_table"]].copy())
#                     if sub_infos['success']:
#                         dist_rew = -cup_goal_dist_final ** 2
#                     else:
#                         dist_rew = -0.5 * cup_goal_dist_final ** 2 - cup_goal_dist_top ** 2
#                     reward = reward - sub_infos['action_cost'] + dist_rew
#             infos = sub_infos
#             ob = sub_ob
#             ob[-1] = self.release_step + 1  # Since we simulate until the end of the episode, PPO does not see the
#             # internal steps and thus, the observation also needs to be set correctly
#             return ob, reward, done, infos

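For orientation, a minimal rollout sketch for the step-based variant above. This is hypothetical driver code (it could sit in the __main__ block below), assuming numpy is imported as np and that the action space comes from the parent ALRBeerBongEnv:

    # Hypothetical sketch: construct the env directly and roll out with random actions.
    env = ALRBeerBongEnvStepBased(frame_skip=2, rndm_goal=True)  # release_step=62 was tuned for frame_skip=2
    ob = env.reset()
    done = False
    while not done:
        a = env.action_space.sample()  # placeholder for a learned policy
        ob, reward, done, info = env.step(a)
        # Once self._steps reaches release_step, this single step() call simulates the
        # remainder of the episode with zero actions and returns the final shaped reward.
    print(info.get('success'), reward)
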
if __name__ == "__main__":
@@ -298,76 +298,6 @@ class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv):
        return observation, reward, done, info


class ALRHopperJumpRndmPosEnv(ALRHopperJumpEnv):
    def __init__(self, max_episode_steps=250):
        super(ALRHopperJumpRndmPosEnv, self).__init__(exclude_current_positions_from_observation=False,
                                                      reset_noise_scale=5e-1,
                                                      max_episode_steps=max_episode_steps)

    def reset_model(self):
        self._floor_geom_id = self.model.geom_name2id('floor')
        self._foot_geom_id = self.model.geom_name2id('foot_geom')
        noise_low = -np.ones(self.model.nq) * self._reset_noise_scale
        noise_low[1] = 0
        noise_low[2] = 0
        noise_low[3] = -0.2
        noise_low[4] = -0.2
        noise_low[5] = -0.1

        noise_high = np.ones(self.model.nq) * self._reset_noise_scale
        noise_high[1] = 0
        noise_high[2] = 0
        noise_high[3] = 0
        noise_high[4] = 0
        noise_high[5] = 0.1

        rnd_vec = self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nq)
        # rnd_vec[2] *= 0.05  # the angle around the y axis shouldn't be too high as the agent then falls down quickly and
        # can not recover
        # rnd_vec[1] = np.clip(rnd_vec[1], 0, 0.3)
        qpos = self.init_qpos + rnd_vec
        qvel = self.init_qvel

        self.set_state(qpos, qvel)

        observation = self._get_obs()
        return observation

    def step(self, action):

        self.current_step += 1
        self.do_simulation(action, self.frame_skip)

        self.contact_with_floor = self._contact_checker(self._floor_geom_id, self._foot_geom_id) if not \
            self.contact_with_floor else True

        height_after = self.get_body_com("torso")[2]
        self.max_height = max(height_after, self.max_height) if self.contact_with_floor else 0

        ctrl_cost = self.control_cost(action)
        costs = ctrl_cost
        done = False

        if self.current_step >= self.max_episode_steps:
            healthy_reward = 0
            height_reward = self._forward_reward_weight * self.max_height  # maybe move reward calculation into if structure and define two different _forward_reward_weight variables for context and episodic separately
            rewards = height_reward + healthy_reward

        else:
            # penalty for wrong start direction of first two joints; not needed, could be removed
            rewards = ((action[:2] > 0) * self.penalty).sum() if self.current_step < 10 else 0

        observation = self._get_obs()
        reward = rewards - costs
        info = {
            'height': height_after,
            'max_height': self.max_height,
            'goal': self.goal
        }

        return observation, reward, done, info

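Similarly, a minimal rollout sketch for the randomized-start hopper above. This is hypothetical driver code (the actual demo lives in the __main__ block below), assuming numpy is imported as np:

    # Hypothetical sketch: the height reward is only paid on the final step;
    # intermediate steps mostly contribute control costs and the early-step penalty.
    env = ALRHopperJumpRndmPosEnv(max_episode_steps=250)
    ob = env.reset()  # calls reset_model(), i.e. a randomized start within the bounds above
    ret = 0.0
    for _ in range(env.max_episode_steps):
        ob, reward, done, info = env.step(np.zeros(env.action_space.shape))
        ret += reward
    print(info['max_height'], ret)
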
if __name__ == '__main__':
    render_mode = "human"  # "human" or "partial" or "final"
    # env = ALRHopperJumpEnv()