Started cleaning up init. DMP envs are still not transferred. Wrappers for various environments are still missing.

This commit is contained in:
Onur 2022-06-28 20:33:19 +02:00
parent 9ad6fbe712
commit 8fe6a83271
3 changed files with 418 additions and 920 deletions
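As context for the commit message above, which notes that DMP envs and wrappers for several environments are still missing: below is a minimal, hypothetical sketch of the kind of movement-primitive wrapper these environments would eventually need. The class name, base class, joint indices, and context choice are assumptions for illustration only, not this repository's actual interface.

import gym
import numpy as np

class BeerPongMPWrapper(gym.Wrapper):
    """Hypothetical wrapper exposing the quantities a DMP/ProMP controller needs."""

    @property
    def current_pos(self) -> np.ndarray:
        # joint positions of the actuated robot joints (first 7 DoF assumed)
        return self.env.sim.data.qpos[:7].copy()

    @property
    def current_vel(self) -> np.ndarray:
        # joint velocities of the actuated robot joints (first 7 DoF assumed)
        return self.env.sim.data.qvel[:7].copy()

    @property
    def context_mask(self) -> np.ndarray:
        # marks which observation entries form the episode context; here the last
        # three entries (e.g. a goal position) are assumed to be the context
        mask = np.zeros(self.env.observation_space.shape[0], dtype=bool)
        mask[-3:] = True
        return mask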

File diff suppressed because it is too large


@@ -178,8 +178,6 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
            [self._steps],
        ])
    def compute_reward(self):
    @property
    def dt(self):
        return super(ALRBeerBongEnv, self).dt * self.repeat_action
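The dt override above stacks the action repetition on top of MuJoCo's frame skip. A small, illustrative check of the resulting control timestep; the model timestep below is an assumed value, not read from the XML model.

# MujocoEnv.dt is model timestep * frame_skip; repeating each action multiplies it again.
model_timestep = 0.002   # assumed MuJoCo option timestep
frame_skip = 2
repeat_action = 2
effective_dt = model_timestep * frame_skip * repeat_action
print(effective_dt)  # 0.008 s per call to env.step()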
@@ -213,37 +211,37 @@ class ALRBeerBongEnvStepBasedEpisodicReward(ALRBeerBongEnv):
        return ob, reward, done, infos
class ALRBeerBongEnvStepBased(ALRBeerBongEnv):
    def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None):
        super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos)
        self.release_step = 62  # empirically evaluated for frame_skip=2!
    def step(self, a):
        if self._steps < self.release_step:
            return super(ALRBeerBongEnvStepBased, self).step(a)
        else:
            reward = 0
            done = False
            while not done:
                sub_ob, sub_reward, done, sub_infos = super(ALRBeerBongEnvStepBased, self).step(np.zeros(a.shape))
                if not done or sub_infos['sim_crash']:
                    reward += sub_reward
                else:
                    ball_pos = self.sim.data.body_xpos[self.sim.model._body_name2id["ball"]].copy()
                    cup_goal_dist_final = np.linalg.norm(ball_pos - self.sim.data.site_xpos[
                        self.sim.model._site_name2id["cup_goal_final_table"]].copy())
                    cup_goal_dist_top = np.linalg.norm(ball_pos - self.sim.data.site_xpos[
                        self.sim.model._site_name2id["cup_goal_table"]].copy())
                    if sub_infos['success']:
                        dist_rew = -cup_goal_dist_final ** 2
                    else:
                        dist_rew = -0.5 * cup_goal_dist_final ** 2 - cup_goal_dist_top ** 2
                    reward = reward - sub_infos['action_cost'] + dist_rew
            infos = sub_infos
            ob = sub_ob
            ob[-1] = self.release_step + 1  # Since we simulate until the end of the episode, PPO does not see the
            # internal steps and thus, the observation also needs to be set correctly
        return ob, reward, done, infos
# class ALRBeerBongEnvStepBased(ALRBeerBongEnv):
#     def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None):
#         super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos)
#         self.release_step = 62  # empirically evaluated for frame_skip=2!
#
#     def step(self, a):
#         if self._steps < self.release_step:
#             return super(ALRBeerBongEnvStepBased, self).step(a)
#         else:
#             reward = 0
#             done = False
#             while not done:
#                 sub_ob, sub_reward, done, sub_infos = super(ALRBeerBongEnvStepBased, self).step(np.zeros(a.shape))
#                 if not done or sub_infos['sim_crash']:
#                     reward += sub_reward
#                 else:
#                     ball_pos = self.sim.data.body_xpos[self.sim.model._body_name2id["ball"]].copy()
#                     cup_goal_dist_final = np.linalg.norm(ball_pos - self.sim.data.site_xpos[
#                         self.sim.model._site_name2id["cup_goal_final_table"]].copy())
#                     cup_goal_dist_top = np.linalg.norm(ball_pos - self.sim.data.site_xpos[
#                         self.sim.model._site_name2id["cup_goal_table"]].copy())
#                     if sub_infos['success']:
#                         dist_rew = -cup_goal_dist_final ** 2
#                     else:
#                         dist_rew = -0.5 * cup_goal_dist_final ** 2 - cup_goal_dist_top ** 2
#                     reward = reward - sub_infos['action_cost'] + dist_rew
#             infos = sub_infos
#             ob = sub_ob
#             ob[-1] = self.release_step + 1  # Since we simulate until the end of the episode, PPO does not see the
#             # internal steps and thus, the observation also needs to be set correctly
#         return ob, reward, done, infos
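For orientation, a hedged sketch of how the step-based beer pong environment above is meant to be driven: up to release_step the agent acts normally, and the first step() call at or after the release internally simulates the remainder of the episode with zero actions and returns the final, distance-based reward. The import path and the random placeholder policy are assumptions for illustration.

import numpy as np
from alr_envs.alr.mujoco.beerpong.beerpong import ALRBeerBongEnvStepBased  # path assumed

env = ALRBeerBongEnvStepBased(frame_skip=2, rndm_goal=True)
ob = env.reset()
done = False
while not done:
    action = env.action_space.sample()  # placeholder policy
    ob, reward, done, info = env.step(action)
    # once the internal step counter reaches release_step, this single call rolls
    # the simulation to the end of the episode and reports the final reward
print(info.get('success'), reward)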
if __name__ == "__main__":


@@ -298,76 +298,6 @@ class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv):
        return observation, reward, done, info
class ALRHopperJumpRndmPosEnv(ALRHopperJumpEnv):
    def __init__(self, max_episode_steps=250):
        super(ALRHopperJumpRndmPosEnv, self).__init__(exclude_current_positions_from_observation=False,
                                                      reset_noise_scale=5e-1,
                                                      max_episode_steps=max_episode_steps)
    def reset_model(self):
        self._floor_geom_id = self.model.geom_name2id('floor')
        self._foot_geom_id = self.model.geom_name2id('foot_geom')
        noise_low = -np.ones(self.model.nq) * self._reset_noise_scale
        noise_low[1] = 0
        noise_low[2] = 0
        noise_low[3] = -0.2
        noise_low[4] = -0.2
        noise_low[5] = -0.1
        noise_high = np.ones(self.model.nq) * self._reset_noise_scale
        noise_high[1] = 0
        noise_high[2] = 0
        noise_high[3] = 0
        noise_high[4] = 0
        noise_high[5] = 0.1
        rnd_vec = self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nq)
        # rnd_vec[2] *= 0.05  # the angle around the y axis shouldn't be too high as the agent then falls down quickly and
        # can not recover
        # rnd_vec[1] = np.clip(rnd_vec[1], 0, 0.3)
        qpos = self.init_qpos + rnd_vec
        qvel = self.init_qvel
        self.set_state(qpos, qvel)
        observation = self._get_obs()
        return observation
    def step(self, action):
        self.current_step += 1
        self.do_simulation(action, self.frame_skip)
        self.contact_with_floor = self._contact_checker(self._floor_geom_id, self._foot_geom_id) if not \
            self.contact_with_floor else True
        height_after = self.get_body_com("torso")[2]
        self.max_height = max(height_after, self.max_height) if self.contact_with_floor else 0
        ctrl_cost = self.control_cost(action)
        costs = ctrl_cost
        done = False
        if self.current_step >= self.max_episode_steps:
            healthy_reward = 0
            height_reward = self._forward_reward_weight * self.max_height  # maybe move the reward calculation into the
            # if-structure and define two separate _forward_reward_weight variables for the contextual and the
            # episodic case (sketched below)
            rewards = height_reward + healthy_reward
        else:
            # penalty for a wrong start direction of the first two joints; not needed, could be removed
            rewards = ((action[:2] > 0) * self.penalty).sum() if self.current_step < 10 else 0
        observation = self._get_obs()
        reward = rewards - costs
        info = {
            'height': height_after,
            'max_height': self.max_height,
            'goal': self.goal
        }
        return observation, reward, done, info
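The step() above pays the height reward exactly once, when the final episode step is reached; before that, only a small start-direction penalty and the control cost apply. A standalone, hypothetical sketch of the refactor suggested in the code comment, with separate weights for the episodic and the contextual term; all names and default values are assumptions, not taken from the diff.

import numpy as np

def hopper_jump_reward(step, max_episode_steps, max_height, action, ctrl_cost,
                       penalty=0.0, episodic_weight=6.0, contextual_weight=1.0):
    if step >= max_episode_steps:
        rewards = episodic_weight * max_height  # paid out once, at episode end
    elif step < 10:
        # penalty for a wrong start direction of the first two joints
        rewards = contextual_weight * ((np.asarray(action)[:2] > 0) * penalty).sum()
    else:
        rewards = 0.0
    return rewards - ctrl_cost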
if __name__ == '__main__':
    render_mode = "human"  # "human" or "partial" or "final"
    # env = ALRHopperJumpEnv()