From 9fce6fff42f0c6f3dbe5c471e5ee3881b63fe7a4 Mon Sep 17 00:00:00 2001
From: Dominik Roth
Date: Sun, 28 Jan 2024 12:32:52 +0100
Subject: [PATCH] Ported new HopperJump Rew to master

---
 .../envs/mujoco/hopper_jump/hopper_jump.py | 169 ++++++++++--------
 1 file changed, 96 insertions(+), 73 deletions(-)

diff --git a/fancy_gym/envs/mujoco/hopper_jump/hopper_jump.py b/fancy_gym/envs/mujoco/hopper_jump/hopper_jump.py
index ae431ab..96dd3a3 100644
--- a/fancy_gym/envs/mujoco/hopper_jump/hopper_jump.py
+++ b/fancy_gym/envs/mujoco/hopper_jump/hopper_jump.py
@@ -262,76 +262,99 @@ class HopperJumpEnv(HopperEnvCustomXML):
                 return True
         return False
 
-# # TODO is that needed? if so test it
-# class HopperJumpStepEnv(HopperJumpEnv):
-#
-#     def __init__(self,
-#                  xml_file='hopper_jump.xml',
-#                  forward_reward_weight=1.0,
-#                  ctrl_cost_weight=1e-3,
-#                  healthy_reward=1.0,
-#                  height_weight=3,
-#                  dist_weight=3,
-#                  terminate_when_unhealthy=False,
-#                  healthy_state_range=(-100.0, 100.0),
-#                  healthy_z_range=(0.5, float('inf')),
-#                  healthy_angle_range=(-float('inf'), float('inf')),
-#                  reset_noise_scale=5e-3,
-#                  exclude_current_positions_from_observation=False
-#                  ):
-#
-#         self._height_weight = height_weight
-#         self._dist_weight = dist_weight
-#         super().__init__(xml_file, forward_reward_weight, ctrl_cost_weight, healthy_reward, terminate_when_unhealthy,
-#                          healthy_state_range, healthy_z_range, healthy_angle_range, reset_noise_scale,
-#                          exclude_current_positions_from_observation)
-#
-#     def step(self, action):
-#         self._steps += 1
-#
-#         self.do_simulation(action, self.frame_skip)
-#
-#         height_after = self.get_body_com("torso")[2]
-#         site_pos_after = self.data.site('foot_site').xpos.copy()
-#         self.max_height = max(height_after, self.max_height)
-#
-#         ctrl_cost = self.control_cost(action)
-#         healthy_reward = self.healthy_reward
-#         height_reward = self._height_weight * height_after
-#         goal_dist = np.linalg.norm(site_pos_after - np.array([self.goal, 0, 0]))
-#         goal_dist_reward = -self._dist_weight * goal_dist
-#         dist_reward = self._forward_reward_weight * (goal_dist_reward + height_reward)
-#
-#         rewards = dist_reward + healthy_reward
-#         costs = ctrl_cost
-#         done = False
-#
-#         # This is only for logging the distance to goal when first having the contact
-#         has_floor_contact = self._is_floor_foot_contact() if not self.contact_with_floor else False
-#
-#         if not self.init_floor_contact:
-#             self.init_floor_contact = has_floor_contact
-#         if self.init_floor_contact and not self.has_left_floor:
-#             self.has_left_floor = not has_floor_contact
-#         if not self.contact_with_floor and self.has_left_floor:
-#             self.contact_with_floor = has_floor_contact
-#
-#         if self.contact_dist is None and self.contact_with_floor:
-#             self.contact_dist = goal_dist
-#
-#         ##############################################################
-#
-#         observation = self._get_obs()
-#         reward = rewards - costs
-#         info = {
-#             'height': height_after,
-#             'x_pos': site_pos_after,
-#             'max_height': copy.copy(self.max_height),
-#             'goal': copy.copy(self.goal),
-#             'goal_dist': goal_dist,
-#             'height_rew': height_reward,
-#             'healthy_reward': healthy_reward,
-#             'healthy': copy.copy(self.is_healthy),
-#             'contact_dist': copy.copy(self.contact_dist) or 0
-#         }
-#         return observation, reward, done, info
+class HopperJumpMarkovRew(HopperJumpEnv):
+    def step(self, action):
+        self._steps += 1
+
+        self.do_simulation(action, self.frame_skip)
+
+        height_after = self.get_body_com("torso")[2]
+        site_pos_after = self.data.site('foot_site').xpos
+        self.max_height = max(height_after, self.max_height)
+
+        has_floor_contact = self._is_floor_foot_contact() if not self.contact_with_floor else False
+
+        if not self.init_floor_contact:
+            self.init_floor_contact = has_floor_contact
+        if self.init_floor_contact and not self.has_left_floor:
+            self.has_left_floor = not has_floor_contact
+        if not self.contact_with_floor and self.has_left_floor:
+            self.contact_with_floor = has_floor_contact
+
+        ctrl_cost = self.control_cost(action)
+        costs = ctrl_cost
+        terminated = False
+        truncated = False
+
+        goal_dist = np.linalg.norm(site_pos_after - self.goal)
+        if self.contact_dist is None and self.contact_with_floor:
+            self.contact_dist = goal_dist
+
+        rewards = 0
+        if not self.sparse or (self.sparse and self._steps >= MAX_EPISODE_STEPS_HOPPERJUMP):
+            healthy_reward = self.healthy_reward
+            distance_reward = -goal_dist * self._dist_weight
+            height_reward = (self.max_height if self.sparse else height_after) * self._height_weight
+            contact_reward = -(self.contact_dist or 5) * self._contact_weight
+            rewards = self._forward_reward_weight * (distance_reward + height_reward + contact_reward + healthy_reward)
+
+        # The observation is taken here, before the roll-out below, so the
+        # agent still sees the state produced by its own action.
+        observation = self._get_obs()
+
+        # Simulate the remainder of the episode after the jump so that the
+        # sparse reward depends only on the current state (Markov property).
+        if self.sparse and self.has_left_floor:
+            while self._steps < MAX_EPISODE_STEPS_HOPPERJUMP:
+                self._steps += 1
+
+                try:
+                    self.do_simulation(np.zeros_like(action), self.frame_skip)
+                except Exception as e:
+                    # If the simulation becomes unstable, log the error and
+                    # keep rolling out.
+                    print(e)
+
+                height_after = self.get_body_com("torso")[2]
+                site_pos_after = self.data.site('foot_site').xpos
+                self.max_height = max(height_after, self.max_height)
+
+                has_floor_contact = self._is_floor_foot_contact() if not self.contact_with_floor else False
+
+                if not self.init_floor_contact:
+                    self.init_floor_contact = has_floor_contact
+                if self.init_floor_contact and not self.has_left_floor:
+                    self.has_left_floor = not has_floor_contact
+                if not self.contact_with_floor and self.has_left_floor:
+                    self.contact_with_floor = has_floor_contact
+
+                ctrl_cost = self.control_cost(action)
+                costs = ctrl_cost
+
+                goal_dist = np.linalg.norm(site_pos_after - self.goal)
+                if self.contact_dist is None and self.contact_with_floor:
+                    self.contact_dist = goal_dist
+
+            # The episode has reached its end; compute the sparse reward once.
+            truncated = True
+            healthy_reward = self.healthy_reward
+            distance_reward = -goal_dist * self._dist_weight
+            height_reward = self.max_height * self._height_weight
+            contact_reward = -(self.contact_dist or 5) * self._contact_weight
+            rewards = self._forward_reward_weight * (distance_reward + height_reward + contact_reward + healthy_reward)
+
+        reward = rewards - costs
+        info = dict(
+            height=height_after,
+            x_pos=site_pos_after,
+            max_height=self.max_height,
+            goal=self.goal[:1],
+            goal_dist=goal_dist,
+            height_rew=self.max_height,
+            healthy_reward=self.healthy_reward,
+            healthy=self.is_healthy,
+            contact_dist=self.contact_dist or 0,
+            num_steps=self._steps,
+            has_left_floor=self.has_left_floor
+        )
+        return observation, reward, terminated, truncated, info
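
Note on the ported reward: in sparse mode, once the hopper has left the floor
the environment internally rolls the simulation forward with zero actions
until MAX_EPISODE_STEPS_HOPPERJUMP, so the reward returned for that step is
already the final episode reward and depends only on the state in which the
jump happened. The sketch below is a minimal smoke test for this behaviour,
not part of the patch: it assumes HopperJumpMarkovRew can be imported
directly from the patched module, and that the base env's constructor accepts
the sparse flag that the new step() reads (the patch registers no gym id).

    from fancy_gym.envs.mujoco.hopper_jump.hopper_jump import HopperJumpMarkovRew

    # 'sparse=True' is assumed to be forwarded to the base HopperJumpEnv,
    # which defines the self.sparse attribute used by the new step().
    env = HopperJumpMarkovRew(sparse=True)
    obs, info = env.reset(seed=0)

    for _ in range(500):  # generous cap; the internal roll-out ends episodes early
        action = env.action_space.sample()
        obs, reward, terminated, truncated, info = env.step(action)
        if terminated or truncated:
            # After the internal roll-out, the step reports truncation and
            # the final sparse reward in one go.
            break

    print(reward, info['max_height'], info['num_steps'], info['has_left_floor'])
    env.close()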