smaller ctxt range hopper jump + step-based rew for bp
parent 24604e60be
commit 8b8be4b582
@@ -253,7 +253,7 @@ register(
     kwargs={
         "max_episode_steps": MAX_EPISODE_STEPS_HOPPERJUMP,
         "context": False,
-        "healthy_rewasrd": 1.0
+        "healthy_reward": 1.0
     }
 )
 register(
@@ -405,7 +405,7 @@ register(id='TableTennis2DCtxt-v1',
          kwargs={'ctxt_dim': 2, 'fixed_goal': True})

 register(id='TableTennis4DCtxt-v0',
-         entry_point='alr_envs.alr.mujoco:TTEnvGym',
+         entry_point='alr_envs.alr.mujocco:TTEnvGym',
          max_episode_steps=MAX_EPISODE_STEPS,
          kwargs={'ctxt_dim': 4})

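For context, entries registered this way are created through Gym's registry; a minimal usage sketch (not part of this commit, assuming the register() calls above run when the package is imported, as is usual for this kind of registry module):

import gym
import alr_envs  # importing the package is assumed to run the register() calls above

env = gym.make('TableTennis4DCtxt-v0')
obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())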
@@ -915,7 +915,7 @@ for _v in _versions:
            "basis_generator_kwargs": {
                'basis_generator_type': 'zero_rbf',
                'num_basis': 5,
-               'num_basis_zero_start': 2
+               'num_basis_zero_start': 1
            }
        }
    )
@@ -187,35 +187,65 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):


 class ALRBeerBongEnvStepBased(ALRBeerBongEnv):
     def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None):
         super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos)
         self.release_step = 62  # empirically evaluated for frame_skip=2!

-    def _set_action_space(self):
-        bounds = super(ALRBeerBongEnvStepBased, self)._set_action_space()
-        min_bound = np.concatenate(([-1], bounds.low), dtype=bounds.dtype)
-        max_bound = np.concatenate(([1], bounds.high), dtype=bounds.dtype)
-        self.action_space = spaces.Box(low=min_bound, high=max_bound, dtype=bounds.dtype)
-        return self.action_space
+    # def _set_action_space(self):
+    #     bounds = super(ALRBeerBongEnvStepBased, self)._set_action_space()
+    #     min_bound = np.concatenate(([-1], bounds.low), dtype=bounds.dtype)
+    #     max_bound = np.concatenate(([1], bounds.high), dtype=bounds.dtype)
+    #     self.action_space = spaces.Box(low=min_bound, high=max_bound, dtype=bounds.dtype)
+    #     return self.action_space

+    # def step(self, a):
+    #     self.release_step = self._steps if a[0]>=0 and self.release_step >= self._steps else self.release_step
+    #     return super(ALRBeerBongEnvStepBased, self).step(a[1:])
+    #
+    # def reset(self):
+    #     ob = super(ALRBeerBongEnvStepBased, self).reset()
+    #     self.release_step = self.ep_length + 1
+    #     return ob

     def step(self, a):
-        self.release_step = self._steps if a[0]>=0 and self.release_step >= self._steps else self.release_step
-        return super(ALRBeerBongEnvStepBased, self).step(a[1:])
+        if self._steps < self.release_step:
+            return super(ALRBeerBongEnvStepBased, self).step(a)
+        else:
+            reward = 0
+            done = False
+            while not done:
+                sub_ob, sub_reward, done, sub_infos = super(ALRBeerBongEnvStepBased, self).step(np.zeros(a.shape))
+                if not done or sub_infos['sim_crash']:
+                    reward += sub_reward
+                else:
+                    ball_pos = self.sim.data.body_xpos[self.sim.model._body_name2id["ball"]].copy()
+                    cup_goal_dist_final = np.linalg.norm(ball_pos - self.sim.data.site_xpos[
+                        self.sim.model._site_name2id["cup_goal_final_table"]].copy())
+                    cup_goal_dist_top = np.linalg.norm(ball_pos - self.sim.data.site_xpos[
+                        self.sim.model._site_name2id["cup_goal_table"]].copy())
+                    if sub_infos['success']:
+                        dist_rew = -cup_goal_dist_final**2
+                    else:
+                        dist_rew = -0.5*cup_goal_dist_final**2 - cup_goal_dist_top**2
+                    reward = reward - sub_infos['action_cost'] + dist_rew
+            infos = sub_infos
+            ob = sub_ob
+            return ob, reward, done, infos


-    def reset(self):
-        ob = super(ALRBeerBongEnvStepBased, self).reset()
-        self.release_step = self.ep_length + 1
-        return ob

 if __name__ == "__main__":
     # env = ALRBeerBongEnv(rndm_goal=True)
-    env = ALRBeerBongEnvStepBased(rndm_goal=True)
+    env = ALRBeerBongEnvStepBased(frame_skip=2, rndm_goal=True)
     import time
     env.reset()
     env.render("human")
     for i in range(1500):
         # ac = 10 * env.action_space.sample()[0:7]
-        ac = np.zeros(8)
-        ac[0] = -1
-        if env._steps > 150:
-            ac[0] = 1
+        ac = 10 * env.action_space.sample()
+        # ac = np.zeros(7)
+        # ac[0] = -1
+        # if env._steps > 150:
+        #     ac[0] = 1
         obs, rew, d, info = env.step(ac)
         env.render("human")
         print(env.dt)
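A brief reading of the new step() (illustration only, not part of the commit): before self.release_step the environment behaves like ALRBeerBongEnv; from the release step on, a single step() call rolls the simulation forward with zero actions until the episode ends and returns one aggregated reward built from the ball-to-cup distances. A minimal driver sketch under that assumption:

env = ALRBeerBongEnvStepBased(frame_skip=2, rndm_goal=True)
obs = env.reset()
done = False
total_reward = 0.0
while not done:
    ac = env.action_space.sample()           # joint action (7-dimensional here, since the release-flag action-space override is commented out)
    obs, reward, done, info = env.step(ac)   # the post-release call loops internally until done
    total_reward += reward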
@@ -90,7 +90,7 @@ class BeerPongReward:
         self.dist_ground_cup = np.linalg.norm(ball_pos-goal_pos) \
             if self.ball_ground_contact_first and self.dist_ground_cup == -1 else self.dist_ground_cup
         action_cost = np.sum(np.square(action))
-        self.action_costs.append(action_cost)
+        self.action_costs.append(np.copy(action_cost))
         # # ##################### Reward function which does not force to bounce once on the table (quad dist) ############

         self._is_collided = self._check_collision_with_itself(env.sim, self.robot_collision_ids)
@@ -110,8 +110,9 @@ class BeerPongReward:
         else:
             min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 0, 1, 0 ,0
             # dist_ground_cup = 1 * self.dist_ground_cup
+            action_cost = 1e-4 * np.mean(action_cost)
             reward = rew_offset - min_dist_coeff * min_dist ** 2 - final_dist_coeff * final_dist ** 2 - \
-                     1e-4 * np.mean(action_cost) - ground_contact_dist_coeff*self.dist_ground_cup ** 2
+                     action_cost - ground_contact_dist_coeff*self.dist_ground_cup ** 2
             # 1e-7*np.mean(action_cost)
             # release step punishment
             min_time_bound = 0.1
@@ -124,7 +125,8 @@ class BeerPongReward:
             # print('release time :', release_time)
             # print('dist_ground_cup :', dist_ground_cup)
         else:
-            reward = - 1e-2 * action_cost
+            action_cost = 1e-2 * action_cost
+            reward = - action_cost
             # reward = - 1e-4 * action_cost
             # reward = 0
             success = False
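Both BeerPongReward hunks are numerically neutral: the scaling factor (1e-4 in the episode-end branch, 1e-2 otherwise) is now folded into action_cost before the reward is assembled, so, as a worked check, with np.mean(action_cost) == 2.0 the action term is still -1e-4 * 2.0 = -2e-4. A plausible (but unstated) motivation is that the scaled value is then available in action_cost for downstream reporting.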
@@ -80,7 +80,8 @@ class ALRHopperJumpEnv(HopperEnv):
             'x_pos': site_pos_after,
             'max_height': self.max_height,
             'height_rew': self.max_height,
-            'healthy_reward': self.healthy_reward * 2
+            'healthy_reward': self.healthy_reward * 2,
+            'healthy': self.is_healthy
         }

         return observation, reward, done, info
@@ -171,7 +172,8 @@ class ALRHopperXYJumpEnv(ALRHopperJumpEnv):
             'goal': self.goal,
             'goal_dist': goal_dist,
             'height_rew': self.max_height,
-            'healthy_reward': self.healthy_reward * 2
+            'healthy_reward': self.healthy_reward * 2,
+            'healthy': self.is_healthy
         }
         return observation, reward, done, info

@@ -207,13 +209,38 @@ class ALRHopperXYJumpEnv(ALRHopperJumpEnv):
         super().reset()
         # self.goal = np.random.uniform(-1.5, 1.5, 1)
         # self.goal = np.random.uniform(0, 1.5, 1)
-        self.goal = self.np_random.uniform(0, 1.5, 1)
-        # self.goal = np.array([1.5])
+        # self.goal = self.np_random.uniform(0, 1.5, 1)
+        self.goal = self.np_random.uniform(0.3, 1.35, 1)
         self.sim.model.body_pos[self.sim.model.body_name2id('goal_site_body')] = np.array([self.goal, 0, 0], dtype=object)
         return self.reset_model()


 class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv):

+    def __init__(self,
+                 xml_file='hopper_jump.xml',
+                 forward_reward_weight=1.0,
+                 ctrl_cost_weight=1e-3,
+                 healthy_reward=0.0,
+                 penalty=0.0,
+                 context=True,
+                 terminate_when_unhealthy=False,
+                 healthy_state_range=(-100.0, 100.0),
+                 healthy_z_range=(0.5, float('inf')),
+                 healthy_angle_range=(-float('inf'), float('inf')),
+                 reset_noise_scale=5e-3,
+                 exclude_current_positions_from_observation=True,
+                 max_episode_steps=250,
+                 height_scale = 10,
+                 dist_scale = 3,
+                 healthy_scale = 2):
+        self.height_scale = height_scale
+        self.dist_scale = dist_scale
+        self.healthy_scale = healthy_scale
+        super().__init__(xml_file, forward_reward_weight, ctrl_cost_weight, healthy_reward, penalty, context,
+                         terminate_when_unhealthy, healthy_state_range, healthy_z_range, healthy_angle_range,
+                         reset_noise_scale, exclude_current_positions_from_observation, max_episode_steps)

     def step(self, action):

         self._floor_geom_id = self.model.geom_name2id('floor')
@@ -226,10 +253,10 @@ class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv):
         self.max_height = max(height_after, self.max_height)
         ctrl_cost = self.control_cost(action)

-        healthy_reward = self.healthy_reward * 2
-        height_reward = 10*height_after
+        healthy_reward = self.healthy_reward * self.healthy_scale
+        height_reward = self.height_scale*height_after
         goal_dist = np.atleast_1d(np.linalg.norm(site_pos_after - np.array([self.goal, 0, 0], dtype=object)))[0]
-        goal_dist_reward = -3*goal_dist
+        goal_dist_reward = -self.dist_scale*goal_dist
         dist_reward = self._forward_reward_weight * (goal_dist_reward + height_reward)
         reward = -ctrl_cost + healthy_reward + dist_reward
         done = False
@@ -240,12 +267,10 @@ class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv):
             'max_height': self.max_height,
             'goal': self.goal,
             'goal_dist': goal_dist,
-            'dist_rew': dist_reward,
-            'height_rew': height_reward,
-            'healthy_reward': healthy_reward,
-            'ctrl_reward': -ctrl_cost
+            'height_rew': self.max_height,
+            'healthy_reward': self.healthy_reward * 2,
+            'healthy': self.is_healthy
         }

         return observation, reward, done, info


 class ALRHopperJumpRndmPosEnv(ALRHopperJumpEnv):
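For reference (an illustration, not part of the diff; the helper name is made up), the per-step reward that ALRHopperXYJumpEnvStepBased now builds from its new scale parameters combines a control cost, a scaled healthy bonus, and a distance term from jump height and distance to the goal site. A minimal sketch of that formula with the new default scales:

def step_based_hopper_reward(ctrl_cost, base_healthy_reward, height_after, goal_dist,
                             forward_reward_weight=1.0, height_scale=10, dist_scale=3, healthy_scale=2):
    # Mirrors the reward assembled in ALRHopperXYJumpEnvStepBased.step()
    healthy_reward = base_healthy_reward * healthy_scale
    height_reward = height_scale * height_after
    goal_dist_reward = -dist_scale * goal_dist
    dist_reward = forward_reward_weight * (goal_dist_reward + height_reward)
    return -ctrl_cost + healthy_reward + dist_reward

# Example: a 0.8 m jump height at 0.4 m from the goal with zero healthy bonus
# gives 1.0 * (-3 * 0.4 + 10 * 0.8) - ctrl_cost = 6.8 - ctrl_cost.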