update bp step based env

Onur 2022-06-04 17:43:35 +02:00
parent 8b8be4b582
commit 719b40c4e4
3 changed files with 36 additions and 2 deletions

View File

@@ -446,6 +446,17 @@ register(
         "frame_skip": 2
     }
 )
+# Beerpong with episodic reward, but fixed release time step
+register(
+    id='ALRBeerPong-v3',
+    entry_point='alr_envs.alr.mujoco:ALRBeerBongEnvStepBasedEpisodicReward',
+    max_episode_steps=300,
+    kwargs={
+        "rndm_goal": True,
+        "cup_goal_pos": [-0.3, -1.2],
+        "frame_skip": 2
+    }
+)
 
 # Motion Primitive Environments
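
A minimal usage sketch for the newly registered id (an assumption based on how the other ids in this file are consumed: importing alr_envs is assumed to run the register() calls above; the random-action rollout is only illustrative):

import gym
import alr_envs  # assumed to trigger the register() calls on import

env = gym.make('ALRBeerPong-v3')
obs = env.reset()
done = False
while not done:
    # sample random actions until the TimeLimit or the env signals done
    obs, reward, done, info = env.step(env.action_space.sample())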

View File

@@ -2,7 +2,7 @@ from .reacher.balancing import BalancingEnv
 from .ball_in_a_cup.ball_in_a_cup import ALRBallInACupEnv
 from .ball_in_a_cup.biac_pd import ALRBallInACupPDEnv
 from .table_tennis.tt_gym import TTEnvGym
-from .beerpong.beerpong import ALRBeerBongEnv, ALRBeerBongEnvStepBased
+from .beerpong.beerpong import ALRBeerBongEnv, ALRBeerBongEnvStepBased, ALRBeerBongEnvStepBasedEpisodicReward
 from .ant_jump.ant_jump import ALRAntJumpEnv
 from .half_cheetah_jump.half_cheetah_jump import ALRHalfCheetahJumpEnv
 from .hopper_jump.hopper_jump import ALRHopperJumpEnv, ALRHopperJumpRndmPosEnv, ALRHopperXYJumpEnv, ALRHopperXYJumpEnvStepBased

View File

@@ -186,6 +186,26 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
         return super(ALRBeerBongEnv, self).dt*self.repeat_action
 
+class ALRBeerBongEnvStepBasedEpisodicReward(ALRBeerBongEnv):
+    def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None):
+        super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos)
+        self.release_step = 62  # empirically evaluated for frame_skip=2!
+
+    def step(self, a):
+        if self._steps < self.release_step:
+            return super(ALRBeerBongEnvStepBasedEpisodicReward, self).step(a)
+        else:
+            reward = 0
+            done = False
+            while not done:
+                sub_ob, sub_reward, done, sub_infos = super(ALRBeerBongEnvStepBasedEpisodicReward, self).step(np.zeros(a.shape))
+                reward += sub_reward
+            infos = sub_infos
+            ob = sub_ob
+            ob[-1] = self.release_step + 1  # Since we simulate until the end of the episode, PPO does not see the
+            # internal steps and thus, the observation also needs to be set correctly
+            return ob, reward, done, infos
+
+
 class ALRBeerBongEnvStepBased(ALRBeerBongEnv):
     def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None):
         super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos)
@@ -230,13 +250,16 @@ class ALRBeerBongEnvStepBased(ALRBeerBongEnv):
                     reward = reward - sub_infos['action_cost'] + dist_rew
                     infos = sub_infos
                     ob = sub_ob
+            ob[-1] = self.release_step + 1  # Since we simulate until the end of the episode, PPO does not see the
+            # internal steps and thus, the observation also needs to be set correctly
             return ob, reward, done, infos
 
 
 if __name__ == "__main__":
     # env = ALRBeerBongEnv(rndm_goal=True)
-    env = ALRBeerBongEnvStepBased(frame_skip=2, rndm_goal=True)
+    # env = ALRBeerBongEnvStepBased(frame_skip=2, rndm_goal=True)
+    env = ALRBeerBongEnvStepBasedEpisodicReward(frame_skip=2, rndm_goal=True)
     import time
     env.reset()
     env.render("human")