From 719b40c4e48d92dbdd965e7ea4e17b299c12f9f4 Mon Sep 17 00:00:00 2001
From: Onur
Date: Sat, 4 Jun 2022 17:43:35 +0200
Subject: [PATCH] update bp step based env

---
 alr_envs/alr/__init__.py                 | 11 +++++++++++
 alr_envs/alr/mujoco/__init__.py          |  2 +-
 alr_envs/alr/mujoco/beerpong/beerpong.py | 25 +++++++++++++++++++++++-
 3 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/alr_envs/alr/__init__.py b/alr_envs/alr/__init__.py
index bf2ae74..cc9768a 100644
--- a/alr_envs/alr/__init__.py
+++ b/alr_envs/alr/__init__.py
@@ -446,6 +446,17 @@ register(
         "frame_skip": 2
     }
 )
+# Beerpong with episodic reward, but fixed release time step
+register(
+    id='ALRBeerPong-v3',
+    entry_point='alr_envs.alr.mujoco:ALRBeerBongEnvStepBasedEpisodicReward',
+    max_episode_steps=300,
+    kwargs={
+        "rndm_goal": True,
+        "cup_goal_pos": [-0.3, -1.2],
+        "frame_skip": 2
+    }
+)
 
 
 # Motion Primitive Environments
diff --git a/alr_envs/alr/mujoco/__init__.py b/alr_envs/alr/mujoco/__init__.py
index 1cde867..2b19331 100644
--- a/alr_envs/alr/mujoco/__init__.py
+++ b/alr_envs/alr/mujoco/__init__.py
@@ -2,7 +2,7 @@ from .reacher.balancing import BalancingEnv
 from .ball_in_a_cup.ball_in_a_cup import ALRBallInACupEnv
 from .ball_in_a_cup.biac_pd import ALRBallInACupPDEnv
 from .table_tennis.tt_gym import TTEnvGym
-from .beerpong.beerpong import ALRBeerBongEnv, ALRBeerBongEnvStepBased
+from .beerpong.beerpong import ALRBeerBongEnv, ALRBeerBongEnvStepBased, ALRBeerBongEnvStepBasedEpisodicReward
 from .ant_jump.ant_jump import ALRAntJumpEnv
 from .half_cheetah_jump.half_cheetah_jump import ALRHalfCheetahJumpEnv
 from .hopper_jump.hopper_jump import ALRHopperJumpEnv, ALRHopperJumpRndmPosEnv, ALRHopperXYJumpEnv, ALRHopperXYJumpEnvStepBased
diff --git a/alr_envs/alr/mujoco/beerpong/beerpong.py b/alr_envs/alr/mujoco/beerpong/beerpong.py
index c72220b..9028bd0 100644
--- a/alr_envs/alr/mujoco/beerpong/beerpong.py
+++ b/alr_envs/alr/mujoco/beerpong/beerpong.py
@@ -186,6 +186,26 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
         return super(ALRBeerBongEnv, self).dt*self.repeat_action
 
 
+class ALRBeerBongEnvStepBasedEpisodicReward(ALRBeerBongEnv):
+    def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None):
+        super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos)
+        self.release_step = 62  # empirically evaluated for frame_skip=2!
+
+    def step(self, a):
+        if self._steps < self.release_step:
+            return super(ALRBeerBongEnvStepBasedEpisodicReward, self).step(a)
+        else:
+            reward = 0
+            done = False
+            while not done:
+                sub_ob, sub_reward, done, sub_infos = super(ALRBeerBongEnvStepBasedEpisodicReward, self).step(np.zeros(a.shape))
+                reward += sub_reward
+            infos = sub_infos
+            ob = sub_ob
+            ob[-1] = self.release_step + 1  # Since we simulate until the end of the episode, PPO does not see the
+            # internal steps and thus, the observation also needs to be set correctly
+            return ob, reward, done, infos
+
 class ALRBeerBongEnvStepBased(ALRBeerBongEnv):
     def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None):
         super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos)
@@ -230,13 +250,16 @@ class ALRBeerBongEnvStepBased(ALRBeerBongEnv):
                     reward = reward - sub_infos['action_cost'] + dist_rew
                 infos = sub_infos
                 ob = sub_ob
+                ob[-1] = self.release_step + 1  # Since we simulate until the end of the episode, PPO does not see the
+                # internal steps and thus, the observation also needs to be set correctly
             return ob, reward, done, infos
 
 
 if __name__ == "__main__":
     # env = ALRBeerBongEnv(rndm_goal=True)
-    env = ALRBeerBongEnvStepBased(frame_skip=2, rndm_goal=True)
+    # env = ALRBeerBongEnvStepBased(frame_skip=2, rndm_goal=True)
+    env = ALRBeerBongEnvStepBasedEpisodicReward(frame_skip=2, rndm_goal=True)
     import time
     env.reset()
     env.render("human")
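
Usage note (not part of the patch): a minimal sketch of how the newly registered environment could be driven, assuming alr_envs is installed and that importing the package runs the register() calls added above. The id 'ALRBeerPong-v3' and the 300-step horizon come from the registration in alr_envs/alr/__init__.py; the random-action rollout is illustrative only, not the intended training setup.

import gym
import alr_envs  # noqa: F401 -- importing the package registers 'ALRBeerPong-v3'

env = gym.make('ALRBeerPong-v3')
obs = env.reset()
done = False
while not done:
    # Before the fixed release step the action is applied normally; afterwards
    # the env itself simulates to the end of the episode with zero actions and
    # returns the accumulated (episodic) reward in a single step() call.
    obs, reward, done, info = env.step(env.action_space.sample())
env.close()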