Beer pong step based -> release time for PPO

Onur 2022-06-02 09:05:38 +02:00
parent 59b15e82ea
commit 24604e60be
5 changed files with 51 additions and 5 deletions

View File

@@ -435,6 +435,17 @@ register(
}
)
# random goal cup position
register(
id='ALRBeerPong-v2',
entry_point='alr_envs.alr.mujoco:ALRBeerBongEnvStepBased',
max_episode_steps=300,
kwargs={
"rndm_goal": True,
"cup_goal_pos": [-0.3, -1.2],
"frame_skip": 2
}
)
# Motion Primitive Environments

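With this registration in place, the step-based variant should be reachable through the standard Gym factory; a minimal sketch, assuming that importing the package executes the register() calls shown above:

import gym
import alr_envs  # assumed to run the register() calls on import

env = gym.make('ALRBeerPong-v2')  # step-based beer pong with random cup goal
obs = env.reset()
done = False
while not done:
    obs, reward, done, info = env.step(env.action_space.sample())
env.close()
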
View File

@@ -2,7 +2,7 @@ from .reacher.balancing import BalancingEnv
from .ball_in_a_cup.ball_in_a_cup import ALRBallInACupEnv
from .ball_in_a_cup.biac_pd import ALRBallInACupPDEnv
from .table_tennis.tt_gym import TTEnvGym
from .beerpong.beerpong import ALRBeerBongEnv
from .beerpong.beerpong import ALRBeerBongEnv, ALRBeerBongEnvStepBased
from .ant_jump.ant_jump import ALRAntJumpEnv
from .half_cheetah_jump.half_cheetah_jump import ALRHalfCheetahJumpEnv
from .hopper_jump.hopper_jump import ALRHopperJumpEnv, ALRHopperJumpRndmPosEnv, ALRHopperXYJumpEnv, ALRHopperXYJumpEnvStepBased

View File

@@ -2,7 +2,7 @@ import mujoco_py.builder
import os
import numpy as np
from gym import utils
from gym import utils, spaces
from gym.envs.mujoco import MujocoEnv
from alr_envs.alr.mujoco.beerpong.beerpong_reward_staged import BeerPongReward
@@ -160,7 +160,6 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
is_collided=is_collided, sim_crash=crash,
table_contact_first=int(not self.reward_function.ball_ground_contact_first))
infos.update(reward_infos)
return ob, reward, done, infos
def check_traj_in_joint_limits(self):
@@ -168,9 +167,16 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
def _get_obs(self):
theta = self.sim.data.qpos.flat[:7]
theta_dot = self.sim.data.qvel.flat[:7]
ball_pos = self.sim.data.body_xpos[self.sim.model._body_name2id["ball"]].copy()
cup_goal_diff_final = ball_pos - self.sim.data.site_xpos[self.sim.model._site_name2id["cup_goal_final_table"]].copy()
cup_goal_diff_top = ball_pos - self.sim.data.site_xpos[self.sim.model._site_name2id["cup_goal_table"]].copy()
return np.concatenate([
np.cos(theta),
np.sin(theta),
theta_dot,
cup_goal_diff_final,
cup_goal_diff_top,
self.sim.model.body_pos[self.cup_table_id][:2].copy(),
[self._steps],
])
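With the two ball-to-goal difference vectors included, the observation assembled here comes out at 30 entries; a quick dimension check (pure arithmetic, assuming the return statement above is the complete observation):

# cos + sin + joint velocities + goal diff (final) + goal diff (top) + cup xy + step counter
obs_dim = 7 + 7 + 7 + 3 + 3 + 2 + 1
assert obs_dim == 30
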
@@ -179,14 +185,37 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
def dt(self):
return super(ALRBeerBongEnv, self).dt*self.repeat_action
class ALRBeerBongEnvStepBased(ALRBeerBongEnv):
def _set_action_space(self):
bounds = super(ALRBeerBongEnvStepBased, self)._set_action_space()
min_bound = np.concatenate(([-1], bounds.low), dtype=bounds.dtype)
max_bound = np.concatenate(([1], bounds.high), dtype=bounds.dtype)
self.action_space = spaces.Box(low=min_bound, high=max_bound, dtype=bounds.dtype)
return self.action_space
def step(self, a):
self.release_step = self._steps if a[0]>=0 and self.release_step >= self._steps else self.release_step
return super(ALRBeerBongEnvStepBased, self).step(a[1:])
def reset(self):
ob = super(ALRBeerBongEnvStepBased, self).reset()
self.release_step = self.ep_length + 1
return ob
if __name__ == "__main__":
env = ALRBeerBongEnv(rndm_goal=True)
# env = ALRBeerBongEnv(rndm_goal=True)
env = ALRBeerBongEnvStepBased(rndm_goal=True)
import time
env.reset()
env.render("human")
for i in range(1500):
# ac = 10 * env.action_space.sample()[0:7]
ac = np.zeros(7)
ac = np.zeros(8)
ac[0] = -1
if env._steps > 150:
ac[0] = 1
obs, rew, d, info = env.step(ac)
env.render("human")
print(env.dt)
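In the step-based variant the action vector gains a leading release flag: _set_action_space() prepends a [-1, 1] dimension to the inherited joint bounds, step() latches release_step to the current step the first time a[0] >= 0, and reset() pushes release_step past the episode end so the ball is held until the policy asks for release. A hedged rollout sketch against this interface, with a random policy standing in for PPO:

env = ALRBeerBongEnvStepBased(rndm_goal=True)
obs = env.reset()
done = False
while not done:
    a = env.action_space.sample()  # a[0]: release flag in [-1, 1], a[1:]: 7 joint commands
    # the ball is released at the first step where a[0] >= 0; later flags have no effect
    obs, reward, done, info = env.step(a)
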

View File

@@ -12,6 +12,9 @@ class MPWrapper(MPEnvWrapper):
return np.hstack([
[False] * 7, # cos
[False] * 7, # sin
[False] * 7, # joint velocities
[False] * 3, # cup_goal_diff_final
[False] * 3, # cup_goal_diff_top
[True] * 2, # xy position of cup
[False] # env steps
])

View File

@@ -17,6 +17,9 @@ class NewMPWrapper(EpisodicWrapper):
return np.hstack([
[False] * 7, # cos
[False] * 7, # sin
[False] * 7, # joint velocities
[False] * 3, # cup_goal_diff_final
[False] * 3, # cup_goal_diff_top
[True] * 2, # xy position of cup
[False] # env steps
])
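Both wrapper hunks mark only the cup's xy position as active, so the boolean mask reduces the full 30-dimensional observation to a 2-dimensional context. A self-contained illustration of how such a mask selects the context features (names are illustrative):

import numpy as np

mask = np.hstack([[False] * 7, [False] * 7, [False] * 7,
                  [False] * 3, [False] * 3, [True] * 2, [False]])
obs = np.arange(mask.size, dtype=float)  # stand-in for a full 30-dim observation
context = obs[mask]                      # keeps only the cup's xy entries
assert context.shape == (2,)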