after deadline
commit 7bd9848c31 (parent c47845c0dd)
@@ -203,6 +203,34 @@ register(
    }
)

_vs = np.arange(101).tolist() + [1e-5, 5e-5, 1e-4, 5e-4, 1e-3, 5e-3, 1e-2, 5e-2, 1e-1, 5e-1]
for i in _vs:
    _env_id = f'ALRReacher{i}-v0'
    register(
        id=_env_id,
        entry_point='alr_envs.alr.mujoco:ALRReacherEnv',
        max_episode_steps=200,
        kwargs={
            "steps_before_reward": 0,
            "n_links": 5,
            "balance": False,
            'ctrl_cost_weight': i
        }
    )

    _env_id = f'ALRReacherSparse{i}-v0'
    register(
        id=_env_id,
        entry_point='alr_envs.alr.mujoco:ALRReacherEnv',
        max_episode_steps=200,
        kwargs={
            "steps_before_reward": 200,
            "n_links": 5,
            "balance": False,
            'ctrl_cost_weight': i
        }
    )

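A minimal usage sketch for the ids registered in the sweep above (an illustration, not part of the commit). It assumes that importing alr_envs executes these register() calls and that the classic gym API is available; the choice i = 5 is just one entry of _vs.

import gym
import alr_envs  # noqa: F401  -- importing the package runs the registrations above

env = gym.make('ALRReacherSparse5-v0')  # sparse variant, ctrl_cost_weight = 5
obs = env.reset()
for _ in range(200):  # max_episode_steps registered above
    obs, reward, done, info = env.step(env.action_space.sample())
    if done:
        break
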
# CtxtFree are v0, Contextual are v1
register(
    id='ALRAntJump-v0',
@@ -458,6 +486,18 @@ register(
    }
)

# Beerpong with episodic reward, but fixed release time step
register(
    id='ALRBeerPong-v4',
    entry_point='alr_envs.alr.mujoco:ALRBeerBongEnvFixedReleaseStep',
    max_episode_steps=300,
    kwargs={
        "rndm_goal": True,
        "cup_goal_pos": [-0.3, -1.2],
        "frame_skip": 2
    }
)

# Motion Primitive Environments

## Simple Reacher
@@ -648,6 +688,56 @@ for _v in _versions:
    )
    ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append(_env_id)


_vs = np.arange(101).tolist() + [1e-5, 5e-5, 1e-4, 5e-4, 1e-3, 5e-3, 1e-2, 5e-2, 1e-1, 5e-1]
for i in _vs:
    _env_id = f'ALRReacher{i}ProMP-v0'
    register(
        id=_env_id,
        entry_point='alr_envs.utils.make_env_helpers:make_promp_env_helper',
        kwargs={
            "name": f"alr_envs:{_env_id.replace('ProMP', '')}",
            "wrappers": [mujoco.reacher.MPWrapper],
            "mp_kwargs": {
                "num_dof": 5,
                "num_basis": 5,
                "duration": 4,
                "policy_type": "motor",
                # "weights_scale": 5,
                "n_zero_basis": 1,
                "zero_start": True,
                "policy_kwargs": {
                    "p_gains": 1,
                    "d_gains": 0.1
                }
            }
        }
    )

    _env_id = f'ALRReacherSparse{i}ProMP-v0'
    register(
        id=_env_id,
        entry_point='alr_envs.utils.make_env_helpers:make_promp_env_helper',
        kwargs={
            "name": f"alr_envs:{_env_id.replace('ProMP', '')}",
            "wrappers": [mujoco.reacher.MPWrapper],
            "mp_kwargs": {
                "num_dof": 5,
                "num_basis": 5,
                "duration": 4,
                "policy_type": "motor",
                # "weights_scale": 5,
                "n_zero_basis": 1,
                "zero_start": True,
                "policy_kwargs": {
                    "p_gains": 1,
                    "d_gains": 0.1
                }
            }
        }
    )


# ## Beerpong
# _versions = ["v0", "v1"]
# for _v in _versions:
@@ -717,6 +807,42 @@ for _v in _versions:
    )
    ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append(_env_id)

## Beerpong ProMP fixed release
_env_id = 'BeerpongProMP-v2'
register(
    id=_env_id,
    entry_point='alr_envs.utils.make_env_helpers:make_mp_env_helper',
    kwargs={
        "name": "alr_envs:ALRBeerPong-v4",
        "wrappers": [mujoco.beerpong.NewMPWrapper],
        "ep_wrapper_kwargs": {
            "weight_scale": 1
        },
        "movement_primitives_kwargs": {
            'movement_primitives_type': 'promp',
            'action_dim': 7
        },
        "phase_generator_kwargs": {
            'phase_generator_type': 'linear',
            'delay': 0,
            'tau': 0.62, # initial value
            'learn_tau': False,
            'learn_delay': False
        },
        "controller_kwargs": {
            'controller_type': 'motor',
            "p_gains": np.array([1.5, 5, 2.55, 3, 2., 2, 1.25]),
            "d_gains": np.array([0.02333333, 0.1, 0.0625, 0.08, 0.03, 0.03, 0.0125]),
        },
        "basis_generator_kwargs": {
            'basis_generator_type': 'zero_rbf',
            'num_basis': 2,
            'num_basis_zero_start': 2
        }
    }
)
ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append(_env_id)

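A hedged sketch of how the new Beerpong ProMP id is meant to be consumed (illustration only, assuming gym and alr_envs import cleanly). With these motion-primitive wrappers a single step() typically executes the whole primitive, so the action is the flat ProMP weight vector implied by action_dim and num_basis above.

import gym
import alr_envs  # noqa: F401  -- runs the register() call above

env = gym.make('BeerpongProMP-v2')
env.reset()
weights = env.action_space.sample()  # ProMP weights for the 7 joints
obs, episode_reward, done, info = env.step(weights)
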
## Table Tennis
ctxt_dim = [2, 4]
for _v, cd in enumerate(ctxt_dim):
@@ -2,7 +2,7 @@ from .reacher.balancing import BalancingEnv
from .ball_in_a_cup.ball_in_a_cup import ALRBallInACupEnv
from .ball_in_a_cup.biac_pd import ALRBallInACupPDEnv
from .table_tennis.tt_gym import TTEnvGym
from .beerpong.beerpong import ALRBeerBongEnv, ALRBeerBongEnvStepBased, ALRBeerBongEnvStepBasedEpisodicReward
from .beerpong.beerpong import ALRBeerBongEnv, ALRBeerBongEnvStepBased, ALRBeerBongEnvStepBasedEpisodicReward, ALRBeerBongEnvFixedReleaseStep
from .ant_jump.ant_jump import ALRAntJumpEnv
from .half_cheetah_jump.half_cheetah_jump import ALRHalfCheetahJumpEnv
from .hopper_jump.hopper_jump import ALRHopperJumpEnv, ALRHopperJumpRndmPosEnv, ALRHopperXYJumpEnv, ALRHopperXYJumpEnvStepBased
@@ -185,6 +185,10 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
    def dt(self):
        return super(ALRBeerBongEnv, self).dt*self.repeat_action

class ALRBeerBongEnvFixedReleaseStep(ALRBeerBongEnv):
    def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None):
        super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos)
        self.release_step = 62 # empirically evaluated for frame_skip=2!

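A rollout sketch for the new fixed-release variant (illustrative, not from the commit); it relies only on names visible in this diff and on the frame_skip=2 setting used below in __main__.

from alr_envs.alr.mujoco import ALRBeerBongEnvFixedReleaseStep

env = ALRBeerBongEnvFixedReleaseStep(frame_skip=2, rndm_goal=True)  # release_step fixed to 62
obs = env.reset()
for _ in range(300):  # matches max_episode_steps of ALRBeerPong-v4
    obs, reward, done, info = env.step(env.action_space.sample())
    if done:
        break
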
class ALRBeerBongEnvStepBasedEpisodicReward(ALRBeerBongEnv):
    def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None):
@@ -206,6 +210,25 @@ class ALRBeerBongEnvStepBasedEpisodicReward(ALRBeerBongEnv):
            # internal steps and thus, the observation also needs to be set correctly
        return ob, reward, done, infos


# class ALRBeerBongEnvStepBasedEpisodicReward(ALRBeerBongEnv):
#     def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None):
#         super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos)
#         self.release_step = 62 # empirically evaluated for frame_skip=2!
#
#     def step(self, a):
#         if self._steps < self.release_step:
#             return super(ALRBeerBongEnvStepBasedEpisodicReward, self).step(a)
#         else:
#             sub_ob, sub_reward, done, sub_infos = super(ALRBeerBongEnvStepBasedEpisodicReward, self).step(np.zeros(a.shape))
#             reward = sub_reward
#             infos = sub_infos
#             ob = sub_ob
#             ob[-1] = self.release_step + 1 # Since we simulate until the end of the episode, PPO does not see the
#             # internal steps and thus, the observation also needs to be set correctly
#             return ob, reward, done, infos


class ALRBeerBongEnvStepBased(ALRBeerBongEnv):
    def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None):
        super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos)
@@ -259,7 +282,8 @@ class ALRBeerBongEnvStepBased(ALRBeerBongEnv):
if __name__ == "__main__":
    # env = ALRBeerBongEnv(rndm_goal=True)
    # env = ALRBeerBongEnvStepBased(frame_skip=2, rndm_goal=True)
    env = ALRBeerBongEnvStepBasedEpisodicReward(frame_skip=2, rndm_goal=True)
    # env = ALRBeerBongEnvStepBasedEpisodicReward(frame_skip=2, rndm_goal=True)
    env = ALRBeerBongEnvFixedReleaseStep(frame_skip=2, rndm_goal=True)
    import time
    env.reset()
    env.render("human")
@@ -123,7 +123,6 @@ class ALRHopperJumpEnv(HopperEnv):
class ALRHopperXYJumpEnv(ALRHopperJumpEnv):

    def step(self, action):

        self._floor_geom_id = self.model.geom_name2id('floor')
        self._foot_geom_id = self.model.geom_name2id('foot_geom')

@@ -173,7 +172,8 @@ class ALRHopperXYJumpEnv(ALRHopperJumpEnv):
            'goal_dist': goal_dist,
            'height_rew': self.max_height,
            'healthy_reward': self.healthy_reward * 2,
            'healthy': self.is_healthy
            'healthy': self.is_healthy,
            'contact_dist': self.contact_dist if self.contact_dist is not None else 0
        }
        return observation, reward, done, info

@@ -194,7 +194,6 @@ class ALRHopperXYJumpEnv(ALRHopperJumpEnv):
        rnd_vec = self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nq)
        qpos = self.init_qpos + rnd_vec
        qvel = self.init_qvel

        self.set_state(qpos, qvel)

        observation = self._get_obs()
@@ -207,9 +206,6 @@ class ALRHopperXYJumpEnv(ALRHopperJumpEnv):

    def reset(self):
        super().reset()
        # self.goal = np.random.uniform(-1.5, 1.5, 1)
        # self.goal = np.random.uniform(0, 1.5, 1)
        # self.goal = self.np_random.uniform(0, 1.5, 1)
        self.goal = self.np_random.uniform(0.3, 1.35, 1)[0]
        self.sim.model.body_pos[self.sim.model.body_name2id('goal_site_body')] = np.array([self.goal, 0, 0])
        return self.reset_model()
@@ -219,6 +215,16 @@ class ALRHopperXYJumpEnv(ALRHopperJumpEnv):
                    - np.array([self.goal, 0, 0])
        return np.concatenate((super(ALRHopperXYJumpEnv, self)._get_obs(), goal_diff))

    def set_context(self, context):
        # context is 4 dimensional
        qpos = self.init_qpos
        qvel = self.init_qvel
        qpos[-3:] = context[:3]
        self.goal = context[-1]
        self.set_state(qpos, qvel)
        self.sim.model.body_pos[self.sim.model.body_name2id('goal_site_body')] = np.array([self.goal, 0, 0])
        return self._get_obs()

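A small sketch of how the new set_context hook could be driven (an illustration with assumed defaults, not from the commit): the 4-D context is split into the last three qpos entries and the goal x-position, exactly as the assignments above read.

import numpy as np
from alr_envs.alr.mujoco import ALRHopperXYJumpEnv

env = ALRHopperXYJumpEnv()
env.reset()
context = np.array([0.0, 0.0, 0.0, 1.0])  # qpos[-3:] <- context[:3], goal <- context[-1]
obs = env.set_context(context)  # returns the observation for the new context
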
class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv):

    def __init__(self,
@@ -246,10 +252,6 @@ class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv):
                         reset_noise_scale, exclude_current_positions_from_observation, max_episode_steps)

    def step(self, action):
        print("")
        print('height_scale: ', self.height_scale)
        print('healthy_scale: ', self.healthy_scale)
        print('dist_scale: ', self.dist_scale)
        self._floor_geom_id = self.model.geom_name2id('floor')
        self._foot_geom_id = self.model.geom_name2id('foot_geom')

@@ -268,6 +270,23 @@ class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv):
        reward = -ctrl_cost + healthy_reward + dist_reward
        done = False
        observation = self._get_obs()


        ###########################################################
        # This is only for logging the distance to goal when first having the contact
        ##########################################################
        floor_contact = self._contact_checker(self._floor_geom_id,
                                              self._foot_geom_id) if not self.contact_with_floor else False
        if not self.init_floor_contact:
            self.init_floor_contact = floor_contact
        if self.init_floor_contact and not self.has_left_floor:
            self.has_left_floor = not floor_contact
        if not self.contact_with_floor and self.has_left_floor:
            self.contact_with_floor = floor_contact

        if self.contact_dist is None and self.contact_with_floor:
            self.contact_dist = np.linalg.norm(self.sim.data.site_xpos[self.model.site_name2id('foot_site')]
                                               - np.array([self.goal, 0, 0]))
        info = {
            'height': height_after,
            'x_pos': site_pos_after,
@@ -275,8 +294,9 @@ class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv):
            'goal': self.goal,
            'goal_dist': goal_dist,
            'height_rew': self.max_height,
            'healthy_reward': self.healthy_reward * 2,
            'healthy': self.is_healthy
            'healthy_reward': self.healthy_reward * self.healthy_reward,
            'healthy': self.is_healthy,
            'contact_dist': self.contact_dist if self.contact_dist is not None else 0
        }
        return observation, reward, done, info

@@ -361,7 +381,7 @@ if __name__ == '__main__':
    # env = ALRHopperJumpRndmPosEnv()
    obs = env.reset()

    for k in range(10):
    for k in range(1000):
        obs = env.reset()
        print('observation :', obs[:])
        for i in range(200):
@@ -39,3 +39,7 @@ class NewHighCtxtMPWrapper(NewMPWrapper):
            [True], # goal
            [False] * 3 # goal diff
        ])

    def set_context(self, context):
        return self.get_observation_from_step(self.env.env.set_context(context))
@@ -8,7 +8,8 @@ import alr_envs.utils.utils as alr_utils


class ALRReacherEnv(MujocoEnv, utils.EzPickle):
    def __init__(self, steps_before_reward=200, n_links=5, balance=False):
    def __init__(self, steps_before_reward: int = 200, n_links: int = 5, ctrl_cost_weight: int = 1,
                 balance: bool = False):
        utils.EzPickle.__init__(**locals())

        self._steps = 0
@@ -17,6 +18,7 @@ class ALRReacherEnv(MujocoEnv, utils.EzPickle):

        self.balance = balance
        self.balance_weight = 1.0
        self.ctrl_cost_weight = ctrl_cost_weight

        self.reward_weight = 1
        if steps_before_reward == 200:
@@ -40,7 +42,7 @@ class ALRReacherEnv(MujocoEnv, utils.EzPickle):
        angular_vel = 0.0
        reward_balance = 0.0
        is_delayed = self.steps_before_reward > 0
        reward_ctrl = - np.square(a).sum()
        reward_ctrl = - np.square(a).sum() * self.ctrl_cost_weight
        if self._steps >= self.steps_before_reward:
            vec = self.get_body_com("fingertip") - self.get_body_com("target")
            reward_dist -= self.reward_weight * np.linalg.norm(vec)
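For reference, a tiny numeric sketch of the scaled control cost introduced above (the action vector is made up; only the formula reward_ctrl = -sum(a**2) * ctrl_cost_weight is taken from the diff).

import numpy as np

a = np.array([0.1, -0.2, 0.3, 0.0, 0.1])   # example action for the 5-link reacher
for ctrl_cost_weight in (1, 1e-2):         # two of the swept weights
    reward_ctrl = -np.square(a).sum() * ctrl_cost_weight
    print(ctrl_cost_weight, reward_ctrl)   # 1 -> -0.15, 0.01 -> -0.0015 (up to float rounding)
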
@@ -48,9 +50,9 @@ class ALRReacherEnv(MujocoEnv, utils.EzPickle):
            # avoid giving this penalty for normal step based case
            # angular_vel -= 10 * np.linalg.norm(self.sim.data.qvel.flat[:self.n_links])
            angular_vel -= 10 * np.square(self.sim.data.qvel.flat[:self.n_links]).sum()
        if is_delayed:
            # Higher control penalty for sparse reward per timestep
            reward_ctrl *= 10
        # if is_delayed:
        #     # Higher control penalty for sparse reward per timestep
        #     reward_ctrl *= 10

        if self.balance:
            reward_balance -= self.balance_weight * np.abs(
@@ -68,35 +70,42 @@ class ALRReacherEnv(MujocoEnv, utils.EzPickle):
    def viewer_setup(self):
        self.viewer.cam.trackbodyid = 0

    def reset_model(self):
        qpos = self.init_qpos
        if not hasattr(self, "goal"):
            self.goal = np.array([-0.25, 0.25])
            # self.goal = self.init_qpos.copy()[:2] + 0.05
        qpos[-2:] = self.goal
        qvel = self.init_qvel
        qvel[-2:] = 0
        self.set_state(qpos, qvel)
        self._steps = 0

        return self._get_obs()

    # def reset_model(self):
    #     qpos = self.init_qpos.copy()
    #     while True:
    #         self.goal = self.np_random.uniform(low=-self.n_links / 10, high=self.n_links / 10, size=2)
    #         # self.goal = self.np_random.uniform(low=0, high=self.n_links / 10, size=2)
    #         # self.goal = np.random.uniform(low=[-self.n_links / 10, 0], high=[0, self.n_links / 10], size=2)
    #         if np.linalg.norm(self.goal) < self.n_links / 10:
    #             break
    #     qpos = self.init_qpos
    #     if not hasattr(self, "goal"):
    #         self.goal = np.array([-0.25, 0.25])
    #         # self.goal = self.init_qpos.copy()[:2] + 0.05
    #     qpos[-2:] = self.goal
    #     qvel = self.init_qvel.copy()
    #     qvel = self.init_qvel
    #     qvel[-2:] = 0
    #     self.set_state(qpos, qvel)
    #     self._steps = 0
    #
    #     return self._get_obs()

    def reset_model(self):
        qpos = self.init_qpos.copy()
        while True:
            # full space
            # self.goal = self.np_random.uniform(low=-self.n_links / 10, high=self.n_links / 10, size=2)
            # I Quadrant
            # self.goal = self.np_random.uniform(low=0, high=self.n_links / 10, size=2)
            # II Quadrant
            # self.goal = np.random.uniform(low=[-self.n_links / 10, 0], high=[0, self.n_links / 10], size=2)
            # II + III Quadrant
            # self.goal = np.random.uniform(low=-self.n_links / 10, high=[0, self.n_links / 10], size=2)
            # I + II Quadrant
            self.goal = np.random.uniform(low=[-self.n_links / 10, 0], high=self.n_links, size=2)
            if np.linalg.norm(self.goal) < self.n_links / 10:
                break
        qpos[-2:] = self.goal
        qvel = self.init_qvel.copy()
        qvel[-2:] = 0
        self.set_state(qpos, qvel)
        self._steps = 0

        return self._get_obs()

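A standalone sketch of the goal sampling used in the new reset_model above (assuming n_links = 5, as registered earlier): candidates are drawn from the I/II-quadrant box and rejected until one lies inside the reachable radius n_links / 10, so only the accepted goal's norm matters, not the wide upper bound of the box.

import numpy as np

n_links = 5
while True:
    goal = np.random.uniform(low=[-n_links / 10, 0], high=n_links, size=2)
    if np.linalg.norm(goal) < n_links / 10:
        break
print(goal)  # a reachable goal in the upper half-plane
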
    # def reset_model(self):
    #     qpos = self.np_random.uniform(low=-0.1, high=0.1, size=self.model.nq) + self.init_qpos
    #     while True:
@@ -98,7 +98,6 @@ def make(env_id: str, seed, **kwargs):

    return env


def _make_wrapped_env(env_id: str, wrappers: Iterable[Type[gym.Wrapper]], mp: MPInterface, controller: BaseController,
                      ep_wrapper_kwargs: Mapping, seed=1, **kwargs):
    """