Started cleaning up init. DMP envs are still not transferred; wrappers for various environments are still missing.
parent 9ad6fbe712
commit 8fe6a83271
File diff suppressed because it is too large
@@ -178,8 +178,6 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
            [self._steps],
        ])

    def compute_reward(self):

    @property
    def dt(self):
        return super(ALRBeerBongEnv, self).dt * self.repeat_action
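
Note on the dt property above: gym's MujocoEnv.dt is the model timestep times frame_skip, so this override stretches the effective control timestep by repeat_action. A rough sketch of the arithmetic with purely illustrative numbers (the actual model timestep and repeat_action are not shown in this hunk and are assumptions here):

    # Illustrative only: real values come from the MuJoCo XML and the env setup.
    model_timestep = 0.002   # assumed MuJoCo option timestep
    frame_skip = 2           # assumed frame skip
    repeat_action = 2        # assumed; set elsewhere in ALRBeerBongEnv
    effective_dt = model_timestep * frame_skip * repeat_action  # what the dt property returns
    control_hz = 1.0 / effective_dt                             # 125 Hz with the numbers above
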
@@ -213,37 +211,37 @@ class ALRBeerBongEnvStepBasedEpisodicReward(ALRBeerBongEnv):
        return ob, reward, done, infos


class ALRBeerBongEnvStepBased(ALRBeerBongEnv):
    def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None):
        super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos)
        self.release_step = 62  # empirically evaluated for frame_skip=2!

    def step(self, a):
        if self._steps < self.release_step:
            return super(ALRBeerBongEnvStepBased, self).step(a)
        else:
            reward = 0
            done = False
            while not done:
                sub_ob, sub_reward, done, sub_infos = super(ALRBeerBongEnvStepBased, self).step(np.zeros(a.shape))
                if not done or sub_infos['sim_crash']:
                    reward += sub_reward
                else:
                    ball_pos = self.sim.data.body_xpos[self.sim.model._body_name2id["ball"]].copy()
                    cup_goal_dist_final = np.linalg.norm(ball_pos - self.sim.data.site_xpos[
                        self.sim.model._site_name2id["cup_goal_final_table"]].copy())
                    cup_goal_dist_top = np.linalg.norm(ball_pos - self.sim.data.site_xpos[
                        self.sim.model._site_name2id["cup_goal_table"]].copy())
                    if sub_infos['success']:
                        dist_rew = -cup_goal_dist_final ** 2
                    else:
                        dist_rew = -0.5 * cup_goal_dist_final ** 2 - cup_goal_dist_top ** 2
                    reward = reward - sub_infos['action_cost'] + dist_rew
            infos = sub_infos
            ob = sub_ob
            ob[-1] = self.release_step + 1  # Since we simulate until the end of the episode, PPO does not see the
            # internal steps and thus, the observation also needs to be set correctly
            return ob, reward, done, infos

# class ALRBeerBongEnvStepBased(ALRBeerBongEnv):
#     def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False, rndm_goal=False, cup_goal_pos=None):
#         super().__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos)
#         self.release_step = 62  # empirically evaluated for frame_skip=2!
#
#     def step(self, a):
#         if self._steps < self.release_step:
#             return super(ALRBeerBongEnvStepBased, self).step(a)
#         else:
#             reward = 0
#             done = False
#             while not done:
#                 sub_ob, sub_reward, done, sub_infos = super(ALRBeerBongEnvStepBased, self).step(np.zeros(a.shape))
#                 if not done or sub_infos['sim_crash']:
#                     reward += sub_reward
#                 else:
#                     ball_pos = self.sim.data.body_xpos[self.sim.model._body_name2id["ball"]].copy()
#                     cup_goal_dist_final = np.linalg.norm(ball_pos - self.sim.data.site_xpos[
#                         self.sim.model._site_name2id["cup_goal_final_table"]].copy())
#                     cup_goal_dist_top = np.linalg.norm(ball_pos - self.sim.data.site_xpos[
#                         self.sim.model._site_name2id["cup_goal_table"]].copy())
#                     if sub_infos['success']:
#                         dist_rew = -cup_goal_dist_final ** 2
#                     else:
#                         dist_rew = -0.5 * cup_goal_dist_final ** 2 - cup_goal_dist_top ** 2
#                     reward = reward - sub_infos['action_cost'] + dist_rew
#             infos = sub_infos
#             ob = sub_ob
#             ob[-1] = self.release_step + 1  # Since we simulate until the end of the episode, PPO does not see the
#             # internal steps and thus, the observation also needs to be set correctly
#             return ob, reward, done, infos

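For orientation, a minimal rollout sketch for the step-based variant above. This is hypothetical driver code (it could sit in the __main__ block below), assuming numpy is imported as np and that the action space comes from the parent ALRBeerBongEnv:

    # Hypothetical sketch: construct the env directly and roll out with random actions.
    env = ALRBeerBongEnvStepBased(frame_skip=2, rndm_goal=True)  # release_step=62 was tuned for frame_skip=2
    ob = env.reset()
    done = False
    while not done:
        a = env.action_space.sample()  # placeholder for a learned policy
        ob, reward, done, info = env.step(a)
        # Once self._steps reaches release_step, this single step() call simulates the
        # remainder of the episode with zero actions and returns the final shaped reward.
    print(info.get('success'), reward)
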
if __name__ == "__main__":
@@ -298,76 +298,6 @@ class ALRHopperXYJumpEnvStepBased(ALRHopperXYJumpEnv):
        return observation, reward, done, info


class ALRHopperJumpRndmPosEnv(ALRHopperJumpEnv):
    def __init__(self, max_episode_steps=250):
        super(ALRHopperJumpRndmPosEnv, self).__init__(exclude_current_positions_from_observation=False,
                                                      reset_noise_scale=5e-1,
                                                      max_episode_steps=max_episode_steps)

    def reset_model(self):
        self._floor_geom_id = self.model.geom_name2id('floor')
        self._foot_geom_id = self.model.geom_name2id('foot_geom')
        noise_low = -np.ones(self.model.nq) * self._reset_noise_scale
        noise_low[1] = 0
        noise_low[2] = 0
        noise_low[3] = -0.2
        noise_low[4] = -0.2
        noise_low[5] = -0.1

        noise_high = np.ones(self.model.nq) * self._reset_noise_scale
        noise_high[1] = 0
        noise_high[2] = 0
        noise_high[3] = 0
        noise_high[4] = 0
        noise_high[5] = 0.1

        rnd_vec = self.np_random.uniform(low=noise_low, high=noise_high, size=self.model.nq)
        # rnd_vec[2] *= 0.05  # the angle around the y axis shouldn't be too high as the agent then falls down quickly and
        # can not recover
        # rnd_vec[1] = np.clip(rnd_vec[1], 0, 0.3)
        qpos = self.init_qpos + rnd_vec
        qvel = self.init_qvel

        self.set_state(qpos, qvel)

        observation = self._get_obs()
        return observation

    def step(self, action):

        self.current_step += 1
        self.do_simulation(action, self.frame_skip)

        self.contact_with_floor = self._contact_checker(self._floor_geom_id, self._foot_geom_id) if not \
            self.contact_with_floor else True

        height_after = self.get_body_com("torso")[2]
        self.max_height = max(height_after, self.max_height) if self.contact_with_floor else 0

        ctrl_cost = self.control_cost(action)
        costs = ctrl_cost
        done = False

        if self.current_step >= self.max_episode_steps:
            healthy_reward = 0
            height_reward = self._forward_reward_weight * self.max_height  # maybe move reward calculation into if structure and define two different _forward_reward_weight variables for context and episodic separately
            rewards = height_reward + healthy_reward

        else:
            # penalty for wrong start direction of first two joints; not needed, could be removed
            rewards = ((action[:2] > 0) * self.penalty).sum() if self.current_step < 10 else 0

        observation = self._get_obs()
        reward = rewards - costs
        info = {
            'height': height_after,
            'max_height': self.max_height,
            'goal': self.goal
        }

        return observation, reward, done, info

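Similarly, a minimal rollout sketch for the randomized-start hopper above. This is hypothetical driver code (the actual demo lives in the __main__ block below), assuming numpy is imported as np:

    # Hypothetical sketch: the height reward is only paid on the final step;
    # intermediate steps mostly contribute control costs and the early-step penalty.
    env = ALRHopperJumpRndmPosEnv(max_episode_steps=250)
    ob = env.reset()  # calls reset_model(), i.e. a randomized start within the bounds above
    ret = 0.0
    for _ in range(env.max_episode_steps):
        ob, reward, done, info = env.step(np.zeros(env.action_space.shape))
        ret += reward
    print(info['max_height'], ret)
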
if __name__ == '__main__':
    render_mode = "human"  # "human" or "partial" or "final"
    # env = ALRHopperJumpEnv()