From d313795cec6f18a3c6e694b54c7863efd2c92ae2 Mon Sep 17 00:00:00 2001
From: Fabian
Date: Thu, 7 Apr 2022 14:40:43 +0200
Subject: [PATCH] reacher adjustments

---
 alr_envs/alr/__init__.py                          |  12 +-
 alr_envs/alr/mujoco/reacher/alr_reacher.py        |  60 ++++++----
 .../mujoco/reacher/assets/reacher_5links.xml      | 107 +++++++++---------
 alr_envs/alr/mujoco/reacher/mp_wrapper.py         |  22 +++-
 alr_envs/examples/pd_control_gain_tuning.py       |  38 +++++--
 5 files changed, 141 insertions(+), 98 deletions(-)

diff --git a/alr_envs/alr/__init__.py b/alr_envs/alr/__init__.py
index 53f292c..8a7140d 100644
--- a/alr_envs/alr/__init__.py
+++ b/alr_envs/alr/__init__.py
@@ -357,13 +357,13 @@ for _v in _versions:
                 "num_basis": 5,
                 "duration": 2,
                 "policy_type": "velocity",
-                "weights_scale": 0.1,
+                "weights_scale": 5,
                 "zero_start": True
             }
         }
     )
     ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append(_env_id)
-    
+
 ## ALRReacher
 _versions = ["ALRReacher-v0", "ALRLongReacher-v0", "ALRReacherSparse-v0", "ALRLongReacherSparse-v0"]
 for _v in _versions:
@@ -378,12 +378,12 @@ for _v in _versions:
             "wrappers": [mujoco.reacher.MPWrapper],
             "mp_kwargs": {
                 "num_dof": 5 if "long" not in _v.lower() else 7,
-                "num_basis": 5,
+                "num_basis": 2,
                 "duration": 4,
                 "alpha_phase": 2,
                 "learn_goal": True,
                 "policy_type": "motor",
-                "weights_scale": 1,
+                "weights_scale": 5,
                 "policy_kwargs": {
                     "p_gains": 1,
                     "d_gains": 0.1
@@ -402,10 +402,10 @@ for _v in _versions:
            "wrappers": [mujoco.reacher.MPWrapper],
            "mp_kwargs": {
                "num_dof": 5 if "long" not in _v.lower() else 7,
-                "num_basis": 5,
+                "num_basis": 1,
                "duration": 4,
                "policy_type": "motor",
-                "weights_scale": 1,
+                "weights_scale": 5,
                "zero_start": True,
                "policy_kwargs": {
                    "p_gains": 1,
diff --git a/alr_envs/alr/mujoco/reacher/alr_reacher.py b/alr_envs/alr/mujoco/reacher/alr_reacher.py
index e21eaed..c2b5f18 100644
--- a/alr_envs/alr/mujoco/reacher/alr_reacher.py
+++ b/alr_envs/alr/mujoco/reacher/alr_reacher.py
@@ -44,9 +44,9 @@ class ALRReacherEnv(MujocoEnv, utils.EzPickle):
             reward_dist -= self.reward_weight * np.linalg.norm(vec)
             if self.steps_before_reward > 0:
                 # avoid giving this penalty for normal step based case
-                angular_vel -= np.linalg.norm(self.sim.data.qvel.flat[:self.n_links])
-                # angular_vel -= np.square(self.sim.data.qvel.flat[:self.n_links]).sum()
-        reward_ctrl = - np.square(a).sum()
+                # angular_vel -= 10 * np.linalg.norm(self.sim.data.qvel.flat[:self.n_links])
+                angular_vel -= 10 * np.square(self.sim.data.qvel.flat[:self.n_links]).sum()
+        reward_ctrl = - 10 * np.square(a).sum()

         if self.balance:
             reward_balance -= self.balance_weight * np.abs(
@@ -64,6 +64,35 @@ class ALRReacherEnv(MujocoEnv, utils.EzPickle):
     def viewer_setup(self):
         self.viewer.cam.trackbodyid = 0

+    # def reset_model(self):
+    #     qpos = self.init_qpos
+    #     if not hasattr(self, "goal"):
+    #         self.goal = np.array([-0.25, 0.25])
+    #         # self.goal = self.init_qpos.copy()[:2] + 0.05
+    #     qpos[-2:] = self.goal
+    #     qvel = self.init_qvel
+    #     qvel[-2:] = 0
+    #     self.set_state(qpos, qvel)
+    #     self._steps = 0
+    #
+    #     return self._get_obs()
+
+    def reset_model(self):
+        qpos = self.init_qpos.copy()
+        while True:
+            self.goal = self.np_random.uniform(low=-self.n_links / 10, high=self.n_links / 10, size=2)
+            # self.goal = self.np_random.uniform(low=0, high=self.n_links / 10, size=2)
+            # self.goal = np.random.uniform(low=[-self.n_links / 10, 0], high=[0, self.n_links / 10], size=2)
+            if np.linalg.norm(self.goal) < self.n_links / 10:
+                break
+        qpos[-2:] = self.goal
+        qvel = self.init_qvel.copy()
+        qvel[-2:] = 0
+        self.set_state(qpos, qvel)
+        self._steps = 0
+
+        return self._get_obs()
+
     # def reset_model(self):
     #     qpos = self.np_random.uniform(low=-0.1, high=0.1, size=self.model.nq) + self.init_qpos
     #     while True:
@@ -78,30 +107,15 @@ class ALRReacherEnv(MujocoEnv, utils.EzPickle):
     #
     #     return self._get_obs()

-    def reset_model(self):
-        qpos = self.init_qpos
-        if not hasattr(self, "goal"):
-            while True:
-                self.goal = self.np_random.uniform(low=-self.n_links / 10, high=self.n_links / 10, size=2)
-                if np.linalg.norm(self.goal) < self.n_links / 10:
-                    break
-        qpos[-2:] = self.goal
-        qvel = self.init_qvel
-        qvel[-2:] = 0
-        self.set_state(qpos, qvel)
-        self._steps = 0
-
-        return self._get_obs()
-
     def _get_obs(self):
         theta = self.sim.data.qpos.flat[:self.n_links]
+        target = self.get_body_com("target")
         return np.concatenate([
             np.cos(theta),
             np.sin(theta),
-            self.sim.data.qpos.flat[self.n_links:],  # this is goal position
-            self.sim.data.qvel.flat[:self.n_links],  # this is angular velocity
-            self.get_body_com("fingertip") - self.get_body_com("target"),
-            # self.get_body_com("target"),  # only return target to make problem harder
+            target[:2],  # x-y of goal position
+            self.sim.data.qvel.flat[:self.n_links],  # angular velocity
+            self.get_body_com("fingertip") - target,  # goal distance
             [self._steps],
         ])

@@ -122,4 +136,4 @@ if __name__ == '__main__':
         if d:
             env.reset()

-    env.close()
\ No newline at end of file
+    env.close()
diff --git a/alr_envs/alr/mujoco/reacher/assets/reacher_5links.xml b/alr_envs/alr/mujoco/reacher/assets/reacher_5links.xml
index 07be257..25a3208 100644
--- a/alr_envs/alr/mujoco/reacher/assets/reacher_5links.xml
+++ b/alr_envs/alr/mujoco/reacher/assets/reacher_5links.xml
@@ -1,54 +1,57 @@
-
-
-
-
-
-
\ No newline at end of file
diff --git a/alr_envs/alr/mujoco/reacher/mp_wrapper.py b/alr_envs/alr/mujoco/reacher/mp_wrapper.py
index 027a2e2..abcdc50 100644
--- a/alr_envs/alr/mujoco/reacher/mp_wrapper.py
+++ b/alr_envs/alr/mujoco/reacher/mp_wrapper.py
@@ -9,15 +9,27 @@ class MPWrapper(MPEnvWrapper):
     @property
     def active_obs(self):
         return np.concatenate([
-            [True] * self.n_links,  # cos
-            [True] * self.n_links,  # sin
+            [False] * self.n_links,  # cos
+            [False] * self.n_links,  # sin
             [True] * 2,  # goal position
-            [True] * self.n_links,  # angular velocity
-            [True] * 3,  # goal distance
+            [False] * self.n_links,  # angular velocity
+            [False] * 3,  # goal distance
             # self.get_body_com("target"),  # only return target to make problem harder
-            [False],  # step
+            [False],  # step
         ])

+    # @property
+    # def active_obs(self):
+    #     return np.concatenate([
+    #         [True] * self.n_links,  # cos, True
+    #         [True] * self.n_links,  # sin, True
+    #         [True] * 2,  # goal position
+    #         [True] * self.n_links,  # angular velocity, True
+    #         [True] * 3,  # goal distance
+    #         # self.get_body_com("target"),  # only return target to make problem harder
+    #         [False],  # step
+    #     ])
+
     @property
     def current_vel(self) -> Union[float, int, np.ndarray]:
         return self.sim.data.qvel.flat[:self.n_links]
diff --git a/alr_envs/examples/pd_control_gain_tuning.py b/alr_envs/examples/pd_control_gain_tuning.py
index bdcaa41..eff432a 100644
--- a/alr_envs/examples/pd_control_gain_tuning.py
+++ b/alr_envs/examples/pd_control_gain_tuning.py
@@ -12,34 +12,38 @@ def visualize(env):
     plt.plot(t, pos_features)
     plt.show()

+
 # This might work for some environments, however, please verify either way the correct trajectory information
 # for your environment are extracted below
 SEED = 1

 # env_id = "ball_in_cup-catch"
 env_id = "ALRReacherSparse-v0"
+env_id = "button-press-v2"
 wrappers = [mujoco.reacher.MPWrapper]
+wrappers = [meta.goal_object_change_mp_wrapper.MPWrapper]

 mp_kwargs = {
-    "num_dof": 5,
-    "num_basis": 8,
-    "duration": 4,
-    "policy_type": "motor",
-    "weights_scale": 1,
+    "num_dof": 4,
+    "num_basis": 5,
+    "duration": 6.25,
+    "policy_type": "metaworld",
+    "weights_scale": 10,
     "zero_start": True,
-    "policy_kwargs": {
-        "p_gains": 1,
-        "d_gains": 0.1
-    }
+    # "policy_kwargs": {
+    #     "p_gains": 1,
+    #     "d_gains": 0.1
+    # }
 }

 # kwargs = dict(time_limit=4, episode_length=200)
 kwargs = {}

 env = make_promp_env(env_id, wrappers, seed=SEED, mp_kwargs=mp_kwargs, **kwargs)
+env.action_space.seed(SEED)

 # Plot difference between real trajectory and target MP trajectory
 env.reset()
-w = env.action_space.sample() * 10
+w = env.action_space.sample()  # N(0,1)
 visualize(env)
 pos, vel = env.mp_rollout(w)

@@ -48,14 +52,24 @@
 actual_pos = np.zeros((len(pos), *base_shape))
 actual_vel = np.zeros((len(pos), *base_shape))
 act = np.zeros((len(pos), *base_shape))

+plt.ion()
+fig = plt.figure()
+ax = fig.add_subplot(1, 1, 1)
+img = ax.imshow(env.env.render("rgb_array"))
+fig.show()
+
 for t, pos_vel in enumerate(zip(pos, vel)):
     actions = env.policy.get_action(pos_vel[0], pos_vel[1])
     actions = np.clip(actions, env.full_action_space.low, env.full_action_space.high)
     _, _, _, _ = env.env.step(actions)
+    if t % 15 == 0:
+        img.set_data(env.env.render("rgb_array"))
+        fig.canvas.draw()
+        fig.canvas.flush_events()
     act[t, :] = actions
     # TODO verify for your environment
     actual_pos[t, :] = env.current_pos
-    actual_vel[t, :] = env.current_vel
+    actual_vel[t, :] = 0  # env.current_vel

 plt.figure(figsize=(15, 5))
@@ -79,7 +93,7 @@
 plt.plot(vel, c='C1', label="MP")
 plt.xlabel("Episode steps")
 plt.subplot(133)
-plt.title("Actions")
+plt.title(f"Actions {np.std(act, axis=0)}")
 plt.plot(act, c="C0"),  # label=[f"actions" if i == 0 else "" for i in range(np.prod(base_action_shape))])
 plt.xlabel("Episode steps")
 # plt.legend()
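
A minimal usage sketch (not part of the patch): it exercises the re-tuned ProMP registration for ALRReacherSparse-v0 in the same way as examples/pd_control_gain_tuning.py above. The import locations of make_promp_env and the mujoco wrapper package are assumptions taken from that example script and may differ in your checkout of alr_envs.

# Sketch only -- assumes the import paths used by examples/pd_control_gain_tuning.py.
import numpy as np

from alr_envs.alr import mujoco                               # assumed module path
from alr_envs.utils.make_env_helpers import make_promp_env    # assumed helper location

mp_kwargs = {
    "num_dof": 5,
    "num_basis": 1,            # matches the ProMP reacher registration in this patch
    "duration": 4,
    "policy_type": "motor",
    "weights_scale": 5,
    "zero_start": True,
    "policy_kwargs": {"p_gains": 1, "d_gains": 0.1},
}

env = make_promp_env("ALRReacherSparse-v0", [mujoco.reacher.MPWrapper],
                     seed=1, mp_kwargs=mp_kwargs)
env.action_space.seed(1)

env.reset()
w = env.action_space.sample()      # ProMP weights
pos, vel = env.mp_rollout(w)       # desired trajectory generated by the motion primitive

for des_pos, des_vel in zip(pos, vel):
    # PD controller turns desired position/velocity into motor commands
    a = env.policy.get_action(des_pos, des_vel)
    a = np.clip(a, env.full_action_space.low, env.full_action_space.high)
    env.env.step(a)                # step the underlying MuJoCo env directly
env.close()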