diff --git a/alr_envs/alr/__init__.py b/alr_envs/alr/__init__.py
index f91e004..552adfa 100644
--- a/alr_envs/alr/__init__.py
+++ b/alr_envs/alr/__init__.py
@@ -538,16 +538,68 @@ for _v in _versions:
             "wrappers": [classic_control.hole_reacher.MPWrapper],
             "mp_kwargs": {
                 "num_dof": 5,
-                "num_basis": 5,
+                "num_basis": 3,
                 "duration": 2,
                 "policy_type": "velocity",
-                "weights_scale": 0.1,
+                "weights_scale": 5,
                 "zero_start": True
             }
         }
     )
     ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append(_env_id)
 
+## ALRReacher
+_versions = ["ALRReacher-v0", "ALRLongReacher-v0", "ALRReacherSparse-v0", "ALRLongReacherSparse-v0"]
+for _v in _versions:
+    _name = _v.split("-")
+    _env_id = f'{_name[0]}DMP-{_name[1]}'
+    register(
+        id=_env_id,
+        entry_point='alr_envs.utils.make_env_helpers:make_dmp_env_helper',
+        # max_episode_steps=1,
+        kwargs={
+            "name": f"alr_envs:{_v}",
+            "wrappers": [mujoco.reacher.MPWrapper],
+            "mp_kwargs": {
+                "num_dof": 5 if "long" not in _v.lower() else 7,
+                "num_basis": 2,
+                "duration": 4,
+                "alpha_phase": 2,
+                "learn_goal": True,
+                "policy_type": "motor",
+                "weights_scale": 5,
+                "policy_kwargs": {
+                    "p_gains": 1,
+                    "d_gains": 0.1
+                }
+            }
+        }
+    )
+    ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["DMP"].append(_env_id)
+
+    _env_id = f'{_name[0]}ProMP-{_name[1]}'
+    register(
+        id=_env_id,
+        entry_point='alr_envs.utils.make_env_helpers:make_promp_env_helper',
+        kwargs={
+            "name": f"alr_envs:{_v}",
+            "wrappers": [mujoco.reacher.MPWrapper],
+            "mp_kwargs": {
+                "num_dof": 5 if "long" not in _v.lower() else 7,
+                "num_basis": 2,
+                "duration": 4,
+                "policy_type": "motor",
+                "weights_scale": 5,
+                "zero_start": True,
+                "policy_kwargs": {
+                    "p_gains": 1,
+                    "d_gains": 0.1
+                }
+            }
+        }
+    )
+    ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append(_env_id)
+
 # ## Beerpong
 # _versions = ["v0", "v1"]
 # for _v in _versions:
diff --git a/alr_envs/alr/classic_control/hole_reacher/hole_reacher.py b/alr_envs/alr/classic_control/hole_reacher/hole_reacher.py
index 208f005..883be8c 100644
--- a/alr_envs/alr/classic_control/hole_reacher/hole_reacher.py
+++ b/alr_envs/alr/classic_control/hole_reacher/hole_reacher.py
@@ -45,6 +45,9 @@ class HoleReacherEnv(BaseReacherDirectEnv):
         elif rew_fct == "vel_acc":
             from alr_envs.alr.classic_control.hole_reacher.hr_dist_vel_acc_reward import HolereacherReward
             self.reward_function = HolereacherReward(allow_self_collision, allow_wall_collision, collision_penalty)
+        elif rew_fct == "unbounded":
+            from alr_envs.alr.classic_control.hole_reacher.hr_unbounded_reward import HolereacherReward
+            self.reward_function = HolereacherReward(allow_self_collision, allow_wall_collision)
         else:
             raise ValueError("Unknown reward function {}".format(rew_fct))
 
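Note on using the new reward branch above: it is selected purely through the existing `rew_fct` constructor argument of `HoleReacherEnv`. A minimal, hedged usage sketch follows; the env id `alr_envs:HoleReacher-v0` and the kwargs-override behaviour of `gym.make` are assumptions for illustration and are not part of this diff:

```python
import gym

# Assumption: a plain HoleReacher id is registered elsewhere in alr/__init__.py;
# gym.make merges these kwargs with the registered ones, so only rew_fct changes.
env = gym.make("alr_envs:HoleReacher-v0", rew_fct="unbounded")

obs = env.reset()
for _ in range(200):  # the unbounded reward only pays out at step 199 or on collision
    obs, reward, done, info = env.step(env.action_space.sample())
    if done:
        break
```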
diff --git a/alr_envs/alr/classic_control/hole_reacher/hr_unbounded_reward.py b/alr_envs/alr/classic_control/hole_reacher/hr_unbounded_reward.py
new file mode 100644
index 0000000..7ed13a1
--- /dev/null
+++ b/alr_envs/alr/classic_control/hole_reacher/hr_unbounded_reward.py
@@ -0,0 +1,60 @@
+import numpy as np
+
+
+class HolereacherReward:
+    def __init__(self, allow_self_collision, allow_wall_collision):
+
+        # collision
+        self.allow_self_collision = allow_self_collision
+        self.allow_wall_collision = allow_wall_collision
+        self._is_collided = False
+
+        self.reward_factors = np.array((1, -5e-6))
+
+    def reset(self):
+        self._is_collided = False
+
+    def get_reward(self, env):
+        dist_reward = 0
+        success = False
+
+        self_collision = False
+        wall_collision = False
+
+        if not self.allow_self_collision:
+            self_collision = env._check_self_collision()
+
+        if not self.allow_wall_collision:
+            wall_collision = env.check_wall_collision()
+
+        self._is_collided = self_collision or wall_collision
+
+        if env._steps == 180 or self._is_collided:
+            self.end_eff_pos = np.copy(env.end_effector)
+
+        if env._steps == 199 or self._is_collided:
+            # return reward only in last time step
+            # Episode also terminates when colliding, hence return reward
+            dist = np.linalg.norm(self.end_eff_pos - env._goal)
+
+            if self._is_collided:
+                dist_reward = 0.25 * np.exp(- dist)
+            else:
+                if env.end_effector[1] > 0:
+                    dist_reward = np.exp(- dist)
+                else:
+                    dist_reward = 1 - self.end_eff_pos[1]
+
+            success = not self._is_collided
+
+        info = {"is_success": success,
+                "is_collided": self._is_collided,
+                "end_effector": np.copy(env.end_effector),
+                "joints": np.copy(env.current_pos)}
+
+        acc_cost = np.sum(env._acc ** 2)
+
+        reward_features = np.array((dist_reward, acc_cost))
+        reward = np.dot(reward_features, self.reward_factors)
+
+        return reward, info
\ No newline at end of file
diff --git a/alr_envs/alr/mujoco/reacher/__init__.py b/alr_envs/alr/mujoco/reacher/__init__.py
index e69de29..989b5a9 100644
--- a/alr_envs/alr/mujoco/reacher/__init__.py
+++ b/alr_envs/alr/mujoco/reacher/__init__.py
@@ -0,0 +1 @@
+from .mp_wrapper import MPWrapper
\ No newline at end of file
diff --git a/alr_envs/alr/mujoco/reacher/assets/reacher_5links.xml b/alr_envs/alr/mujoco/reacher/assets/reacher_5links.xml
index 07be257..25a3208 100644
--- a/alr_envs/alr/mujoco/reacher/assets/reacher_5links.xml
+++ b/alr_envs/alr/mujoco/reacher/assets/reacher_5links.xml
@@ -1,54 +1,57 @@
diff --git a/alr_envs/alr/mujoco/reacher/mp_wrapper.py b/alr_envs/alr/mujoco/reacher/mp_wrapper.py
new file mode 100644
index 0000000..abcdc50
--- /dev/null
+++ b/alr_envs/alr/mujoco/reacher/mp_wrapper.py
@@ -0,0 +1,43 @@
+from typing import Union
+
+import numpy as np
+from mp_env_api import MPEnvWrapper
+
+
+class MPWrapper(MPEnvWrapper):
+
+    @property
+    def active_obs(self):
+        return np.concatenate([
+            [False] * self.n_links,  # cos
+            [False] * self.n_links,  # sin
+            [True] * 2,  # goal position
+            [False] * self.n_links,  # angular velocity
+            [False] * 3,  # goal distance
+            # self.get_body_com("target"),  # only return target to make problem harder
+            [False],  # step
+        ])
+
+    # @property
+    # def active_obs(self):
+    #     return np.concatenate([
+    #         [True] * self.n_links,  # cos, True
+    #         [True] * self.n_links,  # sin, True
+    #         [True] * 2,  # goal position
+    #         [True] * self.n_links,  # angular velocity, True
+    #         [True] * 3,  # goal distance
+    #         # self.get_body_com("target"),  # only return target to make problem harder
+    #         [False],  # step
+    #     ])
+
+    @property
+    def current_vel(self) -> Union[float, int, np.ndarray]:
+        return self.sim.data.qvel.flat[:self.n_links]
+
+    @property
+    def current_pos(self) -> Union[float, int, np.ndarray]:
+        return self.sim.data.qpos.flat[:self.n_links]
+
+    @property
+    def dt(self) -> Union[float, int]:
+        return self.env.dt
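For reference, a hedged sketch of how the newly registered reacher motion-primitive variants can be instantiated. The ids follow the `f'{name}DMP-{version}'` / `f'{name}ProMP-{version}'` pattern from the registration loop above; the package-level `alr_envs.make` helper is assumed here, as in the repo's examples:

```python
import alr_envs

# e.g. "ALRReacherSparseProMP-v0" or "ALRLongReacherDMP-v0"
env = alr_envs.make("ALRReacherSparseProMP-v0", seed=1)

obs = env.reset()
# One MP action is a full weight vector; stepping it rolls out the whole episode
# through the motor policy parameterized by p_gains/d_gains in mp_kwargs.
w = env.action_space.sample()
obs, reward, done, info = env.step(w)
print(reward, info)
```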
diff --git a/alr_envs/examples/pd_control_gain_tuning.py b/alr_envs/examples/pd_control_gain_tuning.py
index 465042c..1683c6f 100644
--- a/alr_envs/examples/pd_control_gain_tuning.py
+++ b/alr_envs/examples/pd_control_gain_tuning.py
@@ -2,70 +2,98 @@ import numpy as np
 from matplotlib import pyplot as plt
 
 from alr_envs import dmc, meta
+from alr_envs.alr import mujoco
 from alr_envs.utils.make_env_helpers import make_promp_env
 
+
+def visualize(env):
+    t = env.t
+    pos_features = env.mp.basis_generator.basis(t)
+    plt.plot(t, pos_features)
+    plt.show()
+
+
 # This might work for some environments, however, please verify either way the correct trajectory information
 # for your environment are extracted below
-SEED = 10
-env_id = "ball_in_cup-catch"
-wrappers = [dmc.ball_in_cup.MPWrapper]
+SEED = 1
+# env_id = "ball_in_cup-catch"
+env_id = "ALRReacherSparse-v0"
+env_id = "button-press-v2"
+wrappers = [mujoco.reacher.MPWrapper]
+wrappers = [meta.goal_object_change_mp_wrapper.MPWrapper]
 
 mp_kwargs = {
-    "num_dof": 2,
-    "num_basis": 10,
-    "duration": 2,
-    "width": 0.025,
-    "policy_type": "motor",
-    "weights_scale": 1,
+    "num_dof": 4,
+    "num_basis": 5,
+    "duration": 6.25,
+    "policy_type": "metaworld",
+    "weights_scale": 10,
     "zero_start": True,
-    "policy_kwargs": {
-        "p_gains": 1,
-        "d_gains": 1
-    }
+    # "policy_kwargs": {
+    #     "p_gains": 1,
+    #     "d_gains": 0.1
+    # }
 }
 
-kwargs = dict(time_limit=2, episode_length=100)
+# kwargs = dict(time_limit=4, episode_length=200)
+kwargs = {}
 
-env = make_promp_env(env_id, wrappers, seed=SEED, mp_kwargs=mp_kwargs,
-                     **kwargs)
+env = make_promp_env(env_id, wrappers, seed=SEED, mp_kwargs=mp_kwargs, **kwargs)
+env.action_space.seed(SEED)
 
 # Plot difference between real trajectory and target MP trajectory
 env.reset()
-pos, vel = env.mp_rollout(env.action_space.sample())
+w = env.action_space.sample()  # N(0,1)
+visualize(env)
+pos, vel = env.mp_rollout(w)
 
 base_shape = env.full_action_space.shape
 actual_pos = np.zeros((len(pos), *base_shape))
 actual_vel = np.zeros((len(pos), *base_shape))
 act = np.zeros((len(pos), *base_shape))
 
+plt.ion()
+fig = plt.figure()
+ax = fig.add_subplot(1, 1, 1)
+img = ax.imshow(env.env.render("rgb_array"))
+fig.show()
+
 for t, pos_vel in enumerate(zip(pos, vel)):
     actions = env.policy.get_action(pos_vel[0], pos_vel[1], env.current_vel, env.current_pos)
     actions = np.clip(actions, env.full_action_space.low, env.full_action_space.high)
     _, _, _, _ = env.env.step(actions)
+    if t % 15 == 0:
+        img.set_data(env.env.render("rgb_array"))
+        fig.canvas.draw()
+        fig.canvas.flush_events()
     act[t, :] = actions
 
     # TODO verify for your environment
     actual_pos[t, :] = env.current_pos
-    actual_vel[t, :] = env.current_vel
+    actual_vel[t, :] = 0  # env.current_vel
 
 plt.figure(figsize=(15, 5))
 
 plt.subplot(131)
 plt.title("Position")
-plt.plot(actual_pos, c='C0', label=["true" if i == 0 else "" for i in range(np.prod(base_shape))])
+p1 = plt.plot(actual_pos, c='C0', label="true")
 # plt.plot(actual_pos_ball, label="true pos ball")
-plt.plot(pos, c='C1', label=["MP" if i == 0 else "" for i in range(np.prod(base_shape))])
+p2 = plt.plot(pos, c='C1', label="MP")  # , label=["MP" if i == 0 else None for i in range(np.prod(base_shape))])
 plt.xlabel("Episode steps")
-plt.legend()
+# plt.legend()
+handles, labels = plt.gca().get_legend_handles_labels()
+from collections import OrderedDict
+
+by_label = OrderedDict(zip(labels, handles))
+plt.legend(by_label.values(), by_label.keys())
 
 plt.subplot(132)
 plt.title("Velocity")
-plt.plot(actual_vel, c='C0', label=[f"true" if i == 0 else "" for i in range(np.prod(base_shape))])
-plt.plot(vel, c='C1', label=[f"MP" if i == 0 else "" for i in range(np.prod(base_shape))])
+plt.plot(actual_vel, c='C0', label="true")
+plt.plot(vel, c='C1', label="MP")
 plt.xlabel("Episode steps")
-plt.legend()
 
 plt.subplot(133)
-plt.title("Actions")
+plt.title(f"Actions {np.std(act, axis=0)}")
 plt.plot(act, c="C0"),  # label=[f"actions" if i == 0 else "" for i in range(np.prod(base_action_shape))])
 plt.xlabel("Episode steps")
 # plt.legend()
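A possible follow-up to the gain-tuning script, not part of this diff: summarizing the tracking error per joint so different `p_gains`/`d_gains` settings can be compared numerically rather than only by eye. It only relies on the `pos` and `actual_pos` arrays collected in the rollout loop above:

```python
import numpy as np


def tracking_rmse(desired: np.ndarray, actual: np.ndarray) -> np.ndarray:
    """Root-mean-square tracking error per DoF over the rollout."""
    return np.sqrt(np.mean((desired - actual) ** 2, axis=0))


# e.g. after the rollout loop:
# print("position RMSE per joint:", tracking_rmse(pos, actual_pos))
```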