From 36bf9b5b6a9ebb1484035a86d8c3263362e5a456 Mon Sep 17 00:00:00 2001
From: Maximilian Huettenrauch
Date: Fri, 7 May 2021 09:51:53 +0200
Subject: [PATCH] start contextual dmp wrapper

---
 alr_envs/__init__.py                        |  75 +++++
 alr_envs/classic_control/hole_reacher_v2.py | 307 ++++++++++++++++++++
 alr_envs/classic_control/simple_reacher.py  |  17 +-
 alr_envs/utils/mp_env_async_sampler.py      |  38 ++-
 alr_envs/utils/wrapper/dmp_wrapper.py       |  12 +-
 alr_envs/utils/wrapper/mp_wrapper.py        |   6 +-
 6 files changed, 445 insertions(+), 10 deletions(-)
 create mode 100644 alr_envs/classic_control/hole_reacher_v2.py

diff --git a/alr_envs/__init__.py b/alr_envs/__init__.py
index f814080..01e75ef 100644
--- a/alr_envs/__init__.py
+++ b/alr_envs/__init__.py
@@ -119,6 +119,16 @@ register(
     }
 )
 
+register(
+    id='SimpleReacher-v1',
+    entry_point='alr_envs.classic_control:SimpleReacherEnv',
+    max_episode_steps=200,
+    kwargs={
+        "n_links": 2,
+        "random_start": False
+    }
+)
+
 register(
     id='LongSimpleReacher-v0',
     entry_point='alr_envs.classic_control:SimpleReacherEnv',
@@ -154,8 +164,55 @@ register(
     }
 )
 
+register(
+    id='HoleReacher-v2',
+    entry_point='alr_envs.classic_control.hole_reacher_v2:HoleReacher',
+    max_episode_steps=200,
+    kwargs={
+        "n_links": 5,
+        "allow_self_collision": False,
+        "allow_wall_collision": False,
+        "hole_width": 0.25,
+        "hole_depth": 1,
+        "hole_x": 2,
+        "collision_penalty": 100,
+    }
+)
+
 # MP environments
+register(
+    id='SimpleReacherDMP-v0',
+    entry_point='alr_envs.utils.make_env_helpers:make_dmp_env',
+    # max_episode_steps=1,
+    kwargs={
+        "name": "alr_envs:SimpleReacher-v0",
+        "num_dof": 2,
+        "num_basis": 5,
+        "duration": 2,
+        "alpha_phase": 2,
+        "learn_goal": True,
+        "policy_type": "velocity",
+        "weights_scale": 50,
+    }
+)
+
+register(
+    id='SimpleReacherDMP-v1',
+    entry_point='alr_envs.utils.make_env_helpers:make_dmp_env',
+    # max_episode_steps=1,
+    kwargs={
+        "name": "alr_envs:SimpleReacher-v1",
+        "num_dof": 2,
+        "num_basis": 5,
+        "duration": 2,
+        "alpha_phase": 2,
+        "learn_goal": True,
+        "policy_type": "velocity",
+        "weights_scale": 50,
+    }
+)
+
 
 register(
     id='ViaPointReacherDMP-v0',
     entry_point='alr_envs.utils.make_env_helpers:make_dmp_env',
@@ -190,6 +247,24 @@ register(
     }
 )
 
+register(
+    id='HoleReacherDMP-v2',
+    entry_point='alr_envs.utils.make_env_helpers:make_dmp_env',
+    # max_episode_steps=1,
+    kwargs={
+        "name": "alr_envs:HoleReacher-v2",
+        "num_dof": 5,
+        "num_basis": 5,
+        "duration": 2,
+        "learn_goal": True,
+        "alpha_phase": 2,
+        "bandwidth_factor": 2,
+        "policy_type": "velocity",
+        "weights_scale": 50,
+        "goal_scale": 0.1
+    }
+)
+
 # TODO: properly add final_pos
 register(
     id='HoleReacherFixedGoalDMP-v0',
diff --git a/alr_envs/classic_control/hole_reacher_v2.py b/alr_envs/classic_control/hole_reacher_v2.py
new file mode 100644
index 0000000..028ab26
--- /dev/null
+++ b/alr_envs/classic_control/hole_reacher_v2.py
@@ -0,0 +1,307 @@
+import gym
+import numpy as np
+import matplotlib.pyplot as plt
+from matplotlib import patches
+from alr_envs.classic_control.utils import check_self_collision
+
+
+class HoleReacher(gym.Env):
+
+    def __init__(self, n_links, hole_x, hole_width, hole_depth, allow_self_collision=False,
+                 allow_wall_collision=False, collision_penalty=1000):
+
+        self.n_links = n_links
+        self.link_lengths = np.ones((n_links, 1))
+
+        # task
+        self.hole_x = hole_x  # x-position of center of hole
+        self.hole_width = hole_width  # width of hole
+        self.hole_depth = hole_depth  # depth of hole
+
+        self.bottom_center_of_hole = np.hstack([hole_x, -hole_depth])
+        self.top_center_of_hole = np.hstack([hole_x, 0])
+        self.left_wall_edge = np.hstack([hole_x - self.hole_width / 2, 0])
+        self.right_wall_edge = np.hstack([hole_x + self.hole_width / 2, 0])
+
+        # collision
+        self.allow_self_collision = allow_self_collision
+        self.allow_wall_collision = allow_wall_collision
+        self.collision_penalty = collision_penalty
+
+        # state
+        self._joints = None
+        self._joint_angles = None
+        self._angle_velocity = None
+        self.start_pos = np.hstack([[np.pi / 2], np.zeros(self.n_links - 1)])
+        self.start_vel = np.zeros(self.n_links)
+
+        self.dt = 0.01
+        # self.time_limit = 2
+
+        action_bound = np.pi * np.ones((self.n_links,))
+        state_bound = np.hstack([
+            [np.pi] * self.n_links,  # cos
+            [np.pi] * self.n_links,  # sin
+            [np.inf] * self.n_links,  # velocity
+            [np.inf] * 2,  # x-y coordinates of target distance
+            [np.inf]  # env steps, because reward starts after n steps TODO: Maybe
+        ])
+        self.action_space = gym.spaces.Box(low=-action_bound, high=action_bound, shape=action_bound.shape)
+        self.observation_space = gym.spaces.Box(low=-state_bound, high=state_bound, shape=state_bound.shape)
+
+        self.fig = None
+        rect_1 = patches.Rectangle((-self.n_links, -1),
+                                   self.n_links + self.hole_x - self.hole_width / 2, 1,
+                                   fill=True, edgecolor='k', facecolor='k')
+        rect_2 = patches.Rectangle((self.hole_x + self.hole_width / 2, -1),
+                                   self.n_links - self.hole_x + self.hole_width / 2, 1,
+                                   fill=True, edgecolor='k', facecolor='k')
+        rect_3 = patches.Rectangle((self.hole_x - self.hole_width / 2, -1), self.hole_width,
+                                   1 - self.hole_depth,
+                                   fill=True, edgecolor='k', facecolor='k')
+
+        rect_4 = patches.Rectangle((-1, 0),  # south west corner
+                                   0.5,  # width
+                                   self.n_links,  # height
+                                   fill=True, edgecolor='k', facecolor='k')
+
+        self.patches = [rect_1, rect_2, rect_3, rect_4]
+
+    @property
+    def end_effector(self):
+        return self._joints[self.n_links].T
+
+    def configure(self, context):
+        pass
+
+    def reset(self):
+        self._joint_angles = self.start_pos
+        self._angle_velocity = self.start_vel
+        self._joints = np.zeros((self.n_links + 1, 2))
+        self._update_joints()
+        self._steps = 0
+
+        return self._get_obs().copy()
+
+    def step(self, action: np.ndarray):
+        """
+        a single step with an action in joint velocity space
+        """
+        vel = action  # + 0.01 * np.random.randn(self.num_links)
+        acc = (vel - self._angle_velocity) / self.dt
+        self._angle_velocity = vel
+        self._joint_angles = self._joint_angles + self.dt * self._angle_velocity
+
+        self._update_joints()
+
+        # rew = self._reward()
+
+        # compute reward directly in step function
+
+        success = False
+        reward = 0
+        if not self._is_collided:
+            if self._steps == 199:
+                dist = np.linalg.norm(self.end_effector - self.bottom_center_of_hole)
+                reward = - dist ** 2
+                success = dist < 0.005
+        else:
+            dist = np.linalg.norm(self.end_effector - self.bottom_center_of_hole)
+            # if self.collision_penalty != 0:
+            #     reward = -self.collision_penalty
+            # else:
+            reward = - dist ** 2 - self.collision_penalty
+
+        reward -= 5e-8 * np.sum(acc ** 2)
+
+        info = {"is_collided": self._is_collided, "is_success": success}
+
+        self._steps += 1
+
+        # done = self._steps * self.dt > self.time_limit or self._is_collided
+        done = self._is_collided
+
+        return self._get_obs().copy(), reward, done, info
+
+    def _update_joints(self):
+        """
+        update _joints to get new end effector position. The other links are only required for rendering.
+        Returns:
+
+        """
+        line_points_in_taskspace = self.get_forward_kinematics(num_points_per_link=20)
+
+        self._joints[1:, 0] = self._joints[0, 0] + line_points_in_taskspace[:, -1, 0]
+        self._joints[1:, 1] = self._joints[0, 1] + line_points_in_taskspace[:, -1, 1]
+
+        self_collision = False
+        wall_collision = False
+
+        if not self.allow_self_collision:
+            self_collision = check_self_collision(line_points_in_taskspace)
+            if np.any(np.abs(self._joint_angles) > np.pi) and not self.allow_self_collision:
+                self_collision = True
+
+        if not self.allow_wall_collision:
+            wall_collision = self.check_wall_collision(line_points_in_taskspace)
+
+        self._is_collided = self_collision or wall_collision
+
+    def _get_obs(self):
+        theta = self._joint_angles
+        return np.hstack([
+            np.cos(theta),
+            np.sin(theta),
+            self._angle_velocity,
+            self.end_effector - self.bottom_center_of_hole,
+            self._steps
+        ])
+
+    def get_forward_kinematics(self, num_points_per_link=1):
+        theta = self._joint_angles[:, None]
+
+        if num_points_per_link > 1:
+            intermediate_points = np.linspace(0, 1, num_points_per_link)
+        else:
+            intermediate_points = 1
+
+        accumulated_theta = np.cumsum(theta, axis=0)
+
+        endeffector = np.zeros(shape=(self.n_links, num_points_per_link, 2))
+
+        x = np.cos(accumulated_theta) * self.link_lengths * intermediate_points
+        y = np.sin(accumulated_theta) * self.link_lengths * intermediate_points
+
+        endeffector[0, :, 0] = x[0, :]
+        endeffector[0, :, 1] = y[0, :]
+
+        for i in range(1, self.n_links):
+            endeffector[i, :, 0] = x[i, :] + endeffector[i - 1, -1, 0]
+            endeffector[i, :, 1] = y[i, :] + endeffector[i - 1, -1, 1]
+
+        return np.squeeze(endeffector + self._joints[0, :])
+
+    def check_wall_collision(self, line_points):
+
+        # all points that are inside the back wall between x = -1 and x = -0.5
+        r, c = np.where((line_points[:, :, 0] > -1) & (line_points[:, :, 0] < -0.5) &
+                        (line_points[:, :, 1] > 0) & (line_points[:, :, 1] < self.n_links))
+
+        if len(r) > 0:
+            return True
+
+        # all points that are before the hole in x
+        r, c = np.where(line_points[:, :, 0] < (self.hole_x - self.hole_width / 2))
+
+        # check if any of those points are below surface
+        nr_line_points_below_surface_before_hole = np.sum(line_points[r, c, 1] < 0)
+
+        if nr_line_points_below_surface_before_hole > 0:
+            return True
+
+        # all points that are after the hole in x
+        r, c = np.where(line_points[:, :, 0] > (self.hole_x + self.hole_width / 2))
+
+        # check if any of those points are below surface
+        nr_line_points_below_surface_after_hole = np.sum(line_points[r, c, 1] < 0)
+
+        if nr_line_points_below_surface_after_hole > 0:
+            return True
+
+        # all points that are within the x-range of the hole
+        r, c = np.where((line_points[:, :, 0] > (self.hole_x - self.hole_width / 2)) & (
+                line_points[:, :, 0] < (self.hole_x + self.hole_width / 2)))
+
+        # check if any of those points are below the bottom of the hole
+        nr_line_points_below_surface_in_hole = np.sum(line_points[r, c, 1] < -self.hole_depth)
+
+        if nr_line_points_below_surface_in_hole > 0:
+            return True
+
+        return False
+
+    def render(self, mode='human'):
+        if self.fig is None:
+            self.fig = plt.figure()
+            # plt.ion()
+            # plt.pause(0.01)
+        else:
+            plt.figure(self.fig.number)
+
+        if mode == "human":
+            plt.cla()
+            plt.title(f"Iteration: {self._steps}, distance: {self.end_effector - self.bottom_center_of_hole}")
+
+            # Arm
+            plt.plot(self._joints[:, 0], self._joints[:, 1], 'ro-', markerfacecolor='k')
+
+            # Add the patch to the Axes
+            [plt.gca().add_patch(rect) for rect in self.patches]
+
+            lim = np.sum(self.link_lengths) + 0.5
+            plt.xlim([-lim, lim])
+            plt.ylim([-1.1, lim])
+            # plt.draw()
+            plt.pause(1e-4)  # pushes window to foreground, which is annoying.
+            # self.fig.canvas.flush_events()
+
+        elif mode == "partial":
+            if self._steps == 1:
+                # fig, ax = plt.subplots()
+                # Add the patch to the Axes
+                [plt.gca().add_patch(rect) for rect in self.patches]
+                # plt.pause(0.01)
+
+            if self._steps % 20 == 0 or self._steps in [1, 199] or self._is_collided:
+                # Arm
+                plt.plot(self._joints[:, 0], self._joints[:, 1], 'ro-', markerfacecolor='k', alpha=self._steps / 200)
+                # ax.plot(line_points_in_taskspace[:, 0, 0],
+                #         line_points_in_taskspace[:, 0, 1],
+                #         line_points_in_taskspace[:, -1, 0],
+                #         line_points_in_taskspace[:, -1, 1], marker='o', color='k', alpha=t / 200)
+
+                lim = np.sum(self.link_lengths) + 0.5
+                plt.xlim([-lim, lim])
+                plt.ylim([-1.1, lim])
+                plt.pause(0.01)
+
+        elif mode == "final":
+            if self._steps == 199 or self._is_collided:
+                # fig, ax = plt.subplots()
+
+                # Add the patch to the Axes
+                [plt.gca().add_patch(rect) for rect in self.patches]
+
+                plt.xlim(-self.n_links, self.n_links), plt.ylim(-1, self.n_links)
+                # Arm
+                plt.plot(self._joints[:, 0], self._joints[:, 1], 'ro-', markerfacecolor='k')
+
+                plt.pause(0.01)
+
+    def close(self):
+        if self.fig is not None:
+            plt.close(self.fig)
+
+
+if __name__ == '__main__':
+    nl = 5
+    render_mode = "human"  # "human" or "partial" or "final"
+    env = HoleReacher(n_links=nl, allow_self_collision=False, allow_wall_collision=False, hole_width=0.15,
+                      hole_depth=1, hole_x=1)
+    env.reset()
+    # env.render(mode=render_mode)
+
+    for i in range(200):
+        # objective.load_result("/tmp/cma")
+        # test with random actions
+        ac = 2 * env.action_space.sample()
+        # ac[0] += np.pi/2
+        obs, rew, d, info = env.step(ac)
+        env.render(mode=render_mode)
+
+        print(rew)
+
+        if d:
+            break
+
+    env.close()
diff --git a/alr_envs/classic_control/simple_reacher.py b/alr_envs/classic_control/simple_reacher.py
index 296662c..8d552af 100644
--- a/alr_envs/classic_control/simple_reacher.py
+++ b/alr_envs/classic_control/simple_reacher.py
@@ -18,17 +18,20 @@ class SimpleReacherEnv(gym.Env):
     towards the end of the trajectory.
     """
 
-    def __init__(self, n_links):
+    def __init__(self, n_links, random_start=True):
         super().__init__()
         self.link_lengths = np.ones(n_links)
         self.n_links = n_links
         self.dt = 0.1
 
+        self.random_start = random_start
+
         self._goal_pos = None
 
         self._joints = None
         self._joint_angle = None
         self._angle_velocity = None
+        self._start_pos = None
 
         self.max_torque = 1  # 10
         self.steps_before_reward = 199
@@ -50,6 +53,10 @@ class SimpleReacherEnv(gym.Env):
         self._steps = 0
         self.seed()
 
+    @property
+    def start_pos(self):
+        return self._start_pos
+
     def step(self, action: np.ndarray):
 
         # action = self._add_action_noise(action)
@@ -85,6 +92,7 @@ class SimpleReacherEnv(gym.Env):
             np.sin(theta),
             self._angle_velocity,
             self.end_effector - self._goal_pos,
+            self._goal_pos,
             self._steps
         ])
 
@@ -116,7 +124,12 @@
 
         # TODO: maybe do initialisation more random?
         # Sample only orientation of first link, i.e. the arm is always straight.
-        self._joint_angle = np.hstack([[self.np_random.uniform(-np.pi, np.pi)], np.zeros(self.n_links - 1)])
+        if self.random_start:
+            self._joint_angle = np.hstack([[self.np_random.uniform(-np.pi, np.pi)], np.zeros(self.n_links - 1)])
+        else:
+            self._joint_angle = np.zeros(self.n_links)
+
+        self._start_pos = self._joint_angle
         self._angle_velocity = np.zeros(self.n_links)
         self._joints = np.zeros((self.n_links + 1, 2))
         self._update_joints()
diff --git a/alr_envs/utils/mp_env_async_sampler.py b/alr_envs/utils/mp_env_async_sampler.py
index 344cb37..b7c9c23 100644
--- a/alr_envs/utils/mp_env_async_sampler.py
+++ b/alr_envs/utils/mp_env_async_sampler.py
@@ -4,8 +4,8 @@ import numpy as np
 from _collections import defaultdict
 
 
-def make_env(env_id, rank, seed=0):
-    env = gym.make(env_id)
+def make_env(env_id, rank, seed=0, **env_kwargs):
+    env = gym.make(env_id, **env_kwargs)
     env.seed(seed + rank)
     return lambda: env
 
@@ -45,9 +45,9 @@ class AlrMpEnvSampler:
     An asynchronous sampler for non contextual MPWrapper environments. A sampler object can be called with a set of
     parameters and returns the corresponding final obs, rewards, dones and info dicts.
     """
-    def __init__(self, env_id, num_envs, seed=0):
+    def __init__(self, env_id, num_envs, seed=0, **env_kwargs):
         self.num_envs = num_envs
-        self.env = AsyncVectorEnv([make_env(env_id, seed, i) for i in range(num_envs)])
+        self.env = AsyncVectorEnv([make_env(env_id, seed, i, **env_kwargs) for i in range(num_envs)])
 
     def __call__(self, params):
         params = np.atleast_2d(params)
@@ -67,6 +67,36 @@ class AlrMpEnvSampler:
                _flatten_list(vals['done'])[:n_samples], _flatten_list(vals['info'])[:n_samples]
 
 
+class AlrContextualMpEnvSampler:
+    """
+    An asynchronous sampler for contextual MPWrapper environments. A sampler object can be called with a parameter
+    distribution and a number of samples and returns the corresponding final obs, rewards, dones and info dicts.
+    """
+    def __init__(self, env_id, num_envs, seed=0, **env_kwargs):
+        self.num_envs = num_envs
+        self.env = AsyncVectorEnv([make_env(env_id, seed, i, **env_kwargs) for i in range(num_envs)])
+
+    def __call__(self, dist, n_samples):
+
+        repeat = int(np.ceil(n_samples / self.env.num_envs))
+        vals = defaultdict(list)
+        for i in range(repeat):
+            obs = self.env.reset()
+
+            new_contexts = obs[-2]
+            new_samples = dist.sample(new_contexts)
+
+            obs, reward, done, info = self.env.step(new_samples)
+            vals['obs'].append(obs)
+            vals['reward'].append(reward)
+            vals['done'].append(done)
+            vals['info'].append(info)
+
+        # do not return more transitions than the requested n_samples
+        return np.vstack(vals['obs'])[:n_samples], np.hstack(vals['reward'])[:n_samples],\
+               _flatten_list(vals['done'])[:n_samples], _flatten_list(vals['info'])[:n_samples]
+
+
 if __name__ == "__main__":
     env_name = "alr_envs:ALRBallInACupSimpleDMP-v0"
     n_cpu = 8
diff --git a/alr_envs/utils/wrapper/dmp_wrapper.py b/alr_envs/utils/wrapper/dmp_wrapper.py
index 8f94227..283b845 100644
--- a/alr_envs/utils/wrapper/dmp_wrapper.py
+++ b/alr_envs/utils/wrapper/dmp_wrapper.py
@@ -36,9 +36,10 @@ class DmpWrapper(MPWrapper):
         dt = env.dt if hasattr(env, "dt") else dt
         assert dt is not None
         start_pos = start_pos if start_pos is not None else env.start_pos if hasattr(env, "start_pos") else None
-        assert start_pos is not None
+        # TODO: assert start_pos is not None  # start_pos will be set in initialize, do we need this here?
         if learn_goal:
-            final_pos = np.zeros_like(start_pos)  # arbitrary, will be learned
+            # final_pos = np.zeros_like(start_pos)  # arbitrary, will be learned
+            final_pos = np.zeros((1, num_dof))  # arbitrary, will be learned
         else:
             final_pos = final_pos if final_pos is not None else start_pos if return_to_start else None
             assert final_pos is not None
@@ -62,7 +63,10 @@ class DmpWrapper(MPWrapper):
         dmp = dmps.DMP(num_dof=num_dof, basis_generator=basis_generator, phase_generator=phase_generator,
                        num_time_steps=int(duration / dt), dt=dt)
 
-        dmp.dmp_start_pos = start_pos.reshape((1, num_dof))
+        # dmp.dmp_start_pos = start_pos.reshape((1, num_dof))
+        # in a contextual environment, the start_pos may not be fixed, set in mp_rollout?
+        # TODO: Should we set start_pos in init at all? It's only used after calling rollout anyway...
+        dmp.dmp_start_pos = start_pos.reshape((1, num_dof)) if start_pos is not None else np.zeros((1, num_dof))
 
         weights = np.zeros((num_basis, num_dof))
         goal_pos = np.zeros(num_dof) if self.learn_goal else final_pos
@@ -87,6 +91,8 @@ class DmpWrapper(MPWrapper):
         return goal_pos * self.goal_scale, weight_matrix * self.weights_scale
 
     def mp_rollout(self, action):
+        if self.mp.start_pos is None:
+            self.mp.start_pos = self.env.start_pos
         goal_pos, weight_matrix = self.goal_and_weights(action)
         self.mp.set_weights(weight_matrix, goal_pos)
         return self.mp.reference_trajectory(self.t)
diff --git a/alr_envs/utils/wrapper/mp_wrapper.py b/alr_envs/utils/wrapper/mp_wrapper.py
index 34f0440..f60cc8c 100644
--- a/alr_envs/utils/wrapper/mp_wrapper.py
+++ b/alr_envs/utils/wrapper/mp_wrapper.py
@@ -61,6 +61,9 @@ class MPWrapper(gym.Wrapper, ABC):
     def configure(self, context):
         self.env.configure(context)
 
+    def reset(self):
+        return self.env.reset()
+
     def step(self, action: np.ndarray):
         """ This function generates a trajectory based on a DMP and then does the usual loop over reset and step"""
         trajectory, velocity = self.mp_rollout(action)
@@ -78,8 +81,9 @@ class MPWrapper(gym.Wrapper, ABC):
         # TODO: @Max Why do we need this configure, states should be part of the model
         # TODO: Ask Onur if the context distribution needs to be outside the environment
         # TODO: For now create a new env with each context
+        # TODO: Explicitly call reset before step to obtain context from obs?
         # self.env.configure(context)
-        obs = self.env.reset()
+        # obs = self.env.reset()
 
         info = {}
         for t, pos_vel in enumerate(zip(trajectory, velocity)):
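
Usage note (reviewer sketch, not part of the patch): the new AlrContextualMpEnvSampler is driven by a parameter distribution rather than a flat parameter array. Inside __call__ it resets the vectorized environments, reads the new contexts from the reset observation, asks dist.sample(new_contexts) for one DMP parameter vector per context, and steps the environments with those parameters. The snippet below is only a minimal stand-in for that interface: GaussianParamDist is hypothetical and not part of alr_envs, the parameter count of 12 is derived from the SimpleReacherDMP-v1 registration above (2 dof x 5 basis weights plus a learned 2-D goal), and the actual sampler call is shown commented out for orientation.

import numpy as np


class GaussianParamDist:
    """Hypothetical context-conditioned Gaussian over DMP parameters (learned goal + basis weights)."""

    def __init__(self, param_dim):
        self.mean = np.zeros(param_dim)
        self.std = np.ones(param_dim)

    def sample(self, contexts):
        # one parameter vector per context row, matching the dist.sample(new_contexts) call in the sampler
        contexts = np.atleast_2d(contexts)
        return self.mean + self.std * np.random.randn(contexts.shape[0], self.mean.shape[0])


if __name__ == "__main__":
    # SimpleReacherDMP-v1: 2 dof * 5 basis weights + 2 learned goal values = 12 parameters
    dist = GaussianParamDist(param_dim=12)
    params = dist.sample(np.zeros((4, 3)))  # four dummy context vectors, shape chosen arbitrarily for the stub
    print(params.shape)  # (4, 12)

    # with the patch applied, the contextual sampler would be driven roughly like this:
    # from alr_envs.utils.mp_env_async_sampler import AlrContextualMpEnvSampler
    # sampler = AlrContextualMpEnvSampler("alr_envs:SimpleReacherDMP-v1", num_envs=4)
    # final_obs, rewards, dones, infos = sampler(dist, n_samples=8)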