From 36bf9b5b6a9ebb1484035a86d8c3263362e5a456 Mon Sep 17 00:00:00 2001 From: Maximilian Huettenrauch Date: Fri, 7 May 2021 09:51:53 +0200 Subject: [PATCH 01/11] start contextual dmp wrapper --- alr_envs/__init__.py | 75 +++++ alr_envs/classic_control/hole_reacher_v2.py | 307 ++++++++++++++++++++ alr_envs/classic_control/simple_reacher.py | 17 +- alr_envs/utils/mp_env_async_sampler.py | 38 ++- alr_envs/utils/wrapper/dmp_wrapper.py | 12 +- alr_envs/utils/wrapper/mp_wrapper.py | 6 +- 6 files changed, 445 insertions(+), 10 deletions(-) create mode 100644 alr_envs/classic_control/hole_reacher_v2.py diff --git a/alr_envs/__init__.py b/alr_envs/__init__.py index f814080..01e75ef 100644 --- a/alr_envs/__init__.py +++ b/alr_envs/__init__.py @@ -119,6 +119,16 @@ register( } ) +register( + id='SimpleReacher-v1', + entry_point='alr_envs.classic_control:SimpleReacherEnv', + max_episode_steps=200, + kwargs={ + "n_links": 2, + "random_start": False + } +) + register( id='LongSimpleReacher-v0', entry_point='alr_envs.classic_control:SimpleReacherEnv', @@ -154,8 +164,55 @@ register( } ) +register( + id='HoleReacher-v2', + entry_point='alr_envs.classic_control.hole_reacher_v2:HoleReacher', + max_episode_steps=200, + kwargs={ + "n_links": 5, + "allow_self_collision": False, + "allow_wall_collision": False, + "hole_width": 0.25, + "hole_depth": 1, + "hole_x": 2, + "collision_penalty": 100, + } +) + # MP environments +register( + id='SimpleReacherDMP-v0', + entry_point='alr_envs.utils.make_env_helpers:make_dmp_env', + # max_episode_steps=1, + kwargs={ + "name": "alr_envs:SimpleReacher-v0", + "num_dof": 2, + "num_basis": 5, + "duration": 2, + "alpha_phase": 2, + "learn_goal": True, + "policy_type": "velocity", + "weights_scale": 50, + } +) + +register( + id='SimpleReacherDMP-v1', + entry_point='alr_envs.utils.make_env_helpers:make_dmp_env', + # max_episode_steps=1, + kwargs={ + "name": "alr_envs:SimpleReacher-v1", + "num_dof": 2, + "num_basis": 5, + "duration": 2, + "alpha_phase": 2, + "learn_goal": True, + "policy_type": "velocity", + "weights_scale": 50, + } +) + register( id='ViaPointReacherDMP-v0', entry_point='alr_envs.utils.make_env_helpers:make_dmp_env', @@ -190,6 +247,24 @@ register( } ) +register( + id='HoleReacherDMP-v2', + entry_point='alr_envs.utils.make_env_helpers:make_dmp_env', + # max_episode_steps=1, + kwargs={ + "name": "alr_envs:HoleReacher-v2", + "num_dof": 5, + "num_basis": 5, + "duration": 2, + "learn_goal": True, + "alpha_phase": 2, + "bandwidth_factor": 2, + "policy_type": "velocity", + "weights_scale": 50, + "goal_scale": 0.1 + } +) + # TODO: properly add final_pos register( id='HoleReacherFixedGoalDMP-v0', diff --git a/alr_envs/classic_control/hole_reacher_v2.py b/alr_envs/classic_control/hole_reacher_v2.py new file mode 100644 index 0000000..028ab26 --- /dev/null +++ b/alr_envs/classic_control/hole_reacher_v2.py @@ -0,0 +1,307 @@ +import gym +import numpy as np +import matplotlib.pyplot as plt +from matplotlib import patches +from alr_envs.classic_control.utils import check_self_collision + + +class HoleReacher(gym.Env): + + def __init__(self, n_links, hole_x, hole_width, hole_depth, allow_self_collision=False, + allow_wall_collision=False, collision_penalty=1000): + + self.n_links = n_links + self.link_lengths = np.ones((n_links, 1)) + + # task + self.hole_x = hole_x # x-position of center of hole + self.hole_width = hole_width # width of hole + self.hole_depth = hole_depth # depth of hole + + self.bottom_center_of_hole = np.hstack([hole_x, -hole_depth]) + self.top_center_of_hole = 
np.hstack([hole_x, 0]) + self.left_wall_edge = np.hstack([hole_x - self.hole_width / 2, 0]) + self.right_wall_edge = np.hstack([hole_x + self.hole_width / 2, 0]) + + # collision + self.allow_self_collision = allow_self_collision + self.allow_wall_collision = allow_wall_collision + self.collision_penalty = collision_penalty + + # state + self._joints = None + self._joint_angles = None + self._angle_velocity = None + self.start_pos = np.hstack([[np.pi / 2], np.zeros(self.n_links - 1)]) + self.start_vel = np.zeros(self.n_links) + + self.dt = 0.01 + # self.time_limit = 2 + + action_bound = np.pi * np.ones((self.n_links,)) + state_bound = np.hstack([ + [np.pi] * self.n_links, # cos + [np.pi] * self.n_links, # sin + [np.inf] * self.n_links, # velocity + [np.inf] * 2, # x-y coordinates of target distance + [np.inf] # env steps, because reward start after n steps TODO: Maybe + ]) + self.action_space = gym.spaces.Box(low=-action_bound, high=action_bound, shape=action_bound.shape) + self.observation_space = gym.spaces.Box(low=-state_bound, high=state_bound, shape=state_bound.shape) + + self.fig = None + rect_1 = patches.Rectangle((-self.n_links, -1), + self.n_links + self.hole_x - self.hole_width / 2, 1, + fill=True, edgecolor='k', facecolor='k') + rect_2 = patches.Rectangle((self.hole_x + self.hole_width / 2, -1), + self.n_links - self.hole_x + self.hole_width / 2, 1, + fill=True, edgecolor='k', facecolor='k') + rect_3 = patches.Rectangle((self.hole_x - self.hole_width / 2, -1), self.hole_width, + 1 - self.hole_depth, + fill=True, edgecolor='k', facecolor='k') + + rect_4 = patches.Rectangle((-1, 0), # south west corner + 0.5, # width + self.n_links, # height + fill=True, edgecolor='k', facecolor='k') + + self.patches = [rect_1, rect_2, rect_3, rect_4] + + @property + def end_effector(self): + return self._joints[self.n_links].T + + def configure(self, context): + pass + + def reset(self): + self._joint_angles = self.start_pos + self._angle_velocity = self.start_vel + self._joints = np.zeros((self.n_links + 1, 2)) + self._update_joints() + self._steps = 0 + + return self._get_obs().copy() + + def step(self, action: np.ndarray): + """ + a single step with an action in joint velocity space + """ + vel = action # + 0.01 * np.random.randn(self.num_links) + acc = (vel - self._angle_velocity) / self.dt + self._angle_velocity = vel + self._joint_angles = self._joint_angles + self.dt * self._angle_velocity + + self._update_joints() + + # rew = self._reward() + + # compute reward directly in step function + + success = False + reward = 0 + if not self._is_collided: + if self._steps == 199: + dist = np.linalg.norm(self.end_effector - self.bottom_center_of_hole) + reward = - dist ** 2 + success = dist < 0.005 + else: + dist = np.linalg.norm(self.end_effector - self.bottom_center_of_hole) + # if self.collision_penalty != 0: + # reward = -self.collision_penalty + # else: + reward = - dist ** 2 - self.collision_penalty + + reward -= 5e-8 * np.sum(acc ** 2) + + info = {"is_collided": self._is_collided, "is_success": success} + + self._steps += 1 + + # done = self._steps * self.dt > self.time_limit or self._is_collided + done = self._is_collided + + return self._get_obs().copy(), reward, done, info + + def _update_joints(self): + """ + update _joints to get new end effector position. The other links are only required for rendering. 
+ Returns: + + """ + line_points_in_taskspace = self.get_forward_kinematics(num_points_per_link=20) + + self._joints[1:, 0] = self._joints[0, 0] + line_points_in_taskspace[:, -1, 0] + self._joints[1:, 1] = self._joints[0, 1] + line_points_in_taskspace[:, -1, 1] + + self_collision = False + wall_collision = False + + if not self.allow_self_collision: + self_collision = check_self_collision(line_points_in_taskspace) + if np.any(np.abs(self._joint_angles) > np.pi) and not self.allow_self_collision: + self_collision = True + + if not self.allow_wall_collision: + wall_collision = self.check_wall_collision(line_points_in_taskspace) + + self._is_collided = self_collision or wall_collision + + def _get_obs(self): + theta = self._joint_angles + return np.hstack([ + np.cos(theta), + np.sin(theta), + self._angle_velocity, + self.end_effector - self.bottom_center_of_hole, + self._steps + ]) + + def get_forward_kinematics(self, num_points_per_link=1): + theta = self._joint_angles[:, None] + + if num_points_per_link > 1: + intermediate_points = np.linspace(0, 1, num_points_per_link) + else: + intermediate_points = 1 + + accumulated_theta = np.cumsum(theta, axis=0) + + endeffector = np.zeros(shape=(self.n_links, num_points_per_link, 2)) + + x = np.cos(accumulated_theta) * self.link_lengths * intermediate_points + y = np.sin(accumulated_theta) * self.link_lengths * intermediate_points + + endeffector[0, :, 0] = x[0, :] + endeffector[0, :, 1] = y[0, :] + + for i in range(1, self.n_links): + endeffector[i, :, 0] = x[i, :] + endeffector[i - 1, -1, 0] + endeffector[i, :, 1] = y[i, :] + endeffector[i - 1, -1, 1] + + return np.squeeze(endeffector + self._joints[0, :]) + + def check_wall_collision(self, line_points): + + # all points that are before the hole in x + r, c = np.where((line_points[:, :, 0] > -1) & (line_points[:, :, 0] < -0.5) & + (line_points[:, :, 1] > 0) & (line_points[:, :, 1] < self.n_links)) + + if len(r) > 0: + return True + + # all points that are before the hole in x + r, c = np.where(line_points[:, :, 0] < (self.hole_x - self.hole_width / 2)) + + # check if any of those points are below surface + nr_line_points_below_surface_before_hole = np.sum(line_points[r, c, 1] < 0) + + if nr_line_points_below_surface_before_hole > 0: + return True + + # all points that are after the hole in x + r, c = np.where(line_points[:, :, 0] > (self.hole_x + self.hole_width / 2)) + + # check if any of those points are below surface + nr_line_points_below_surface_after_hole = np.sum(line_points[r, c, 1] < 0) + + if nr_line_points_below_surface_after_hole > 0: + return True + + # all points that are above the hole + r, c = np.where((line_points[:, :, 0] > (self.hole_x - self.hole_width / 2)) & ( + line_points[:, :, 0] < (self.hole_x + self.hole_width / 2))) + + # check if any of those points are below surface + nr_line_points_below_surface_in_hole = np.sum(line_points[r, c, 1] < -self.hole_depth) + + if nr_line_points_below_surface_in_hole > 0: + return True + + return False + + def render(self, mode='human'): + if self.fig is None: + self.fig = plt.figure() + # plt.ion() + # plt.pause(0.01) + else: + plt.figure(self.fig.number) + + if mode == "human": + plt.cla() + plt.title(f"Iteration: {self._steps}, distance: {self.end_effector - self.bottom_center_of_hole}") + + # Arm + plt.plot(self._joints[:, 0], self._joints[:, 1], 'ro-', markerfacecolor='k') + + # Add the patch to the Axes + [plt.gca().add_patch(rect) for rect in self.patches] + + lim = np.sum(self.link_lengths) + 0.5 + plt.xlim([-lim, lim]) + 
plt.ylim([-1.1, lim]) + # plt.draw() + plt.pause(1e-4) # pushes window to foreground, which is annoying. + # self.fig.canvas.flush_events() + + elif mode == "partial": + if self._steps == 1: + # fig, ax = plt.subplots() + # Add the patch to the Axes + [plt.gca().add_patch(rect) for rect in self.patches] + # plt.pause(0.01) + + if self._steps % 20 == 0 or self._steps in [1, 199] or self._is_collided: + # Arm + plt.plot(self._joints[:, 0], self._joints[:, 1], 'ro-', markerfacecolor='k', alpha=self._steps / 200) + # ax.plot(line_points_in_taskspace[:, 0, 0], + # line_points_in_taskspace[:, 0, 1], + # line_points_in_taskspace[:, -1, 0], + # line_points_in_taskspace[:, -1, 1], marker='o', color='k', alpha=t / 200) + + lim = np.sum(self.link_lengths) + 0.5 + plt.xlim([-lim, lim]) + plt.ylim([-1.1, lim]) + plt.pause(0.01) + + elif mode == "final": + if self._steps == 199 or self._is_collided: + # fig, ax = plt.subplots() + + # Add the patch to the Axes + [plt.gca().add_patch(rect) for rect in self.patches] + + plt.xlim(-self.n_links, self.n_links), plt.ylim(-1, self.n_links) + # Arm + plt.plot(self._joints[:, 0], self._joints[:, 1], 'ro-', markerfacecolor='k') + + plt.pause(0.01) + + def close(self): + if self.fig is not None: + plt.close(self.fig) + + +if __name__ == '__main__': + nl = 5 + render_mode = "human" # "human" or "partial" or "final" + env = HoleReacher(n_links=nl, allow_self_collision=False, allow_wall_collision=False, hole_width=0.15, + hole_depth=1, hole_x=1) + env.reset() + # env.render(mode=render_mode) + + for i in range(200): + # objective.load_result("/tmp/cma") + # test with random actions + ac = 2 * env.action_space.sample() + # ac[0] += np.pi/2 + obs, rew, d, info = env.step(ac) + env.render(mode=render_mode) + + print(rew) + + if d: + break + + env.close() diff --git a/alr_envs/classic_control/simple_reacher.py b/alr_envs/classic_control/simple_reacher.py index 296662c..8d552af 100644 --- a/alr_envs/classic_control/simple_reacher.py +++ b/alr_envs/classic_control/simple_reacher.py @@ -18,17 +18,20 @@ class SimpleReacherEnv(gym.Env): towards the end of the trajectory. """ - def __init__(self, n_links): + def __init__(self, n_links, random_start=True): super().__init__() self.link_lengths = np.ones(n_links) self.n_links = n_links self.dt = 0.1 + self.random_start = random_start + self._goal_pos = None self._joints = None self._joint_angle = None self._angle_velocity = None + self._start_pos = None self.max_torque = 1 # 10 self.steps_before_reward = 199 @@ -50,6 +53,10 @@ class SimpleReacherEnv(gym.Env): self._steps = 0 self.seed() + @property + def start_pos(self): + return self._start_pos + def step(self, action: np.ndarray): # action = self._add_action_noise(action) @@ -85,6 +92,7 @@ class SimpleReacherEnv(gym.Env): np.sin(theta), self._angle_velocity, self.end_effector - self._goal_pos, + self._goal_pos, self._steps ]) @@ -116,7 +124,12 @@ class SimpleReacherEnv(gym.Env): # TODO: maybe do initialisation more random? # Sample only orientation of first link, i.e. the arm is always straight. 
- self._joint_angle = np.hstack([[self.np_random.uniform(-np.pi, np.pi)], np.zeros(self.n_links - 1)]) + if self.random_start: + self._joint_angle = np.hstack([[self.np_random.uniform(-np.pi, np.pi)], np.zeros(self.n_links - 1)]) + else: + self._joint_angle = np.zeros(self.n_links) + + self._start_pos = self._joint_angle self._angle_velocity = np.zeros(self.n_links) self._joints = np.zeros((self.n_links + 1, 2)) self._update_joints() diff --git a/alr_envs/utils/mp_env_async_sampler.py b/alr_envs/utils/mp_env_async_sampler.py index 344cb37..b7c9c23 100644 --- a/alr_envs/utils/mp_env_async_sampler.py +++ b/alr_envs/utils/mp_env_async_sampler.py @@ -4,8 +4,8 @@ import numpy as np from _collections import defaultdict -def make_env(env_id, rank, seed=0): - env = gym.make(env_id) +def make_env(env_id, rank, seed=0, **env_kwargs): + env = gym.make(env_id, **env_kwargs) env.seed(seed + rank) return lambda: env @@ -45,9 +45,9 @@ class AlrMpEnvSampler: An asynchronous sampler for non contextual MPWrapper environments. A sampler object can be called with a set of parameters and returns the corresponding final obs, rewards, dones and info dicts. """ - def __init__(self, env_id, num_envs, seed=0): + def __init__(self, env_id, num_envs, seed=0, **env_kwargs): self.num_envs = num_envs - self.env = AsyncVectorEnv([make_env(env_id, seed, i) for i in range(num_envs)]) + self.env = AsyncVectorEnv([make_env(env_id, seed, i, **env_kwargs) for i in range(num_envs)]) def __call__(self, params): params = np.atleast_2d(params) @@ -67,6 +67,36 @@ class AlrMpEnvSampler: _flatten_list(vals['done'])[:n_samples], _flatten_list(vals['info'])[:n_samples] +class AlrContextualMpEnvSampler: + """ + An asynchronous sampler for non contextual MPWrapper environments. A sampler object can be called with a set of + parameters and returns the corresponding final obs, rewards, dones and info dicts. + """ + def __init__(self, env_id, num_envs, seed=0, **env_kwargs): + self.num_envs = num_envs + self.env = AsyncVectorEnv([make_env(env_id, seed, i, **env_kwargs) for i in range(num_envs)]) + + def __call__(self, dist, n_samples): + + repeat = int(np.ceil(n_samples / self.env.num_envs)) + vals = defaultdict(list) + for i in range(repeat): + obs = self.env.reset() + + new_contexts = obs[-2] + new_samples = dist.sample(new_contexts) + + obs, reward, done, info = self.env.step(p) + vals['obs'].append(obs) + vals['reward'].append(reward) + vals['done'].append(done) + vals['info'].append(info) + + # do not return values above threshold + return np.vstack(vals['obs'])[:n_samples], np.hstack(vals['reward'])[:n_samples],\ + _flatten_list(vals['done'])[:n_samples], _flatten_list(vals['info'])[:n_samples] + + if __name__ == "__main__": env_name = "alr_envs:ALRBallInACupSimpleDMP-v0" n_cpu = 8 diff --git a/alr_envs/utils/wrapper/dmp_wrapper.py b/alr_envs/utils/wrapper/dmp_wrapper.py index 8f94227..283b845 100644 --- a/alr_envs/utils/wrapper/dmp_wrapper.py +++ b/alr_envs/utils/wrapper/dmp_wrapper.py @@ -36,9 +36,10 @@ class DmpWrapper(MPWrapper): dt = env.dt if hasattr(env, "dt") else dt assert dt is not None start_pos = start_pos if start_pos is not None else env.start_pos if hasattr(env, "start_pos") else None - assert start_pos is not None + # TODO: assert start_pos is not None # start_pos will be set in initialize, do we need this here? 
if learn_goal: - final_pos = np.zeros_like(start_pos) # arbitrary, will be learned + # final_pos = np.zeros_like(start_pos) # arbitrary, will be learned + final_pos = np.zeros((1, num_dof)) # arbitrary, will be learned else: final_pos = final_pos if final_pos is not None else start_pos if return_to_start else None assert final_pos is not None @@ -62,7 +63,10 @@ class DmpWrapper(MPWrapper): dmp = dmps.DMP(num_dof=num_dof, basis_generator=basis_generator, phase_generator=phase_generator, num_time_steps=int(duration / dt), dt=dt) - dmp.dmp_start_pos = start_pos.reshape((1, num_dof)) + # dmp.dmp_start_pos = start_pos.reshape((1, num_dof)) + # in a contextual environment, the start_pos may be not fixed, set in mp_rollout? + # TODO: Should we set start_pos in init at all? It's only used after calling rollout anyway... + dmp.dmp_start_pos = start_pos.reshape((1, num_dof)) if start_pos is not None else np.zeros((1, num_dof)) weights = np.zeros((num_basis, num_dof)) goal_pos = np.zeros(num_dof) if self.learn_goal else final_pos @@ -87,6 +91,8 @@ class DmpWrapper(MPWrapper): return goal_pos * self.goal_scale, weight_matrix * self.weights_scale def mp_rollout(self, action): + if self.mp.start_pos is None: + self.mp.start_pos = self.env.start_pos goal_pos, weight_matrix = self.goal_and_weights(action) self.mp.set_weights(weight_matrix, goal_pos) return self.mp.reference_trajectory(self.t) diff --git a/alr_envs/utils/wrapper/mp_wrapper.py b/alr_envs/utils/wrapper/mp_wrapper.py index 34f0440..f60cc8c 100644 --- a/alr_envs/utils/wrapper/mp_wrapper.py +++ b/alr_envs/utils/wrapper/mp_wrapper.py @@ -61,6 +61,9 @@ class MPWrapper(gym.Wrapper, ABC): def configure(self, context): self.env.configure(context) + def reset(self): + return self.env.reset() + def step(self, action: np.ndarray): """ This function generates a trajectory based on a DMP and then does the usual loop over reset and step""" trajectory, velocity = self.mp_rollout(action) @@ -78,8 +81,9 @@ class MPWrapper(gym.Wrapper, ABC): # TODO: @Max Why do we need this configure, states should be part of the model # TODO: Ask Onur if the context distribution needs to be outside the environment # TODO: For now create a new env with each context + # TODO: Explicitly call reset before step to obtain context from obs? 
# self.env.configure(context) - obs = self.env.reset() + # obs = self.env.reset() info = {} for t, pos_vel in enumerate(zip(trajectory, velocity)): From b4ad3e6ddd7a9cab691e77fdc293690834da2529 Mon Sep 17 00:00:00 2001 From: Maximilian Huettenrauch Date: Mon, 10 May 2021 12:17:52 +0200 Subject: [PATCH 02/11] wip --- alr_envs/__init__.py | 23 +++++++++- alr_envs/classic_control/__init__.py | 1 + .../episodic_simple_reacher.py | 46 +++++++++++++++++++ alr_envs/classic_control/simple_reacher.py | 17 +++---- alr_envs/utils/mp_env_async_sampler.py | 5 +- alr_envs/utils/wrapper/dmp_wrapper.py | 42 +++++++++-------- alr_envs/utils/wrapper/mp_wrapper.py | 3 +- example.py | 3 +- 8 files changed, 104 insertions(+), 36 deletions(-) create mode 100644 alr_envs/classic_control/episodic_simple_reacher.py diff --git a/alr_envs/__init__.py b/alr_envs/__init__.py index 01e75ef..8e46fa9 100644 --- a/alr_envs/__init__.py +++ b/alr_envs/__init__.py @@ -129,6 +129,25 @@ register( } ) +register( + id='EpisodicSimpleReacher-v0', + entry_point='alr_envs.classic_control:EpisodicSimpleReacherEnv', + max_episode_steps=200, + kwargs={ + "n_links": 2, + } +) + +register( + id='EpisodicSimpleReacher-v1', + entry_point='alr_envs.classic_control:EpisodicSimpleReacherEnv', + max_episode_steps=200, + kwargs={ + "n_links": 2, + "random_start": False + } +) + register( id='LongSimpleReacher-v0', entry_point='alr_envs.classic_control:SimpleReacherEnv', @@ -186,7 +205,7 @@ register( entry_point='alr_envs.utils.make_env_helpers:make_dmp_env', # max_episode_steps=1, kwargs={ - "name": "alr_envs:SimpleReacher-v0", + "name": "alr_envs:EpisodicSimpleReacher-v0", "num_dof": 2, "num_basis": 5, "duration": 2, @@ -202,7 +221,7 @@ register( entry_point='alr_envs.utils.make_env_helpers:make_dmp_env', # max_episode_steps=1, kwargs={ - "name": "alr_envs:SimpleReacher-v1", + "name": "alr_envs:EpisodicSimpleReacher-v1", "num_dof": 2, "num_basis": 5, "duration": 2, diff --git a/alr_envs/classic_control/__init__.py b/alr_envs/classic_control/__init__.py index 8d31d19..8087136 100644 --- a/alr_envs/classic_control/__init__.py +++ b/alr_envs/classic_control/__init__.py @@ -1,3 +1,4 @@ from alr_envs.classic_control.simple_reacher import SimpleReacherEnv +from alr_envs.classic_control.episodic_simple_reacher import EpisodicSimpleReacherEnv from alr_envs.classic_control.viapoint_reacher import ViaPointReacher from alr_envs.classic_control.hole_reacher import HoleReacher diff --git a/alr_envs/classic_control/episodic_simple_reacher.py b/alr_envs/classic_control/episodic_simple_reacher.py new file mode 100644 index 0000000..b02efe8 --- /dev/null +++ b/alr_envs/classic_control/episodic_simple_reacher.py @@ -0,0 +1,46 @@ +from alr_envs.classic_control.simple_reacher import SimpleReacherEnv +from gym import spaces +import numpy as np + + +class EpisodicSimpleReacherEnv(SimpleReacherEnv): + def __init__(self, n_links, random_start=True): + super(EpisodicSimpleReacherEnv, self).__init__(n_links, random_start) + + # self._goal_pos = None + + if random_start: + state_bound = np.hstack([ + [np.pi] * self.n_links, # cos + [np.pi] * self.n_links, # sin + [np.inf] * self.n_links, # velocity + ]) + else: + state_bound = np.empty(0, ) + + state_bound = np.hstack([ + state_bound, + [np.inf] * 2, # x-y coordinates of goal + ]) + + self.observation_space = spaces.Box(low=-state_bound, high=state_bound, shape=state_bound.shape) + + @property + def start_pos(self): + return self._start_pos + + # @property + # def goal_pos(self): + # return self._goal_pos + + def 
_get_obs(self): + if self.random_start: + theta = self._joint_angle + return np.hstack([ + np.cos(theta), + np.sin(theta), + self._angle_velocity, + self._goal, + ]) + else: + return self._goal diff --git a/alr_envs/classic_control/simple_reacher.py b/alr_envs/classic_control/simple_reacher.py index 8d552af..7ca4ead 100644 --- a/alr_envs/classic_control/simple_reacher.py +++ b/alr_envs/classic_control/simple_reacher.py @@ -26,7 +26,7 @@ class SimpleReacherEnv(gym.Env): self.random_start = random_start - self._goal_pos = None + self._goal = None self._joints = None self._joint_angle = None @@ -53,10 +53,6 @@ class SimpleReacherEnv(gym.Env): self._steps = 0 self.seed() - @property - def start_pos(self): - return self._start_pos - def step(self, action: np.ndarray): # action = self._add_action_noise(action) @@ -91,8 +87,7 @@ class SimpleReacherEnv(gym.Env): np.cos(theta), np.sin(theta), self._angle_velocity, - self.end_effector - self._goal_pos, - self._goal_pos, + self.end_effector - self._goal, self._steps ]) @@ -107,7 +102,7 @@ class SimpleReacherEnv(gym.Env): self._joints[1:] = self._joints[0] + np.cumsum(x.T, axis=0) def _get_reward(self, action: np.ndarray): - diff = self.end_effector - self._goal_pos + diff = self.end_effector - self._goal reward_dist = 0 # TODO: Is this the best option @@ -135,7 +130,7 @@ class SimpleReacherEnv(gym.Env): self._update_joints() self._steps = 0 - self._goal_pos = self._get_random_goal() + self._goal = self._get_random_goal() return self._get_obs().copy() def _get_random_goal(self): @@ -160,13 +155,13 @@ class SimpleReacherEnv(gym.Env): plt.figure(self.fig.number) plt.cla() - plt.title(f"Iteration: {self._steps}, distance: {self.end_effector - self._goal_pos}") + plt.title(f"Iteration: {self._steps}, distance: {self.end_effector - self._goal}") # Arm plt.plot(self._joints[:, 0], self._joints[:, 1], 'ro-', markerfacecolor='k') # goal - goal_pos = self._goal_pos.T + goal_pos = self._goal.T plt.plot(goal_pos[0], goal_pos[1], 'gx') # distance between end effector and goal plt.plot([self.end_effector[0], goal_pos[0]], [self.end_effector[1], goal_pos[1]], 'g--') diff --git a/alr_envs/utils/mp_env_async_sampler.py b/alr_envs/utils/mp_env_async_sampler.py index b7c9c23..59cf594 100644 --- a/alr_envs/utils/mp_env_async_sampler.py +++ b/alr_envs/utils/mp_env_async_sampler.py @@ -81,12 +81,11 @@ class AlrContextualMpEnvSampler: repeat = int(np.ceil(n_samples / self.env.num_envs)) vals = defaultdict(list) for i in range(repeat): - obs = self.env.reset() + new_contexts = self.env.reset() - new_contexts = obs[-2] new_samples = dist.sample(new_contexts) - obs, reward, done, info = self.env.step(p) + obs, reward, done, info = self.env.step(new_samples) vals['obs'].append(obs) vals['reward'].append(reward) vals['done'].append(done) diff --git a/alr_envs/utils/wrapper/dmp_wrapper.py b/alr_envs/utils/wrapper/dmp_wrapper.py index 283b845..2a198db 100644 --- a/alr_envs/utils/wrapper/dmp_wrapper.py +++ b/alr_envs/utils/wrapper/dmp_wrapper.py @@ -9,8 +9,10 @@ from alr_envs.utils.wrapper.mp_wrapper import MPWrapper class DmpWrapper(MPWrapper): - def __init__(self, env: gym.Env, num_dof: int, num_basis: int, start_pos: np.ndarray = None, - final_pos: np.ndarray = None, duration: int = 1, alpha_phase: float = 2., dt: float = None, + def __init__(self, env: gym.Env, num_dof: int, num_basis: int, + # start_pos: np.ndarray = None, + # final_pos: np.ndarray = None, + duration: int = 1, alpha_phase: float = 2., dt: float = None, learn_goal: bool = False, return_to_start: bool = False, 
post_traj_time: float = 0., weights_scale: float = 1., goal_scale: float = 1., bandwidth_factor: float = 3., policy_type: str = None, render_mode: str = None): @@ -35,26 +37,30 @@ class DmpWrapper(MPWrapper): self.learn_goal = learn_goal dt = env.dt if hasattr(env, "dt") else dt assert dt is not None - start_pos = start_pos if start_pos is not None else env.start_pos if hasattr(env, "start_pos") else None + # start_pos = start_pos if start_pos is not None else env.start_pos if hasattr(env, "start_pos") else None # TODO: assert start_pos is not None # start_pos will be set in initialize, do we need this here? - if learn_goal: + # if learn_goal: # final_pos = np.zeros_like(start_pos) # arbitrary, will be learned - final_pos = np.zeros((1, num_dof)) # arbitrary, will be learned - else: - final_pos = final_pos if final_pos is not None else start_pos if return_to_start else None - assert final_pos is not None + # final_pos = np.zeros((1, num_dof)) # arbitrary, will be learned + # else: + # final_pos = final_pos if final_pos is not None else start_pos if return_to_start else None + # assert final_pos is not None self.t = np.linspace(0, duration, int(duration / dt)) self.goal_scale = goal_scale super().__init__(env, num_dof, duration, dt, post_traj_time, policy_type, weights_scale, render_mode, - num_basis=num_basis, start_pos=start_pos, final_pos=final_pos, alpha_phase=alpha_phase, + num_basis=num_basis, + # start_pos=start_pos, final_pos=final_pos, + alpha_phase=alpha_phase, bandwidth_factor=bandwidth_factor) action_bounds = np.inf * np.ones((np.prod(self.mp.dmp_weights.shape) + (num_dof if learn_goal else 0))) self.action_space = gym.spaces.Box(low=-action_bounds, high=action_bounds, dtype=np.float32) - def initialize_mp(self, num_dof: int, duration: int, dt: float, num_basis: int = 5, start_pos: np.ndarray = None, - final_pos: np.ndarray = None, alpha_phase: float = 2., bandwidth_factor: float = 3.): + def initialize_mp(self, num_dof: int, duration: int, dt: float, num_basis: int = 5, + # start_pos: np.ndarray = None, + # final_pos: np.ndarray = None, + alpha_phase: float = 2., bandwidth_factor: float = 3.): phase_generator = ExpDecayPhaseGenerator(alpha_phase=alpha_phase, duration=duration) basis_generator = DMPBasisGenerator(phase_generator, duration=duration, num_basis=num_basis, @@ -66,12 +72,12 @@ class DmpWrapper(MPWrapper): # dmp.dmp_start_pos = start_pos.reshape((1, num_dof)) # in a contextual environment, the start_pos may be not fixed, set in mp_rollout? # TODO: Should we set start_pos in init at all? It's only used after calling rollout anyway... 
- dmp.dmp_start_pos = start_pos.reshape((1, num_dof)) if start_pos is not None else np.zeros((1, num_dof)) + # dmp.dmp_start_pos = start_pos.reshape((1, num_dof)) if start_pos is not None else np.zeros((1, num_dof)) - weights = np.zeros((num_basis, num_dof)) - goal_pos = np.zeros(num_dof) if self.learn_goal else final_pos + # weights = np.zeros((num_basis, num_dof)) + # goal_pos = np.zeros(num_dof) if self.learn_goal else final_pos - dmp.set_weights(weights, goal_pos) + # dmp.set_weights(weights, goal_pos) return dmp def goal_and_weights(self, params): @@ -83,7 +89,7 @@ class DmpWrapper(MPWrapper): params = params[:, :-self.mp.num_dimensions] # [1,num_dof] # weight_matrix = np.reshape(params[:, :-self.num_dof], [self.num_basis, self.num_dof]) else: - goal_pos = self.mp.dmp_goal_pos.flatten() + goal_pos = self.env.goal_pos # self.mp.dmp_goal_pos.flatten() assert goal_pos is not None # weight_matrix = np.reshape(params, [self.num_basis, self.num_dof]) @@ -91,8 +97,8 @@ class DmpWrapper(MPWrapper): return goal_pos * self.goal_scale, weight_matrix * self.weights_scale def mp_rollout(self, action): - if self.mp.start_pos is None: - self.mp.start_pos = self.env.start_pos + # if self.mp.start_pos is None: + self.mp.dmp_start_pos = self.env.init_qpos # start_pos goal_pos, weight_matrix = self.goal_and_weights(action) self.mp.set_weights(weight_matrix, goal_pos) return self.mp.reference_trajectory(self.t) diff --git a/alr_envs/utils/wrapper/mp_wrapper.py b/alr_envs/utils/wrapper/mp_wrapper.py index f60cc8c..adeba55 100644 --- a/alr_envs/utils/wrapper/mp_wrapper.py +++ b/alr_envs/utils/wrapper/mp_wrapper.py @@ -62,7 +62,8 @@ class MPWrapper(gym.Wrapper, ABC): self.env.configure(context) def reset(self): - return self.env.reset() + obs = self.env.reset() + return obs def step(self, action: np.ndarray): """ This function generates a trajectory based on a DMP and then does the usual loop over reset and step""" diff --git a/example.py b/example.py index 94da23c..2d32ad8 100644 --- a/example.py +++ b/example.py @@ -83,5 +83,6 @@ if __name__ == '__main__': # example_mujoco() # example_dmp() # example_async() - env = gym.make("alr_envs:HoleReacherDMP-v0", context=0.1) + # env = gym.make("alr_envs:HoleReacherDMP-v0", context=0.1) + env = gym.make("alr_envs:SimpleReacherDMP-v1") print() \ No newline at end of file From 95e9b8be47d8d7d2a508082c43db80aa2d3e6fe5 Mon Sep 17 00:00:00 2001 From: ottofabian Date: Wed, 12 May 2021 09:52:25 +0200 Subject: [PATCH 03/11] added MPEnv --- alr_envs/__init__.py | 2 +- alr_envs/classic_control/hole_reacher.py | 195 +++++++++++------- alr_envs/mujoco/ball_in_a_cup/utils.py | 4 +- alr_envs/mujoco/beerpong/utils.py | 2 +- alr_envs/utils/legacy/utils.py | 10 +- alr_envs/utils/make_env_helpers.py | 4 +- alr_envs/utils/{wrapper => mps}/__init__.py | 0 .../utils/{wrapper => mps}/detpmp_wrapper.py | 13 +- .../utils/{wrapper => mps}/dmp_wrapper.py | 56 ++--- alr_envs/utils/mps/mp_environments.py | 33 +++ alr_envs/utils/{wrapper => mps}/mp_wrapper.py | 49 +---- 11 files changed, 200 insertions(+), 168 deletions(-) rename alr_envs/utils/{wrapper => mps}/__init__.py (100%) rename alr_envs/utils/{wrapper => mps}/detpmp_wrapper.py (66%) rename alr_envs/utils/{wrapper => mps}/dmp_wrapper.py (53%) create mode 100644 alr_envs/utils/mps/mp_environments.py rename alr_envs/utils/{wrapper => mps}/mp_wrapper.py (68%) diff --git a/alr_envs/__init__.py b/alr_envs/__init__.py index 8e46fa9..fe32c0b 100644 --- a/alr_envs/__init__.py +++ b/alr_envs/__init__.py @@ -1,7 +1,7 @@ from 
gym.envs.registration import register from alr_envs.stochastic_search.functions.f_rosenbrock import Rosenbrock -# from alr_envs.utils.wrapper.dmp_wrapper import DmpWrapper +# from alr_envs.utils.mps.dmp_wrapper import DmpWrapper # Mujoco diff --git a/alr_envs/classic_control/hole_reacher.py b/alr_envs/classic_control/hole_reacher.py index 3b382f9..45bc1c3 100644 --- a/alr_envs/classic_control/hole_reacher.py +++ b/alr_envs/classic_control/hole_reacher.py @@ -1,27 +1,34 @@ +from typing import Union + import gym -import numpy as np import matplotlib.pyplot as plt +import numpy as np +from gym.utils import seeding from matplotlib import patches + from alr_envs.classic_control.utils import check_self_collision +from alr_envs.utils.mps.mp_environments import MPEnv -class HoleReacher(gym.Env): - - def __init__(self, n_links, hole_x, hole_width, hole_depth, allow_self_collision=False, - allow_wall_collision=False, collision_penalty=1000): +class HoleReacher(MPEnv): + def __init__(self, n_links, hole_x: Union[None, float] = None, hole_depth: Union[None, float] = None, + hole_width: float = 1., random_start: bool = True, allow_self_collision: bool = False, + allow_wall_collision: bool = False, collision_penalty: bool = 1000): self.n_links = n_links self.link_lengths = np.ones((n_links, 1)) - # task - self.hole_x = hole_x # x-position of center of hole - self.hole_width = hole_width # width of hole - self.hole_depth = hole_depth # depth of hole + self.random_start = random_start - self.bottom_center_of_hole = np.hstack([hole_x, -hole_depth]) - self.top_center_of_hole = np.hstack([hole_x, 0]) - self.left_wall_edge = np.hstack([hole_x - self.hole_width / 2, 0]) - self.right_wall_edge = np.hstack([hole_x + self.hole_width / 2, 0]) + # provided initial parameters + self._hole_x = hole_x # x-position of center of hole + self._hole_width = hole_width # width of hole + self._hole_depth = hole_depth # depth of hole + + # temp containers to store current setting + self._tmp_hole_x = None + self._tmp_hole_width = None + self._tmp_hole_depth = None # collision self.allow_self_collision = allow_self_collision @@ -29,11 +36,11 @@ class HoleReacher(gym.Env): self.collision_penalty = collision_penalty # state - self._joints = None self._joint_angles = None self._angle_velocity = None - self.start_pos = np.hstack([[np.pi / 2], np.zeros(self.n_links - 1)]) - self.start_vel = np.zeros(self.n_links) + self._joints = None + self._start_pos = np.hstack([[np.pi / 2], np.zeros(self.n_links - 1)]) + self._start_vel = np.zeros(self.n_links) self.dt = 0.01 # self.time_limit = 2 @@ -43,35 +50,64 @@ class HoleReacher(gym.Env): [np.pi] * self.n_links, # cos [np.pi] * self.n_links, # sin [np.inf] * self.n_links, # velocity + [np.inf], # hole width + [np.inf], # hole depth [np.inf] * 2, # x-y coordinates of target distance [np.inf] # env steps, because reward start after n steps TODO: Maybe ]) self.action_space = gym.spaces.Box(low=-action_bound, high=action_bound, shape=action_bound.shape) self.observation_space = gym.spaces.Box(low=-state_bound, high=state_bound, shape=state_bound.shape) + plt.ion() self.fig = None - rect_1 = patches.Rectangle((-self.n_links, -1), - self.n_links + self.hole_x - self.hole_width / 2, 1, - fill=True, edgecolor='k', facecolor='k') - rect_2 = patches.Rectangle((self.hole_x + self.hole_width / 2, -1), - self.n_links - self.hole_x + self.hole_width / 2, 1, - fill=True, edgecolor='k', facecolor='k') - rect_3 = patches.Rectangle((self.hole_x - self.hole_width / 2, -1), self.hole_width, - 1 - 
self.hole_depth, - fill=True, edgecolor='k', facecolor='k') - self.patches = [rect_1, rect_2, rect_3] + self.seed() + + @property + def corrected_obs_index(self): + return np.hstack([ + [self.random_start] * self.n_links, # cos + [self.random_start] * self.n_links, # sin + [self.random_start] * self.n_links, # velocity + [self._hole_width is None], # hole width + [self._hole_depth is None], # hole width + [True] * 2, # x-y coordinates of target distance + [False] # env steps + ]) + + def seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + return [seed] @property def end_effector(self): return self._joints[self.n_links].T - def configure(self, context): - pass + def _generate_hole(self): + hole_x = self.np_random.uniform(0.5, 3.5, 1) if self._hole_x is None else np.copy(self._hole_x) + hole_width = self.np_random.uniform(0.5, 0.1, 1) if self._hole_width is None else np.copy(self._hole_width) + # TODO we do not want this right now. + hole_depth = self.np_random.uniform(1, 1, 1) if self._hole_depth is None else np.copy(self._hole_depth) + + self.bottom_center_of_hole = np.hstack([hole_x, -hole_depth]) + self.top_center_of_hole = np.hstack([hole_x, 0]) + self.left_wall_edge = np.hstack([hole_x - hole_width / 2, 0]) + self.right_wall_edge = np.hstack([hole_x + hole_width / 2, 0]) + + return hole_x, hole_width, hole_depth def reset(self): - self._joint_angles = self.start_pos - self._angle_velocity = self.start_vel + if self.random_start: + # MAybe change more than dirst seed + first_joint = self.np_random.uniform(np.pi / 4, 3 * np.pi / 4) + self._joint_angles = np.hstack([[first_joint], np.zeros(self.n_links - 1)]) + else: + self._joint_angles = self._start_pos + + self._tmp_hole_x, self._tmp_hole_width, self._tmp_hole_depth = self._generate_hole() + self.set_patches() + + self._angle_velocity = self._start_vel self._joints = np.zeros((self.n_links + 1, 2)) self._update_joints() self._steps = 0 @@ -96,15 +132,14 @@ class HoleReacher(gym.Env): success = False reward = 0 if not self._is_collided: + # return reward only in last time step if self._steps == 199: dist = np.linalg.norm(self.end_effector - self.bottom_center_of_hole) reward = - dist ** 2 success = dist < 0.005 else: + # Episode terminates when colliding, hence return reward dist = np.linalg.norm(self.end_effector - self.bottom_center_of_hole) - # if self.collision_penalty != 0: - # reward = -self.collision_penalty - # else: reward = - dist ** 2 - self.collision_penalty reward -= 5e-8 * np.sum(acc ** 2) @@ -112,8 +147,6 @@ class HoleReacher(gym.Env): info = {"is_collided": self._is_collided, "is_success": success} self._steps += 1 - - # done = self._steps * self.dt > self.time_limit or self._is_collided done = self._is_collided return self._get_obs().copy(), reward, done, info @@ -148,6 +181,8 @@ class HoleReacher(gym.Env): np.cos(theta), np.sin(theta), self._angle_velocity, + self._hole_width, + self._hole_depth, self.end_effector - self.bottom_center_of_hole, self._steps ]) @@ -155,31 +190,26 @@ class HoleReacher(gym.Env): def get_forward_kinematics(self, num_points_per_link=1): theta = self._joint_angles[:, None] - if num_points_per_link > 1: - intermediate_points = np.linspace(0, 1, num_points_per_link) - else: - intermediate_points = 1 - + intermediate_points = np.linspace(0, 1, num_points_per_link) if num_points_per_link > 1 else 1 accumulated_theta = np.cumsum(theta, axis=0) - - endeffector = np.zeros(shape=(self.n_links, num_points_per_link, 2)) + end_effector = np.zeros(shape=(self.n_links, 
num_points_per_link, 2)) x = np.cos(accumulated_theta) * self.link_lengths * intermediate_points y = np.sin(accumulated_theta) * self.link_lengths * intermediate_points - endeffector[0, :, 0] = x[0, :] - endeffector[0, :, 1] = y[0, :] + end_effector[0, :, 0] = x[0, :] + end_effector[0, :, 1] = y[0, :] for i in range(1, self.n_links): - endeffector[i, :, 0] = x[i, :] + endeffector[i - 1, -1, 0] - endeffector[i, :, 1] = y[i, :] + endeffector[i - 1, -1, 1] + end_effector[i, :, 0] = x[i, :] + end_effector[i - 1, -1, 0] + end_effector[i, :, 1] = y[i, :] + end_effector[i - 1, -1, 1] - return np.squeeze(endeffector + self._joints[0, :]) + return np.squeeze(end_effector + self._joints[0, :]) def check_wall_collision(self, line_points): # all points that are before the hole in x - r, c = np.where(line_points[:, :, 0] < (self.hole_x - self.hole_width / 2)) + r, c = np.where(line_points[:, :, 0] < (self._tmp_hole_x - self._tmp_hole_width / 2)) # check if any of those points are below surface nr_line_points_below_surface_before_hole = np.sum(line_points[r, c, 1] < 0) @@ -188,7 +218,7 @@ class HoleReacher(gym.Env): return True # all points that are after the hole in x - r, c = np.where(line_points[:, :, 0] > (self.hole_x + self.hole_width / 2)) + r, c = np.where(line_points[:, :, 0] > (self._tmp_hole_x + self._tmp_hole_width / 2)) # check if any of those points are below surface nr_line_points_below_surface_after_hole = np.sum(line_points[r, c, 1] < 0) @@ -197,11 +227,11 @@ class HoleReacher(gym.Env): return True # all points that are above the hole - r, c = np.where((line_points[:, :, 0] > (self.hole_x - self.hole_width / 2)) & ( - line_points[:, :, 0] < (self.hole_x + self.hole_width / 2))) + r, c = np.where((line_points[:, :, 0] > (self._tmp_hole_x - self._tmp_hole_width / 2)) & ( + line_points[:, :, 0] < (self._tmp_hole_x + self._tmp_hole_width / 2))) # check if any of those points are below surface - nr_line_points_below_surface_in_hole = np.sum(line_points[r, c, 1] < -self.hole_depth) + nr_line_points_below_surface_in_hole = np.sum(line_points[r, c, 1] < -self._tmp_hole_depth) if nr_line_points_below_surface_in_hole > 0: return True @@ -210,28 +240,33 @@ class HoleReacher(gym.Env): def render(self, mode='human'): if self.fig is None: + plt.ion() self.fig = plt.figure() - # plt.ion() - # plt.pause(0.01) - else: - plt.figure(self.fig.number) + ax = self.fig.add_subplot(1, 1, 1) + + # limits + lim = np.sum(self.link_lengths) + 0.5 + ax.set_xlim([-lim, lim]) + ax.set_ylim([-1.1, lim]) + + self.line, = ax.plot(self._joints[:, 0], self._joints[:, 1], 'ro-', markerfacecolor='k') + self.set_patches() + self.fig.show() if mode == "human": - plt.cla() - plt.title(f"Iteration: {self._steps}, distance: {self.end_effector - self.bottom_center_of_hole}") + self.fig.gca().set_title( + f"Iteration: {self._steps}, distance: {self.end_effector - self.bottom_center_of_hole}") # Arm plt.plot(self._joints[:, 0], self._joints[:, 1], 'ro-', markerfacecolor='k') - # Add the patch to the Axes - [plt.gca().add_patch(rect) for rect in self.patches] + # Arm + self.line.set_xdata(self._joints[:, 0]) + self.line.set_ydata(self._joints[:, 1]) - lim = np.sum(self.link_lengths) + 0.5 - plt.xlim([-lim, lim]) - plt.ylim([-1.1, lim]) - # plt.draw() - plt.pause(1e-4) # pushes window to foreground, which is annoying. 
- # self.fig.canvas.flush_events() + self.fig.canvas.draw() + self.fig.canvas.flush_events() + # self.fig.show() elif mode == "partial": if self._steps == 1: @@ -266,6 +301,24 @@ class HoleReacher(gym.Env): plt.pause(0.01) + def set_patches(self): + if self.fig is not None: + self.fig.gca().patches = [] + rect_1 = patches.Rectangle((-self.n_links, -1), self.n_links + self._tmp_hole_x - self._tmp_hole_width / 2, + 1, + fill=True, edgecolor='k', facecolor='k') + rect_2 = patches.Rectangle((self._tmp_hole_x + self._tmp_hole_width / 2, -1), + self.n_links - self._tmp_hole_x + self._tmp_hole_width / 2, 1, + fill=True, edgecolor='k', facecolor='k') + rect_3 = patches.Rectangle((self._tmp_hole_x - self._tmp_hole_width / 2, -1), self._tmp_hole_width, + 1 - self._tmp_hole_depth, + fill=True, edgecolor='k', facecolor='k') + + # Add the patch to the Axes + self.fig.gca().add_patch(rect_1) + self.fig.gca().add_patch(rect_2) + self.fig.gca().add_patch(rect_3) + def close(self): if self.fig is not None: plt.close(self.fig) @@ -274,8 +327,8 @@ class HoleReacher(gym.Env): if __name__ == '__main__': nl = 5 render_mode = "human" # "human" or "partial" or "final" - env = HoleReacher(n_links=nl, allow_self_collision=False, allow_wall_collision=False, hole_width=0.15, - hole_depth=1, hole_x=1) + env = HoleReacher(n_links=nl, allow_self_collision=False, allow_wall_collision=False, hole_width=None, + hole_depth=1, hole_x=None) env.reset() # env.render(mode=render_mode) @@ -285,11 +338,13 @@ if __name__ == '__main__': ac = 2 * env.action_space.sample() # ac[0] += np.pi/2 obs, rew, d, info = env.step(ac) - env.render(mode=render_mode) + # if i % 1 == 0: + if i == 0: + env.render(mode=render_mode) print(rew) if d: - break + env.reset() env.close() diff --git a/alr_envs/mujoco/ball_in_a_cup/utils.py b/alr_envs/mujoco/ball_in_a_cup/utils.py index bfec3cf..d368e45 100644 --- a/alr_envs/mujoco/ball_in_a_cup/utils.py +++ b/alr_envs/mujoco/ball_in_a_cup/utils.py @@ -1,5 +1,5 @@ -from alr_envs.utils.wrapper.detpmp_wrapper import DetPMPWrapper -from alr_envs.utils.wrapper.dmp_wrapper import DmpWrapper +from alr_envs.utils.mps.detpmp_wrapper import DetPMPWrapper +from alr_envs.utils.mps.dmp_wrapper import DmpWrapper from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup import ALRBallInACupEnv diff --git a/alr_envs/mujoco/beerpong/utils.py b/alr_envs/mujoco/beerpong/utils.py index bfbc2a1..cdcbd13 100644 --- a/alr_envs/mujoco/beerpong/utils.py +++ b/alr_envs/mujoco/beerpong/utils.py @@ -1,4 +1,4 @@ -from alr_envs.utils.wrapper.detpmp_wrapper import DetPMPWrapper +from alr_envs.utils.mps.detpmp_wrapper import DetPMPWrapper from alr_envs.mujoco.beerpong.beerpong import ALRBeerpongEnv from alr_envs.mujoco.beerpong.beerpong_simple import ALRBeerpongEnv as ALRBeerpongEnvSimple diff --git a/alr_envs/utils/legacy/utils.py b/alr_envs/utils/legacy/utils.py index c158cae..fbcb34a 100644 --- a/alr_envs/utils/legacy/utils.py +++ b/alr_envs/utils/legacy/utils.py @@ -1,7 +1,7 @@ import alr_envs.classic_control.hole_reacher as hr import alr_envs.classic_control.viapoint_reacher as vpr -from alr_envs.utils.wrapper.dmp_wrapper import DmpWrapper -from alr_envs.utils.wrapper.detpmp_wrapper import DetPMPWrapper +from alr_envs.utils.mps.dmp_wrapper import DmpWrapper +from alr_envs.utils.mps.detpmp_wrapper import DetPMPWrapper import numpy as np @@ -65,7 +65,7 @@ def make_holereacher_env(rank, seed=0): dt=_env.dt, learn_goal=True, alpha_phase=2, - start_pos=_env.start_pos, + start_pos=_env._start_pos, policy_type="velocity", weights_scale=50, 
goal_scale=0.1 @@ -105,7 +105,7 @@ def make_holereacher_fix_goal_env(rank, seed=0): learn_goal=False, final_pos=np.array([2.02669572, -1.25966385, -1.51618198, -0.80946476, 0.02012344]), alpha_phase=2, - start_pos=_env.start_pos, + start_pos=_env._start_pos, policy_type="velocity", weights_scale=50, goal_scale=1 @@ -142,7 +142,7 @@ def make_holereacher_env_pmp(rank, seed=0): num_basis=5, width=0.02, policy_type="velocity", - start_pos=_env.start_pos, + start_pos=_env._start_pos, duration=2, post_traj_time=0, dt=_env.dt, diff --git a/alr_envs/utils/make_env_helpers.py b/alr_envs/utils/make_env_helpers.py index c0e55b4..d455496 100644 --- a/alr_envs/utils/make_env_helpers.py +++ b/alr_envs/utils/make_env_helpers.py @@ -1,5 +1,5 @@ -from alr_envs.utils.wrapper.dmp_wrapper import DmpWrapper -from alr_envs.utils.wrapper.detpmp_wrapper import DetPMPWrapper +from alr_envs.utils.mps.dmp_wrapper import DmpWrapper +from alr_envs.utils.mps.detpmp_wrapper import DetPMPWrapper import gym from gym.vector.utils import write_to_shared_memory import sys diff --git a/alr_envs/utils/wrapper/__init__.py b/alr_envs/utils/mps/__init__.py similarity index 100% rename from alr_envs/utils/wrapper/__init__.py rename to alr_envs/utils/mps/__init__.py diff --git a/alr_envs/utils/wrapper/detpmp_wrapper.py b/alr_envs/utils/mps/detpmp_wrapper.py similarity index 66% rename from alr_envs/utils/wrapper/detpmp_wrapper.py rename to alr_envs/utils/mps/detpmp_wrapper.py index 62b93d5..63de98b 100644 --- a/alr_envs/utils/wrapper/detpmp_wrapper.py +++ b/alr_envs/utils/mps/detpmp_wrapper.py @@ -2,17 +2,18 @@ import gym import numpy as np from mp_lib import det_promp -from alr_envs.utils.wrapper.mp_wrapper import MPWrapper +from alr_envs.utils.mps.mp_environments import MPEnv +from alr_envs.utils.mps.mp_wrapper import MPWrapper class DetPMPWrapper(MPWrapper): - def __init__(self, env, num_dof, num_basis, width, start_pos=None, duration=1, dt=0.01, post_traj_time=0., - policy_type=None, weights_scale=1, zero_start=False, zero_goal=False, **mp_kwargs): + def __init__(self, env: MPEnv, num_dof: int, num_basis: int, width: int, start_pos=None, duration: int = 1, + dt: float = 0.01, post_traj_time: float = 0., policy_type: str = None, weights_scale: float = 1., + zero_start: bool = False, zero_goal: bool = False, **mp_kwargs): # self.duration = duration # seconds - super().__init__(env, num_dof, duration, dt, post_traj_time, policy_type, weights_scale, - num_basis=num_basis, width=width, start_pos=start_pos, zero_start=zero_start, - zero_goal=zero_goal) + super().__init__(env, num_dof, dt, duration, post_traj_time, policy_type, weights_scale, num_basis=num_basis, + width=width, start_pos=start_pos, zero_start=zero_start, zero_goal=zero_goal, **mp_kwargs) action_bounds = np.inf * np.ones((self.mp.n_basis * self.mp.n_dof)) self.action_space = gym.spaces.Box(low=-action_bounds, high=action_bounds, dtype=np.float32) diff --git a/alr_envs/utils/wrapper/dmp_wrapper.py b/alr_envs/utils/mps/dmp_wrapper.py similarity index 53% rename from alr_envs/utils/wrapper/dmp_wrapper.py rename to alr_envs/utils/mps/dmp_wrapper.py index 2a198db..e42205e 100644 --- a/alr_envs/utils/wrapper/dmp_wrapper.py +++ b/alr_envs/utils/mps/dmp_wrapper.py @@ -1,19 +1,18 @@ -from mp_lib.phase import ExpDecayPhaseGenerator -from mp_lib.basis import DMPBasisGenerator -from mp_lib import dmps -import numpy as np import gym +import numpy as np +from mp_lib import dmps +from mp_lib.basis import DMPBasisGenerator +from mp_lib.phase import ExpDecayPhaseGenerator -from 
alr_envs.utils.wrapper.mp_wrapper import MPWrapper +from alr_envs.utils.mps.mp_environments import MPEnv +from alr_envs.utils.mps.mp_wrapper import MPWrapper class DmpWrapper(MPWrapper): - def __init__(self, env: gym.Env, num_dof: int, num_basis: int, - # start_pos: np.ndarray = None, - # final_pos: np.ndarray = None, + def __init__(self, env: MPEnv, num_dof: int, num_basis: int, duration: int = 1, alpha_phase: float = 2., dt: float = None, - learn_goal: bool = False, return_to_start: bool = False, post_traj_time: float = 0., + learn_goal: bool = False, post_traj_time: float = 0., weights_scale: float = 1., goal_scale: float = 1., bandwidth_factor: float = 3., policy_type: str = None, render_mode: str = None): @@ -23,8 +22,6 @@ class DmpWrapper(MPWrapper): env: num_dof: num_basis: - start_pos: - final_pos: duration: alpha_phase: dt: @@ -37,30 +34,17 @@ class DmpWrapper(MPWrapper): self.learn_goal = learn_goal dt = env.dt if hasattr(env, "dt") else dt assert dt is not None - # start_pos = start_pos if start_pos is not None else env.start_pos if hasattr(env, "start_pos") else None - # TODO: assert start_pos is not None # start_pos will be set in initialize, do we need this here? - # if learn_goal: - # final_pos = np.zeros_like(start_pos) # arbitrary, will be learned - # final_pos = np.zeros((1, num_dof)) # arbitrary, will be learned - # else: - # final_pos = final_pos if final_pos is not None else start_pos if return_to_start else None - # assert final_pos is not None self.t = np.linspace(0, duration, int(duration / dt)) self.goal_scale = goal_scale - super().__init__(env, num_dof, duration, dt, post_traj_time, policy_type, weights_scale, render_mode, - num_basis=num_basis, - # start_pos=start_pos, final_pos=final_pos, - alpha_phase=alpha_phase, - bandwidth_factor=bandwidth_factor) + super().__init__(env, num_dof, dt, duration, post_traj_time, policy_type, weights_scale, render_mode, + num_basis=num_basis, alpha_phase=alpha_phase, bandwidth_factor=bandwidth_factor) action_bounds = np.inf * np.ones((np.prod(self.mp.dmp_weights.shape) + (num_dof if learn_goal else 0))) self.action_space = gym.spaces.Box(low=-action_bounds, high=action_bounds, dtype=np.float32) - def initialize_mp(self, num_dof: int, duration: int, dt: float, num_basis: int = 5, - # start_pos: np.ndarray = None, - # final_pos: np.ndarray = None, - alpha_phase: float = 2., bandwidth_factor: float = 3.): + def initialize_mp(self, num_dof: int, duration: int, dt: float, num_basis: int = 5, alpha_phase: float = 2., + bandwidth_factor: int = 3): phase_generator = ExpDecayPhaseGenerator(alpha_phase=alpha_phase, duration=duration) basis_generator = DMPBasisGenerator(phase_generator, duration=duration, num_basis=num_basis, @@ -69,15 +53,6 @@ class DmpWrapper(MPWrapper): dmp = dmps.DMP(num_dof=num_dof, basis_generator=basis_generator, phase_generator=phase_generator, num_time_steps=int(duration / dt), dt=dt) - # dmp.dmp_start_pos = start_pos.reshape((1, num_dof)) - # in a contextual environment, the start_pos may be not fixed, set in mp_rollout? - # TODO: Should we set start_pos in init at all? It's only used after calling rollout anyway... 
- # dmp.dmp_start_pos = start_pos.reshape((1, num_dof)) if start_pos is not None else np.zeros((1, num_dof)) - - # weights = np.zeros((num_basis, num_dof)) - # goal_pos = np.zeros(num_dof) if self.learn_goal else final_pos - - # dmp.set_weights(weights, goal_pos) return dmp def goal_and_weights(self, params): @@ -87,18 +62,15 @@ class DmpWrapper(MPWrapper): if self.learn_goal: goal_pos = params[0, -self.mp.num_dimensions:] # [num_dof] params = params[:, :-self.mp.num_dimensions] # [1,num_dof] - # weight_matrix = np.reshape(params[:, :-self.num_dof], [self.num_basis, self.num_dof]) else: goal_pos = self.env.goal_pos # self.mp.dmp_goal_pos.flatten() assert goal_pos is not None - # weight_matrix = np.reshape(params, [self.num_basis, self.num_dof]) - weight_matrix = np.reshape(params, self.mp.dmp_weights.shape) + weight_matrix = np.reshape(params, self.mp.dmp_weights.shape) # [num_basis, num_dof] return goal_pos * self.goal_scale, weight_matrix * self.weights_scale def mp_rollout(self, action): - # if self.mp.start_pos is None: - self.mp.dmp_start_pos = self.env.init_qpos # start_pos + self.mp.dmp_start_pos = self.env.start_pos goal_pos, weight_matrix = self.goal_and_weights(action) self.mp.set_weights(weight_matrix, goal_pos) return self.mp.reference_trajectory(self.t) diff --git a/alr_envs/utils/mps/mp_environments.py b/alr_envs/utils/mps/mp_environments.py new file mode 100644 index 0000000..f720f2f --- /dev/null +++ b/alr_envs/utils/mps/mp_environments.py @@ -0,0 +1,33 @@ +from abc import abstractmethod +from typing import Union + +import gym +import numpy as np + + +class MPEnv(gym.Env): + + @property + @abstractmethod + def corrected_obs_index(self): + """Returns boolean value for each observation entry + whether the observation is returned by the DMP for the contextual case or not. + This effectively allows to filter unwanted or unnecessary observations from the full step-based case. + """ + raise NotImplementedError() + + @property + @abstractmethod + def start_pos(self) -> Union[float, int, np.ndarray]: + """ + Returns the current position of the joints + """ + raise NotImplementedError() + + @property + def goal_pos(self) -> Union[float, int, np.ndarray]: + """ + Returns the current final position of the joints for the MP. + By default this returns the starting position. 
+ """ + return self.start_pos diff --git a/alr_envs/utils/wrapper/mp_wrapper.py b/alr_envs/utils/mps/mp_wrapper.py similarity index 68% rename from alr_envs/utils/wrapper/mp_wrapper.py rename to alr_envs/utils/mps/mp_wrapper.py index adeba55..621de00 100644 --- a/alr_envs/utils/wrapper/mp_wrapper.py +++ b/alr_envs/utils/mps/mp_wrapper.py @@ -1,32 +1,18 @@ from abc import ABC, abstractmethod -from collections import defaultdict import gym import numpy as np +from alr_envs.utils.mps.mp_environments import MPEnv from alr_envs.utils.policies import get_policy_class class MPWrapper(gym.Wrapper, ABC): - def __init__(self, - env: gym.Env, - num_dof: int, - duration: int = 1, - dt: float = None, - post_traj_time: float = 0., - policy_type: str = None, - weights_scale: float = 1., - render_mode: str = None, - **mp_kwargs - ): + def __init__(self, env: MPEnv, num_dof: int, dt: float, duration: int = 1, post_traj_time: float = 0., + policy_type: str = None, weights_scale: float = 1., render_mode: str = None, **mp_kwargs): super().__init__(env) - # self.num_dof = num_dof - # self.num_basis = num_basis - # self.duration = duration # seconds - - # dt = env.dt if hasattr(env, "dt") else dt assert dt is not None # this should never happen as MPWrapper is a base class self.post_traj_steps = int(post_traj_time / dt) @@ -40,8 +26,11 @@ class MPWrapper(gym.Wrapper, ABC): self.render_mode = render_mode self.render_kwargs = {} - # TODO: not yet final + # TODO: @Max I think this should not be in this class, this functionality should be part of your sampler. def __call__(self, params, contexts=None): + """ + Can be used to provide a batch of parameter sets + """ params = np.atleast_2d(params) obs = [] rewards = [] @@ -63,7 +52,7 @@ class MPWrapper(gym.Wrapper, ABC): def reset(self): obs = self.env.reset() - return obs + return obs[self.env] def step(self, action: np.ndarray): """ This function generates a trajectory based on a DMP and then does the usual loop over reset and step""" @@ -77,15 +66,9 @@ class MPWrapper(gym.Wrapper, ABC): # self._velocity = velocity rewards = 0 - # infos = defaultdict(list) - - # TODO: @Max Why do we need this configure, states should be part of the model - # TODO: Ask Onur if the context distribution needs to be outside the environment - # TODO: For now create a new env with each context - # TODO: Explicitly call reset before step to obtain context from obs? 
- # self.env.configure(context) - # obs = self.env.reset() info = {} + # create random obs as the reset function is called externally + obs = self.env.observation_space.sample() for t, pos_vel in enumerate(zip(trajectory, velocity)): ac = self.policy.get_action(pos_vel[0], pos_vel[1]) @@ -107,18 +90,6 @@ class MPWrapper(gym.Wrapper, ABC): self.render_mode = mode self.render_kwargs = kwargs - # def __call__(self, actions): - # return self.step(actions) - # params = np.atleast_2d(params) - # rewards = [] - # infos = [] - # for p, c in zip(params, contexts): - # reward, info = self.rollout(p, c) - # rewards.append(reward) - # infos.append(info) - # - # return np.array(rewards), infos - @abstractmethod def mp_rollout(self, action): """ From 6ae195962c4d0de08f901103caf7a5a807608055 Mon Sep 17 00:00:00 2001 From: ottofabian Date: Wed, 12 May 2021 17:48:57 +0200 Subject: [PATCH 04/11] adjusted classic control environments to new interface --- alr_envs/__init__.py | 147 +++++----- alr_envs/classic_control/__init__.py | 2 +- .../episodic_simple_reacher.py | 2 +- alr_envs/classic_control/hole_reacher.py | 259 ++++++++---------- alr_envs/classic_control/simple_reacher.py | 222 +++++++++------ alr_envs/classic_control/viapoint_reacher.py | 206 +++++++++----- alr_envs/mujoco/ball_in_a_cup/utils.py | 46 +--- alr_envs/mujoco/beerpong/utils.py | 45 +-- alr_envs/utils/legacy/utils.py | 57 ++-- alr_envs/utils/mps/detpmp_wrapper.py | 14 +- alr_envs/utils/mps/dmp_wrapper.py | 2 +- alr_envs/utils/mps/mp_environments.py | 2 +- alr_envs/utils/mps/mp_wrapper.py | 11 +- example.py | 9 +- 14 files changed, 535 insertions(+), 489 deletions(-) diff --git a/alr_envs/__init__.py b/alr_envs/__init__.py index fe32c0b..17d5541 100644 --- a/alr_envs/__init__.py +++ b/alr_envs/__init__.py @@ -1,6 +1,7 @@ from gym.envs.registration import register from alr_envs.stochastic_search.functions.f_rosenbrock import Rosenbrock + # from alr_envs.utils.mps.dmp_wrapper import DmpWrapper # Mujoco @@ -71,6 +72,17 @@ register( } ) +## Balancing Reacher + +register( + id='Balancing-v0', + entry_point='alr_envs.mujoco:BalancingEnv', + max_episode_steps=200, + kwargs={ + "n_links": 5, + } +) + register( id='ALRBallInACupSimple-v0', entry_point='alr_envs.mujoco:ALRBallInACupEnv', @@ -101,15 +113,7 @@ register( # Classic control -register( - id='Balancing-v0', - entry_point='alr_envs.mujoco:BalancingEnv', - max_episode_steps=200, - kwargs={ - "n_links": 5, - } -) - +## Simple Reacher register( id='SimpleReacher-v0', entry_point='alr_envs.classic_control:SimpleReacherEnv', @@ -129,25 +133,6 @@ register( } ) -register( - id='EpisodicSimpleReacher-v0', - entry_point='alr_envs.classic_control:EpisodicSimpleReacherEnv', - max_episode_steps=200, - kwargs={ - "n_links": 2, - } -) - -register( - id='EpisodicSimpleReacher-v1', - entry_point='alr_envs.classic_control:EpisodicSimpleReacherEnv', - max_episode_steps=200, - kwargs={ - "n_links": 2, - "random_start": False - } -) - register( id='LongSimpleReacher-v0', entry_point='alr_envs.classic_control:SimpleReacherEnv', @@ -157,6 +142,18 @@ register( } ) +register( + id='LongSimpleReacher-v1', + entry_point='alr_envs.classic_control:SimpleReacherEnv', + max_episode_steps=200, + kwargs={ + "n_links": 5, + "random_start": False + } +) + +## Viapoint Reacher + register( id='ViaPointReacher-v0', entry_point='alr_envs.classic_control.viapoint_reacher:ViaPointReacher', @@ -168,27 +165,45 @@ register( } ) +## Hole Reacher register( id='HoleReacher-v0', - 
entry_point='alr_envs.classic_control.hole_reacher:HoleReacher', + entry_point='alr_envs.classic_control.hole_reacher:HoleReacherEnv', max_episode_steps=200, kwargs={ "n_links": 5, "allow_self_collision": False, "allow_wall_collision": False, - "hole_width": 0.25, + "hole_width": None, "hole_depth": 1, - "hole_x": 2, + "hole_x": None, + "collision_penalty": 100, + } +) + +register( + id='HoleReacher-v1', + entry_point='alr_envs.classic_control.hole_reacher:HoleReacherEnv', + max_episode_steps=200, + kwargs={ + "n_links": 5, + "random_start": False, + "allow_self_collision": False, + "allow_wall_collision": False, + "hole_width": None, + "hole_depth": 1, + "hole_x": None, "collision_penalty": 100, } ) register( id='HoleReacher-v2', - entry_point='alr_envs.classic_control.hole_reacher_v2:HoleReacher', + entry_point='alr_envs.classic_control.hole_reacher:HoleReacherEnv', max_episode_steps=200, kwargs={ "n_links": 5, + "random_start": False, "allow_self_collision": False, "allow_wall_collision": False, "hole_width": 0.25, @@ -199,38 +214,24 @@ register( ) # MP environments - -register( - id='SimpleReacherDMP-v0', - entry_point='alr_envs.utils.make_env_helpers:make_dmp_env', - # max_episode_steps=1, - kwargs={ - "name": "alr_envs:EpisodicSimpleReacher-v0", - "num_dof": 2, - "num_basis": 5, - "duration": 2, - "alpha_phase": 2, - "learn_goal": True, - "policy_type": "velocity", - "weights_scale": 50, - } -) - -register( - id='SimpleReacherDMP-v1', - entry_point='alr_envs.utils.make_env_helpers:make_dmp_env', - # max_episode_steps=1, - kwargs={ - "name": "alr_envs:EpisodicSimpleReacher-v1", - "num_dof": 2, - "num_basis": 5, - "duration": 2, - "alpha_phase": 2, - "learn_goal": True, - "policy_type": "velocity", - "weights_scale": 50, - } -) +reacher_envs = ["SimpleReacher-v0", "SimpleReacher-v1", "LongSimpleReacher-v0", "LongSimpleReacher-v1"] +for env in reacher_envs: + name = env.split("-") + register( + id=f'{name[0]}DMP-{name[1]}', + entry_point='alr_envs.utils.make_env_helpers:make_dmp_env', + # max_episode_steps=1, + kwargs={ + "name": f"alr_envs:{env}", + "num_dof": 2 if "long" not in env.lower() else 5 , + "num_basis": 5, + "duration": 2, + "alpha_phase": 2, + "learn_goal": True, + "policy_type": "velocity", + "weights_scale": 50, + } + ) register( id='ViaPointReacherDMP-v0', @@ -266,6 +267,24 @@ register( } ) +register( + id='HoleReacherDMP-v1', + entry_point='alr_envs.utils.make_env_helpers:make_dmp_env', + # max_episode_steps=1, + kwargs={ + "name": "alr_envs:HoleReacher-v1", + "num_dof": 5, + "num_basis": 5, + "duration": 2, + "learn_goal": True, + "alpha_phase": 2, + "bandwidth_factor": 2, + "policy_type": "velocity", + "weights_scale": 50, + "goal_scale": 0.1 + } +) + register( id='HoleReacherDMP-v2', entry_point='alr_envs.utils.make_env_helpers:make_dmp_env', diff --git a/alr_envs/classic_control/__init__.py b/alr_envs/classic_control/__init__.py index 8087136..fde30fc 100644 --- a/alr_envs/classic_control/__init__.py +++ b/alr_envs/classic_control/__init__.py @@ -1,4 +1,4 @@ from alr_envs.classic_control.simple_reacher import SimpleReacherEnv from alr_envs.classic_control.episodic_simple_reacher import EpisodicSimpleReacherEnv from alr_envs.classic_control.viapoint_reacher import ViaPointReacher -from alr_envs.classic_control.hole_reacher import HoleReacher +from alr_envs.classic_control.hole_reacher import HoleReacherEnv diff --git a/alr_envs/classic_control/episodic_simple_reacher.py b/alr_envs/classic_control/episodic_simple_reacher.py index b02efe8..2e7f604 100644 --- 
a/alr_envs/classic_control/episodic_simple_reacher.py +++ b/alr_envs/classic_control/episodic_simple_reacher.py @@ -35,7 +35,7 @@ class EpisodicSimpleReacherEnv(SimpleReacherEnv): def _get_obs(self): if self.random_start: - theta = self._joint_angle + theta = self._joint_angles return np.hstack([ np.cos(theta), np.sin(theta), diff --git a/alr_envs/classic_control/hole_reacher.py b/alr_envs/classic_control/hole_reacher.py index 45bc1c3..0e008aa 100644 --- a/alr_envs/classic_control/hole_reacher.py +++ b/alr_envs/classic_control/hole_reacher.py @@ -10,11 +10,12 @@ from alr_envs.classic_control.utils import check_self_collision from alr_envs.utils.mps.mp_environments import MPEnv -class HoleReacher(MPEnv): +class HoleReacherEnv(MPEnv): - def __init__(self, n_links, hole_x: Union[None, float] = None, hole_depth: Union[None, float] = None, - hole_width: float = 1., random_start: bool = True, allow_self_collision: bool = False, + def __init__(self, n_links: int, hole_x: Union[None, float] = None, hole_depth: Union[None, float] = None, + hole_width: float = 1., random_start: bool = False, allow_self_collision: bool = False, allow_wall_collision: bool = False, collision_penalty: bool = 1000): + self.n_links = n_links self.link_lengths = np.ones((n_links, 1)) @@ -25,10 +26,11 @@ class HoleReacher(MPEnv): self._hole_width = hole_width # width of hole self._hole_depth = hole_depth # depth of hole - # temp containers to store current setting + # temp container for current env state self._tmp_hole_x = None self._tmp_hole_width = None self._tmp_hole_depth = None + self._goal = None # x-y coordinates for reaching the center at the bottom of the hole # collision self.allow_self_collision = allow_self_collision @@ -36,14 +38,13 @@ class HoleReacher(MPEnv): self.collision_penalty = collision_penalty # state + self._joints = None self._joint_angles = None self._angle_velocity = None - self._joints = None self._start_pos = np.hstack([[np.pi / 2], np.zeros(self.n_links - 1)]) self._start_vel = np.zeros(self.n_links) self.dt = 0.01 - # self.time_limit = 2 action_bound = np.pi * np.ones((self.n_links,)) state_bound = np.hstack([ @@ -58,54 +59,43 @@ class HoleReacher(MPEnv): self.action_space = gym.spaces.Box(low=-action_bound, high=action_bound, shape=action_bound.shape) self.observation_space = gym.spaces.Box(low=-state_bound, high=state_bound, shape=state_bound.shape) - plt.ion() + # containers for plotting + self.metadata = {'render.modes': ["human", "partial"]} self.fig = None + self._steps = 0 self.seed() - @property - def corrected_obs_index(self): - return np.hstack([ - [self.random_start] * self.n_links, # cos - [self.random_start] * self.n_links, # sin - [self.random_start] * self.n_links, # velocity - [self._hole_width is None], # hole width - [self._hole_depth is None], # hole width - [True] * 2, # x-y coordinates of target distance - [False] # env steps - ]) + def step(self, action: np.ndarray): + """ + A single step with an action in joint velocity space + """ - def seed(self, seed=None): - self.np_random, seed = seeding.np_random(seed) - return [seed] + self._angle_velocity = action + self._joint_angles = self._joint_angles + self.dt * self._angle_velocity + self._update_joints() - @property - def end_effector(self): - return self._joints[self.n_links].T + acc = (action - self._angle_velocity) / self.dt + reward, info = self._get_reward(acc) - def _generate_hole(self): - hole_x = self.np_random.uniform(0.5, 3.5, 1) if self._hole_x is None else np.copy(self._hole_x) - hole_width = 
self.np_random.uniform(0.5, 0.1, 1) if self._hole_width is None else np.copy(self._hole_width) - # TODO we do not want this right now. - hole_depth = self.np_random.uniform(1, 1, 1) if self._hole_depth is None else np.copy(self._hole_depth) + info.update({"is_collided": self._is_collided}) - self.bottom_center_of_hole = np.hstack([hole_x, -hole_depth]) - self.top_center_of_hole = np.hstack([hole_x, 0]) - self.left_wall_edge = np.hstack([hole_x - hole_width / 2, 0]) - self.right_wall_edge = np.hstack([hole_x + hole_width / 2, 0]) + self._steps += 1 + done = self._is_collided - return hole_x, hole_width, hole_depth + return self._get_obs().copy(), reward, done, info def reset(self): if self.random_start: - # MAybe change more than dirst seed + # Maybe change more than dirst seed first_joint = self.np_random.uniform(np.pi / 4, 3 * np.pi / 4) self._joint_angles = np.hstack([[first_joint], np.zeros(self.n_links - 1)]) + self._start_pos = self._joint_angles.copy() else: self._joint_angles = self._start_pos - self._tmp_hole_x, self._tmp_hole_width, self._tmp_hole_depth = self._generate_hole() - self.set_patches() + self._generate_hole() + self._set_patches() self._angle_velocity = self._start_vel self._joints = np.zeros((self.n_links + 1, 2)) @@ -114,42 +104,14 @@ class HoleReacher(MPEnv): return self._get_obs().copy() - def step(self, action: np.ndarray): - """ - a single step with an action in joint velocity space - """ - vel = action # + 0.01 * np.random.randn(self.num_links) - acc = (vel - self._angle_velocity) / self.dt - self._angle_velocity = vel - self._joint_angles = self._joint_angles + self.dt * self._angle_velocity - - self._update_joints() - - # rew = self._reward() - - # compute reward directly in step function - - success = False - reward = 0 - if not self._is_collided: - # return reward only in last time step - if self._steps == 199: - dist = np.linalg.norm(self.end_effector - self.bottom_center_of_hole) - reward = - dist ** 2 - success = dist < 0.005 - else: - # Episode terminates when colliding, hence return reward - dist = np.linalg.norm(self.end_effector - self.bottom_center_of_hole) - reward = - dist ** 2 - self.collision_penalty - - reward -= 5e-8 * np.sum(acc ** 2) - - info = {"is_collided": self._is_collided, "is_success": success} - - self._steps += 1 - done = self._is_collided - - return self._get_obs().copy(), reward, done, info + def _generate_hole(self): + self._tmp_hole_x = self.np_random.uniform(0.5, 3.5, 1) if self._hole_x is None else np.copy(self._hole_x) + self._tmp_hole_width = self.np_random.uniform(0.5, 0.1, 1) if self._hole_width is None else np.copy( + self._hole_width) + # TODO we do not want this right now. 
+ self._tmp_hole_depth = self.np_random.uniform(1, 1, 1) if self._hole_depth is None else np.copy( + self._hole_depth) + self._goal = np.hstack([self._tmp_hole_x, -self._tmp_hole_depth]) def _update_joints(self): """ @@ -157,7 +119,7 @@ class HoleReacher(MPEnv): Returns: """ - line_points_in_taskspace = self.get_forward_kinematics(num_points_per_link=20) + line_points_in_taskspace = self._get_forward_kinematics(num_points_per_link=20) self._joints[1:, 0] = self._joints[0, 0] + line_points_in_taskspace[:, -1, 0] self._joints[1:, 1] = self._joints[0, 1] + line_points_in_taskspace[:, -1, 1] @@ -171,23 +133,43 @@ class HoleReacher(MPEnv): self_collision = True if not self.allow_wall_collision: - wall_collision = self.check_wall_collision(line_points_in_taskspace) + wall_collision = self._check_wall_collision(line_points_in_taskspace) self._is_collided = self_collision or wall_collision + def _get_reward(self, acc: np.ndarray): + success = False + reward = -np.inf + if not self._is_collided: + dist = 0 + # return reward only in last time step + if self._steps == 199: + dist = np.linalg.norm(self.end_effector - self._goal) + success = dist < 0.005 + else: + # Episode terminates when colliding, hence return reward + dist = np.linalg.norm(self.end_effector - self._goal) + reward = -self.collision_penalty + + reward -= dist ** 2 + reward -= 5e-8 * np.sum(acc ** 2) + info = {"is_success": success} + + return reward, info + def _get_obs(self): theta = self._joint_angles return np.hstack([ np.cos(theta), np.sin(theta), self._angle_velocity, - self._hole_width, - self._hole_depth, - self.end_effector - self.bottom_center_of_hole, + self._tmp_hole_width, + self._tmp_hole_depth, + self.end_effector - self._goal, self._steps ]) - def get_forward_kinematics(self, num_points_per_link=1): + def _get_forward_kinematics(self, num_points_per_link=1): theta = self._joint_angles[:, None] intermediate_points = np.linspace(0, 1, num_points_per_link) if num_points_per_link > 1 else 1 @@ -206,7 +188,7 @@ class HoleReacher(MPEnv): return np.squeeze(end_effector + self._joints[0, :]) - def check_wall_collision(self, line_points): + def _check_wall_collision(self, line_points): # all points that are before the hole in x r, c = np.where(line_points[:, :, 0] < (self._tmp_hole_x - self._tmp_hole_width / 2)) @@ -240,6 +222,7 @@ class HoleReacher(MPEnv): def render(self, mode='human'): if self.fig is None: + # Create base figure once on the beginning. 
Afterwards only update plt.ion() self.fig = plt.figure() ax = self.fig.add_subplot(1, 1, 1) @@ -250,74 +233,74 @@ class HoleReacher(MPEnv): ax.set_ylim([-1.1, lim]) self.line, = ax.plot(self._joints[:, 0], self._joints[:, 1], 'ro-', markerfacecolor='k') - self.set_patches() + self._set_patches() self.fig.show() + self.fig.gca().set_title( + f"Iteration: {self._steps}, distance: {self.end_effector - self._goal}") + if mode == "human": - self.fig.gca().set_title( - f"Iteration: {self._steps}, distance: {self.end_effector - self.bottom_center_of_hole}") - # Arm - plt.plot(self._joints[:, 0], self._joints[:, 1], 'ro-', markerfacecolor='k') - - # Arm - self.line.set_xdata(self._joints[:, 0]) - self.line.set_ydata(self._joints[:, 1]) + # arm + self.line.set_data(self._joints[:, 0], self._joints[:, 1]) self.fig.canvas.draw() self.fig.canvas.flush_events() - # self.fig.show() elif mode == "partial": - if self._steps == 1: - # fig, ax = plt.subplots() - # Add the patch to the Axes - [plt.gca().add_patch(rect) for rect in self.patches] - # plt.pause(0.01) - if self._steps % 20 == 0 or self._steps in [1, 199] or self._is_collided: # Arm - plt.plot(self._joints[:, 0], self._joints[:, 1], 'ro-', markerfacecolor='k', alpha=self._steps / 200) - # ax.plot(line_points_in_taskspace[:, 0, 0], - # line_points_in_taskspace[:, 0, 1], - # line_points_in_taskspace[:, -1, 0], - # line_points_in_taskspace[:, -1, 1], marker='o', color='k', alpha=t / 200) + plt.plot(self._joints[:, 0], self._joints[:, 1], 'ro-', markerfacecolor='k', + alpha=self._steps / 200) - lim = np.sum(self.link_lengths) + 0.5 - plt.xlim([-lim, lim]) - plt.ylim([-1.1, lim]) - plt.pause(0.01) - - elif mode == "final": - if self._steps == 199 or self._is_collided: - # fig, ax = plt.subplots() - - # Add the patch to the Axes - [plt.gca().add_patch(rect) for rect in self.patches] - - plt.xlim(-self.n_links, self.n_links), plt.ylim(-1, self.n_links) - # Arm - plt.plot(self._joints[:, 0], self._joints[:, 1], 'ro-', markerfacecolor='k') - - plt.pause(0.01) - - def set_patches(self): + def _set_patches(self): if self.fig is not None: self.fig.gca().patches = [] - rect_1 = patches.Rectangle((-self.n_links, -1), self.n_links + self._tmp_hole_x - self._tmp_hole_width / 2, - 1, - fill=True, edgecolor='k', facecolor='k') - rect_2 = patches.Rectangle((self._tmp_hole_x + self._tmp_hole_width / 2, -1), - self.n_links - self._tmp_hole_x + self._tmp_hole_width / 2, 1, - fill=True, edgecolor='k', facecolor='k') - rect_3 = patches.Rectangle((self._tmp_hole_x - self._tmp_hole_width / 2, -1), self._tmp_hole_width, - 1 - self._tmp_hole_depth, - fill=True, edgecolor='k', facecolor='k') + left_block = patches.Rectangle((-self.n_links, -self._tmp_hole_depth), + self.n_links + self._tmp_hole_x - self._tmp_hole_width / 2, + self._tmp_hole_depth, + fill=True, edgecolor='k', facecolor='k') + right_block = patches.Rectangle((self._tmp_hole_x + self._tmp_hole_width / 2, -self._tmp_hole_depth), + self.n_links - self._tmp_hole_x + self._tmp_hole_width / 2, + self._tmp_hole_depth, + fill=True, edgecolor='k', facecolor='k') + hole_floor = patches.Rectangle((self._tmp_hole_x - self._tmp_hole_width / 2, -self._tmp_hole_depth), + self._tmp_hole_width, + 1 - self._tmp_hole_depth, + fill=True, edgecolor='k', facecolor='k') # Add the patch to the Axes - self.fig.gca().add_patch(rect_1) - self.fig.gca().add_patch(rect_2) - self.fig.gca().add_patch(rect_3) + self.fig.gca().add_patch(left_block) + self.fig.gca().add_patch(right_block) + self.fig.gca().add_patch(hole_floor) + + @property + 
def active_obs(self): + return np.hstack([ + [self.random_start] * self.n_links, # cos + [self.random_start] * self.n_links, # sin + [self.random_start] * self.n_links, # velocity + [self._hole_width is None], # hole width + [self._hole_depth is None], # hole width + [True] * 2, # x-y coordinates of target distance + [False] # env steps + ]) + + @property + def start_pos(self) -> Union[float, int, np.ndarray]: + return self._start_pos + + @property + def goal_pos(self) -> Union[float, int, np.ndarray]: + raise ValueError("Goal position is not available and has to be learnt based on the environment.") + + def seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + return [seed] + + @property + def end_effector(self): + return self._joints[self.n_links].T def close(self): if self.fig is not None: @@ -327,20 +310,16 @@ class HoleReacher(MPEnv): if __name__ == '__main__': nl = 5 render_mode = "human" # "human" or "partial" or "final" - env = HoleReacher(n_links=nl, allow_self_collision=False, allow_wall_collision=False, hole_width=None, - hole_depth=1, hole_x=None) - env.reset() - # env.render(mode=render_mode) + env = HoleReacherEnv(n_links=nl, allow_self_collision=False, allow_wall_collision=False, hole_width=None, + hole_depth=1, hole_x=None) + obs = env.reset() for i in range(200): # objective.load_result("/tmp/cma") # test with random actions ac = 2 * env.action_space.sample() - # ac[0] += np.pi/2 obs, rew, d, info = env.step(ac) - # if i % 1 == 0: - if i == 0: - env.render(mode=render_mode) + env.render(mode=render_mode) print(rew) diff --git a/alr_envs/classic_control/simple_reacher.py b/alr_envs/classic_control/simple_reacher.py index 7ca4ead..4e99ff1 100644 --- a/alr_envs/classic_control/simple_reacher.py +++ b/alr_envs/classic_control/simple_reacher.py @@ -1,24 +1,21 @@ -import gym +from typing import Iterable, Union + import matplotlib.pyplot as plt import numpy as np from gym import spaces from gym.utils import seeding -from alr_envs.utils.utils import angle_normalize +from alr_envs.utils.mps.mp_environments import MPEnv -# if os.environ.get("DISPLAY", None): -# mpl.use('Qt5Agg') - - -class SimpleReacherEnv(gym.Env): +class SimpleReacherEnv(MPEnv): """ Simple Reaching Task without any physics simulation. Returns no reward until 150 time steps. This allows the agent to explore the space, but requires precise actions towards the end of the trajectory. 
""" - def __init__(self, n_links, random_start=True): + def __init__(self, n_links: int, target: Union[None, Iterable] = None, random_start: bool = True): super().__init__() self.link_lengths = np.ones(n_links) self.n_links = n_links @@ -26,17 +23,19 @@ class SimpleReacherEnv(gym.Env): self.random_start = random_start - self._goal = None - self._joints = None - self._joint_angle = None + self._joint_angles = None self._angle_velocity = None - self._start_pos = None + self._start_pos = np.zeros(self.n_links) + self._start_vel = np.zeros(self.n_links) - self.max_torque = 1 # 10 + self._target = target # provided target value + self._goal = None # updated goal value, does not change when target != None + + self.max_torque = 1 self.steps_before_reward = 199 - action_bound = np.ones((self.n_links,)) + action_bound = np.ones((self.n_links,)) * self.max_torque state_bound = np.hstack([ [np.pi] * self.n_links, # cos [np.pi] * self.n_links, # sin @@ -47,49 +46,50 @@ class SimpleReacherEnv(gym.Env): self.action_space = spaces.Box(low=-action_bound, high=action_bound, shape=action_bound.shape) self.observation_space = spaces.Box(low=-state_bound, high=state_bound, shape=state_bound.shape) - self.fig = None + # containers for plotting self.metadata = {'render.modes': ["human"]} + self.fig = None self._steps = 0 self.seed() def step(self, action: np.ndarray): + """ + A single step with action in torque space + """ # action = self._add_action_noise(action) - action = np.clip(action, -self.max_torque, self.max_torque) + ac = np.clip(action, -self.max_torque, self.max_torque) - self._angle_velocity = self._angle_velocity + self.dt * action - self._joint_angle = angle_normalize(self._joint_angle + self.dt * self._angle_velocity) + self._angle_velocity = self._angle_velocity + self.dt * ac + self._joint_angles = self._joint_angles + self.dt * self._angle_velocity self._update_joints() - self._steps += 1 reward, info = self._get_reward(action) - # done = np.abs(self.end_effector - self._goal_pos) < 0.1 + self._steps += 1 done = False return self._get_obs().copy(), reward, done, info - def _add_action_noise(self, action: np.ndarray): - """ - add unobserved Gaussian Noise N(0,0.01) to the actions - Args: - action: + def reset(self): - Returns: actions with noise + # TODO: maybe do initialisation more random? + # Sample only orientation of first link, i.e. the arm is always straight. 
+ if self.random_start: + self._joint_angles = np.hstack([[self.np_random.uniform(-np.pi, np.pi)], np.zeros(self.n_links - 1)]) + self._start_pos = self._joint_angles.copy() + else: + self._joint_angles = self._start_pos - """ - return self.np_random.normal(0, 0.1, *action.shape) + action + self._generate_goal() - def _get_obs(self): - theta = self._joint_angle - return np.hstack([ - np.cos(theta), - np.sin(theta), - self._angle_velocity, - self.end_effector - self._goal, - self._steps - ]) + self._angle_velocity = self._start_vel + self._joints = np.zeros((self.n_links + 1, 2)) + self._update_joints() + self._steps = 0 + + return self._get_obs().copy() def _update_joints(self): """ @@ -97,7 +97,7 @@ class SimpleReacherEnv(gym.Env): Returns: """ - angles = np.cumsum(self._joint_angle) + angles = np.cumsum(self._joint_angles) x = self.link_lengths * np.vstack([np.cos(angles), np.sin(angles)]) self._joints[1:] = self._joints[0] + np.cumsum(x.T, axis=0) @@ -105,7 +105,6 @@ class SimpleReacherEnv(gym.Env): diff = self.end_effector - self._goal reward_dist = 0 - # TODO: Is this the best option if self._steps >= self.steps_before_reward: reward_dist -= np.linalg.norm(diff) # reward_dist = np.exp(-0.1 * diff ** 2).mean() @@ -115,67 +114,118 @@ class SimpleReacherEnv(gym.Env): reward = reward_dist - reward_ctrl return reward, dict(reward_dist=reward_dist, reward_ctrl=reward_ctrl) - def reset(self): + def _get_obs(self): + theta = self._joint_angles + return np.hstack([ + np.cos(theta), + np.sin(theta), + self._angle_velocity, + self.end_effector - self._goal, + self._steps + ]) - # TODO: maybe do initialisation more random? - # Sample only orientation of first link, i.e. the arm is always straight. - if self.random_start: - self._joint_angle = np.hstack([[self.np_random.uniform(-np.pi, np.pi)], np.zeros(self.n_links - 1)]) + def _generate_goal(self): + + if self._target is None: + # center = self._joints[0] + # # Sample uniformly in circle with radius R around center of reacher. + # R = np.sum(self.link_lengths) + # r = R * np.sqrt(self.np_random.uniform()) + # theta = self.np_random.uniform() * 2 * np.pi + # goal = center + r * np.stack([np.cos(theta), np.sin(theta)]) + + total_length = np.sum(self.link_lengths) + goal = np.array([total_length, total_length]) + while np.linalg.norm(goal) >= total_length: + goal = self.np_random.uniform(low=-total_length, high=total_length, size=2) else: - self._joint_angle = np.zeros(self.n_links) + goal = np.copy(self._target) - self._start_pos = self._joint_angle - self._angle_velocity = np.zeros(self.n_links) - self._joints = np.zeros((self.n_links + 1, 2)) - self._update_joints() - self._steps = 0 + self._goal = goal - self._goal = self._get_random_goal() - return self._get_obs().copy() + def render(self, mode='human'): # pragma: no cover + if self.fig is None: + # Create base figure once on the beginning. Afterwards only update + plt.ion() + self.fig = plt.figure() + ax = self.fig.add_subplot(1, 1, 1) - def _get_random_goal(self): - center = self._joints[0] + # limits + lim = np.sum(self.link_lengths) + 0.5 + ax.set_xlim([-lim, lim]) + ax.set_ylim([-lim, lim]) - # Sample uniformly in circle with radius R around center of reacher. 
- R = np.sum(self.link_lengths) - r = R * np.sqrt(self.np_random.uniform()) - theta = self.np_random.uniform() * 2 * np.pi - return center + r * np.stack([np.cos(theta), np.sin(theta)]) + self.line, = ax.plot(self._joints[:, 0], self._joints[:, 1], 'ro-', markerfacecolor='k') + goal_pos = self._goal.T + self.goal_point, = ax.plot(goal_pos[0], goal_pos[1], 'gx') + self.goal_dist, = ax.plot([self.end_effector[0], goal_pos[0]], [self.end_effector[1], goal_pos[1]], 'g--') + + self.fig.show() + + self.fig.gca().set_title(f"Iteration: {self._steps}, distance: {self.end_effector - self._goal}") + + # goal + goal_pos = self._goal.T + if self._steps == 1: + self.goal_point.set_data(goal_pos[0], goal_pos[1]) + + # arm + self.line.set_data(self._joints[:, 0], self._joints[:, 1]) + + # distance between end effector and goal + self.goal_dist.set_data([self.end_effector[0], goal_pos[0]], [self.end_effector[1], goal_pos[1]]) + + self.fig.canvas.draw() + self.fig.canvas.flush_events() + + @property + def active_obs(self): + return np.hstack([ + [self.random_start] * self.n_links, # cos + [self.random_start] * self.n_links, # sin + [self.random_start] * self.n_links, # velocity + [True] * 2, # x-y coordinates of target distance + [False] # env steps + ]) + + @property + def start_pos(self): + return self._start_pos + + @property + def goal_pos(self): + raise ValueError("Goal position is not available and has to be learnt based on the environment.") def seed(self, seed=None): self.np_random, seed = seeding.np_random(seed) return [seed] - def render(self, mode='human'): # pragma: no cover - if self.fig is None: - self.fig = plt.figure() - plt.ion() - plt.show() - else: - plt.figure(self.fig.number) - - plt.cla() - plt.title(f"Iteration: {self._steps}, distance: {self.end_effector - self._goal}") - - # Arm - plt.plot(self._joints[:, 0], self._joints[:, 1], 'ro-', markerfacecolor='k') - - # goal - goal_pos = self._goal.T - plt.plot(goal_pos[0], goal_pos[1], 'gx') - # distance between end effector and goal - plt.plot([self.end_effector[0], goal_pos[0]], [self.end_effector[1], goal_pos[1]], 'g--') - - lim = np.sum(self.link_lengths) + 0.5 - plt.xlim([-lim, lim]) - plt.ylim([-lim, lim]) - # plt.draw() - # plt.pause(1e-4) pushes window to foreground, which is annoying. 
- self.fig.canvas.flush_events() - def close(self): del self.fig @property def end_effector(self): return self._joints[self.n_links].T + + +if __name__ == '__main__': + nl = 5 + render_mode = "human" # "human" or "partial" or "final" + env = SimpleReacherEnv(n_links=nl) + obs = env.reset() + print("First", obs) + + for i in range(2000): + # objective.load_result("/tmp/cma") + # test with random actions + ac = 2 * env.action_space.sample() + # ac = np.ones(env.action_space.shape) + obs, rew, d, info = env.step(ac) + env.render(mode=render_mode) + + print(obs[env.active_obs].shape) + + if d or i % 200 == 0: + env.reset() + + env.close() diff --git a/alr_envs/classic_control/viapoint_reacher.py b/alr_envs/classic_control/viapoint_reacher.py index 127bf77..ac36360 100644 --- a/alr_envs/classic_control/viapoint_reacher.py +++ b/alr_envs/classic_control/viapoint_reacher.py @@ -1,19 +1,31 @@ +from typing import Iterable, Union + import gym import matplotlib.pyplot as plt import numpy as np +from gym.utils import seeding from alr_envs.classic_control.utils import check_self_collision +from alr_envs.utils.mps.mp_environments import MPEnv -class ViaPointReacher(gym.Env): +class ViaPointReacher(MPEnv): - def __init__(self, n_links, allow_self_collision=False, collision_penalty=1000): - self.num_links = n_links + def __init__(self, n_links, random_start: bool = True, via_target: Union[None, Iterable] = None, + target: Union[None, Iterable] = None, allow_self_collision=False, collision_penalty=1000): + + self.n_links = n_links self.link_lengths = np.ones((n_links, 1)) - # task - self.via_point = np.ones(2) - self.goal_point = np.array((n_links, 0)) + self.random_start = random_start + + # provided initial parameters + self._target = target # provided target value + self._via_target = via_target # provided via point target value + + # temp container for current env state + self._via_point = np.ones(2) + self._goal = np.array((n_links, 0)) # collision self.allow_self_collision = allow_self_collision @@ -23,78 +35,74 @@ class ViaPointReacher(gym.Env): self._joints = None self._joint_angles = None self._angle_velocity = None - self.start_pos = np.hstack([[np.pi / 2], np.zeros(self.num_links - 1)]) - self.start_vel = np.zeros(self.num_links) + self._start_pos = np.hstack([[np.pi / 2], np.zeros(self.n_links - 1)]) + self._start_vel = np.zeros(self.n_links) self.weight_matrix_scale = 1 - self._steps = 0 self.dt = 0.01 - # self.time_limit = 2 - action_bound = np.pi * np.ones((self.num_links,)) + action_bound = np.pi * np.ones((self.n_links,)) state_bound = np.hstack([ - [np.pi] * self.num_links, # cos - [np.pi] * self.num_links, # sin - [np.inf] * self.num_links, # velocity + [np.pi] * self.n_links, # cos + [np.pi] * self.n_links, # sin + [np.inf] * self.n_links, # velocity + [np.inf] * 2, # x-y coordinates of via point distance [np.inf] * 2, # x-y coordinates of target distance - [np.inf] # env steps, because reward start after n steps TODO: Maybe + [np.inf] # env steps, because reward start after n steps ]) self.action_space = gym.spaces.Box(low=-action_bound, high=action_bound, shape=action_bound.shape) self.observation_space = gym.spaces.Box(low=-state_bound, high=state_bound, shape=state_bound.shape) + # containers for plotting + self.metadata = {'render.modes': ["human", "partial"]} self.fig = None - @property - def end_effector(self): - return self._joints[self.num_links].T - - def configure(self, context): - pass - - def reset(self): - self._joint_angles = self.start_pos - self._angle_velocity = 
self.start_vel - self._joints = np.zeros((self.num_links + 1, 2)) - self._update_joints() self._steps = 0 - - return self._get_obs().copy() + self.seed() def step(self, action: np.ndarray): """ a single step with an action in joint velocity space """ vel = action - acc = (vel - self._angle_velocity) / self.dt self._angle_velocity = vel self._joint_angles = self._joint_angles + self.dt * self._angle_velocity - self._update_joints() - dist_reward = 0 - if not self._is_collided: - if self._steps == 100: - dist_reward = np.linalg.norm(self.end_effector - self.via_point) - elif self._steps == 199: - dist_reward = np.linalg.norm(self.end_effector - self.goal_point) + acc = (vel - self._angle_velocity) / self.dt + reward, info = self._get_reward(acc) - # TODO: Do we need that? - reward = - dist_reward ** 2 - - reward -= 5e-8 * np.sum(acc**2) - - if self._is_collided: - reward -= self.collision_penalty - - info = {"is_collided": self._is_collided} + info.update({"is_collided": self._is_collided}) self._steps += 1 - - # done = self._steps * self.dt > self.time_limit or self._is_collided done = self._is_collided return self._get_obs().copy(), reward, done, info + def reset(self): + + if self.random_start: + # Maybe change more than dirst seed + first_joint = self.np_random.uniform(np.pi / 4, 3 * np.pi / 4) + self._joint_angles = np.hstack([[first_joint], np.zeros(self.n_links - 1)]) + self._start_pos = self._joint_angles.copy() + else: + self._joint_angles = self._start_pos + + self._generate_goal() + + self._angle_velocity = self._start_vel + self._joints = np.zeros((self.n_links + 1, 2)) + self._update_joints() + self._steps = 0 + + return self._get_obs().copy() + + def _generate_goal(self): + self._via_point = self.np_random.uniform(0.5, 3.5, 2) if self._via_target is None else np.copy(self._via_target) + self._goal = self.np_random.uniform(0.5, 0.1, 2) if self._target is None else np.copy(self._target) + # raise NotImplementedError("How to properly sample points??") + def _update_joints(self): """ update _joints to get new end effector position. The other links are only required for rendering. 
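The constructor and reset logic above follow the same convention as the reworked HoleReacherEnv and SimpleReacherEnv: task parameters handed to __init__ (here via_target and target) are treated as fixed and simply copied on every reset, while values left as None are re-sampled per episode inside _generate_goal(). A minimal usage sketch of that convention follows; it is illustrative only and not part of the patch, and the concrete coordinates are made up.

import numpy as np

from alr_envs.classic_control.viapoint_reacher import ViaPointReacher

# Fixed task: via point and goal are provided, so _generate_goal() just copies them.
fixed_env = ViaPointReacher(n_links=5, random_start=False,
                            via_target=np.array([0.5, 0.5]),
                            target=np.array([2.0, -1.0]))

# Contextual task: via_target and target stay None and are re-sampled on every reset.
sampled_env = ViaPointReacher(n_links=5, random_start=False)

obs = sampled_env.reset()
obs, reward, done, info = sampled_env.step(np.zeros(sampled_env.n_links))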
@@ -115,14 +123,38 @@ class ViaPointReacher(gym.Env): self._is_collided = self_collision + def _get_reward(self, acc): + success = False + reward = -np.inf + if not self._is_collided: + dist = np.inf + # return intermediate reward for via point + if self._steps == 100: + dist = np.linalg.norm(self.end_effector - self._via_point) + # return reward in last time step for goal + elif self._steps == 199: + dist = np.linalg.norm(self.end_effector - self._goal) + + success = dist < 0.005 + else: + # Episode terminates when colliding, hence return reward + dist = np.linalg.norm(self.end_effector - self._goal) + reward = -self.collision_penalty + + reward -= dist ** 2 + reward -= 5e-8 * np.sum(acc ** 2) + info = {"is_success": success} + + return reward, info + def _get_obs(self): theta = self._joint_angles return np.hstack([ np.cos(theta), np.sin(theta), self._angle_velocity, - self.end_effector - self.via_point, - self.end_effector - self.goal_point, + self.end_effector - self._via_point, + self.end_effector - self._goal, self._steps ]) @@ -133,7 +165,7 @@ class ViaPointReacher(gym.Env): accumulated_theta = np.cumsum(theta, axis=0) - endeffector = np.zeros(shape=(self.num_links, num_points_per_link, 2)) + endeffector = np.zeros(shape=(self.n_links, num_points_per_link, 2)) x = np.cos(accumulated_theta) * self.link_lengths * intermediate_points y = np.sin(accumulated_theta) * self.link_lengths * intermediate_points @@ -141,33 +173,46 @@ class ViaPointReacher(gym.Env): endeffector[0, :, 0] = x[0, :] endeffector[0, :, 1] = y[0, :] - for i in range(1, self.num_links): + for i in range(1, self.n_links): endeffector[i, :, 0] = x[i, :] + endeffector[i - 1, -1, 0] endeffector[i, :, 1] = y[i, :] + endeffector[i - 1, -1, 1] return np.squeeze(endeffector + self._joints[0, :]) def render(self, mode='human'): + goal_pos = self._goal.T + via_pos = self._via_point.T + if self.fig is None: + # Create base figure once on the beginning. Afterwards only update + plt.ion() self.fig = plt.figure() - # plt.ion() - # plt.pause(0.01) - else: - plt.figure(self.fig.number) + ax = self.fig.add_subplot(1, 1, 1) + + # limits + lim = np.sum(self.link_lengths) + 0.5 + ax.set_xlim([-lim, lim]) + ax.set_ylim([-lim, lim]) + + self.line, = ax.plot(self._joints[:, 0], self._joints[:, 1], 'ro-', markerfacecolor='k') + self.goal_point_plot, = ax.plot(goal_pos[0], goal_pos[1], 'go') + self.via_point_plot, = ax.plot(via_pos[0], via_pos[1], 'gx') + + self.fig.show() + + self.fig.gca().set_title(f"Iteration: {self._steps}, distance: {self.end_effector - self._goal}") if mode == "human": - plt.cla() - plt.title(f"Iteration: {self._steps}") + # goal + if self._steps == 1: + self.goal_point_plot.set_data(goal_pos[0], goal_pos[1]) + self.via_point_plot.set_data(via_pos[0], goal_pos[1]) - # Arm - plt.plot(self._joints[:, 0], self._joints[:, 1], 'ro-', markerfacecolor='k') + # arm + self.line.set_data(self._joints[:, 0], self._joints[:, 1]) - lim = np.sum(self.link_lengths) + 0.5 - plt.xlim([-lim, lim]) - plt.ylim([-lim, lim]) - # plt.draw() - plt.pause(1e-4) # pushes window to foreground, which is annoying. 
- # self.fig.canvas.flush_events() + self.fig.canvas.draw() + self.fig.canvas.flush_events() elif mode == "partial": if self._steps == 1: @@ -196,12 +241,39 @@ class ViaPointReacher(gym.Env): # Add the patch to the Axes [plt.gca().add_patch(rect) for rect in self.patches] - plt.xlim(-self.num_links, self.num_links), plt.ylim(-1, self.num_links) + plt.xlim(-self.n_links, self.n_links), plt.ylim(-1, self.n_links) # Arm plt.plot(self._joints[:, 0], self._joints[:, 1], 'ro-', markerfacecolor='k') plt.pause(0.01) + @property + def active_obs(self): + return np.hstack([ + [self.random_start] * self.n_links, # cos + [self.random_start] * self.n_links, # sin + [self.random_start] * self.n_links, # velocity + [self._via_target is None] * 2, # x-y coordinates of via point distance + [True] * 2, # x-y coordinates of target distance + [False] # env steps + ]) + + @property + def start_pos(self) -> Union[float, int, np.ndarray]: + return self._start_pos + + @property + def goal_pos(self) -> Union[float, int, np.ndarray]: + raise ValueError("Goal position is not available and has to be learnt based on the environment.") + + def seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + return [seed] + + @property + def end_effector(self): + return self._joints[self.n_links].T + def close(self): if self.fig is not None: plt.close(self.fig) diff --git a/alr_envs/mujoco/ball_in_a_cup/utils.py b/alr_envs/mujoco/ball_in_a_cup/utils.py index d368e45..714566a 100644 --- a/alr_envs/mujoco/ball_in_a_cup/utils.py +++ b/alr_envs/mujoco/ball_in_a_cup/utils.py @@ -17,19 +17,8 @@ def make_contextual_env(rank, seed=0): def _init(): env = ALRBallInACupEnv(reward_type="contextual_goal") - env = DetPMPWrapper(env, - num_dof=7, - num_basis=5, - width=0.005, - policy_type="motor", - start_pos=env.start_pos, - duration=3.5, - post_traj_time=4.5, - dt=env.dt, - weights_scale=0.5, - zero_start=True, - zero_goal=True - ) + env = DetPMPWrapper(env, num_dof=7, num_basis=5, width=0.005, duration=3.5, dt=env.dt, post_traj_time=4.5, + policy_type="motor", weights_scale=0.5, zero_start=True, zero_goal=True) env.seed(seed + rank) return env @@ -51,19 +40,8 @@ def make_env(rank, seed=0): def _init(): env = ALRBallInACupEnv(reward_type="simple") - env = DetPMPWrapper(env, - num_dof=7, - num_basis=5, - width=0.005, - policy_type="motor", - start_pos=env.start_pos, - duration=3.5, - post_traj_time=4.5, - dt=env.dt, - weights_scale=0.2, - zero_start=True, - zero_goal=True - ) + env = DetPMPWrapper(env, num_dof=7, num_basis=5, width=0.005, duration=3.5, dt=env.dt, post_traj_time=4.5, + policy_type="motor", weights_scale=0.2, zero_start=True, zero_goal=True) env.seed(seed + rank) return env @@ -85,20 +63,8 @@ def make_simple_env(rank, seed=0): def _init(): env = ALRBallInACupEnv(reward_type="simple") - env = DetPMPWrapper(env, - num_dof=3, - num_basis=5, - width=0.005, - off=-0.1, - policy_type="motor", - start_pos=env.start_pos[1::2], - duration=3.5, - post_traj_time=4.5, - dt=env.dt, - weights_scale=0.25, - zero_start=True, - zero_goal=True - ) + env = DetPMPWrapper(env, num_dof=3, num_basis=5, width=0.005, duration=3.5, dt=env.dt, post_traj_time=4.5, + policy_type="motor", weights_scale=0.25, zero_start=True, zero_goal=True, off=-0.1) env.seed(seed + rank) return env diff --git a/alr_envs/mujoco/beerpong/utils.py b/alr_envs/mujoco/beerpong/utils.py index cdcbd13..37d2ad1 100644 --- a/alr_envs/mujoco/beerpong/utils.py +++ b/alr_envs/mujoco/beerpong/utils.py @@ -17,19 +17,8 @@ def make_contextual_env(rank, seed=0): def 
_init(): env = ALRBeerpongEnv() - env = DetPMPWrapper(env, - num_dof=7, - num_basis=5, - width=0.005, - policy_type="motor", - start_pos=env.start_pos, - duration=3.5, - post_traj_time=4.5, - dt=env.dt, - weights_scale=0.5, - zero_start=True, - zero_goal=True - ) + env = DetPMPWrapper(env, num_dof=7, num_basis=5, width=0.005, duration=3.5, dt=env.dt, post_traj_time=4.5, + policy_type="motor", weights_scale=0.5, zero_start=True, zero_goal=True) env.seed(seed + rank) return env @@ -51,19 +40,8 @@ def make_env(rank, seed=0): def _init(): env = ALRBeerpongEnvSimple() - env = DetPMPWrapper(env, - num_dof=7, - num_basis=5, - width=0.005, - policy_type="motor", - start_pos=env.start_pos, - duration=3.5, - post_traj_time=4.5, - dt=env.dt, - weights_scale=0.25, - zero_start=True, - zero_goal=True - ) + env = DetPMPWrapper(env, num_dof=7, num_basis=5, width=0.005, duration=3.5, dt=env.dt, post_traj_time=4.5, + policy_type="motor", weights_scale=0.25, zero_start=True, zero_goal=True) env.seed(seed + rank) return env @@ -85,19 +63,8 @@ def make_simple_env(rank, seed=0): def _init(): env = ALRBeerpongEnvSimple() - env = DetPMPWrapper(env, - num_dof=3, - num_basis=5, - width=0.005, - policy_type="motor", - start_pos=env.start_pos[1::2], - duration=3.5, - post_traj_time=4.5, - dt=env.dt, - weights_scale=0.5, - zero_start=True, - zero_goal=True - ) + env = DetPMPWrapper(env, num_dof=3, num_basis=5, width=0.005, duration=3.5, dt=env.dt, post_traj_time=4.5, + policy_type="motor", weights_scale=0.5, zero_start=True, zero_goal=True) env.seed(seed + rank) return env diff --git a/alr_envs/utils/legacy/utils.py b/alr_envs/utils/legacy/utils.py index fbcb34a..e96eee6 100644 --- a/alr_envs/utils/legacy/utils.py +++ b/alr_envs/utils/legacy/utils.py @@ -49,13 +49,13 @@ def make_holereacher_env(rank, seed=0): """ def _init(): - _env = hr.HoleReacher(n_links=5, - allow_self_collision=False, - allow_wall_collision=False, - hole_width=0.25, - hole_depth=1, - hole_x=2, - collision_penalty=100) + _env = hr.HoleReacherEnv(n_links=5, + allow_self_collision=False, + allow_wall_collision=False, + hole_width=0.25, + hole_depth=1, + hole_x=2, + collision_penalty=100) _env = DmpWrapper(_env, num_dof=5, @@ -89,13 +89,13 @@ def make_holereacher_fix_goal_env(rank, seed=0): """ def _init(): - _env = hr.HoleReacher(n_links=5, - allow_self_collision=False, - allow_wall_collision=False, - hole_width=0.15, - hole_depth=1, - hole_x=1, - collision_penalty=100) + _env = hr.HoleReacherEnv(n_links=5, + allow_self_collision=False, + allow_wall_collision=False, + hole_width=0.15, + hole_depth=1, + hole_x=1, + collision_penalty=100) _env = DmpWrapper(_env, num_dof=5, @@ -129,27 +129,16 @@ def make_holereacher_env_pmp(rank, seed=0): """ def _init(): - _env = hr.HoleReacher(n_links=5, - allow_self_collision=False, - allow_wall_collision=False, - hole_width=0.15, - hole_depth=1, - hole_x=1, - collision_penalty=1000) + _env = hr.HoleReacherEnv(n_links=5, + allow_self_collision=False, + allow_wall_collision=False, + hole_width=0.15, + hole_depth=1, + hole_x=1, + collision_penalty=1000) - _env = DetPMPWrapper(_env, - num_dof=5, - num_basis=5, - width=0.02, - policy_type="velocity", - start_pos=_env._start_pos, - duration=2, - post_traj_time=0, - dt=_env.dt, - weights_scale=0.2, - zero_start=True, - zero_goal=False - ) + _env = DetPMPWrapper(_env, num_dof=5, num_basis=5, width=0.02, duration=2, dt=_env.dt, post_traj_time=0, + policy_type="velocity", weights_scale=0.2, zero_start=True, zero_goal=False) _env.seed(seed + rank) return _env diff --git 
a/alr_envs/utils/mps/detpmp_wrapper.py b/alr_envs/utils/mps/detpmp_wrapper.py index 63de98b..3f6d1ee 100644 --- a/alr_envs/utils/mps/detpmp_wrapper.py +++ b/alr_envs/utils/mps/detpmp_wrapper.py @@ -7,22 +7,22 @@ from alr_envs.utils.mps.mp_wrapper import MPWrapper class DetPMPWrapper(MPWrapper): - def __init__(self, env: MPEnv, num_dof: int, num_basis: int, width: int, start_pos=None, duration: int = 1, - dt: float = 0.01, post_traj_time: float = 0., policy_type: str = None, weights_scale: float = 1., + def __init__(self, env: MPEnv, num_dof: int, num_basis: int, width: int, duration: int = 1, dt: float = 0.01, + post_traj_time: float = 0., policy_type: str = None, weights_scale: float = 1., zero_start: bool = False, zero_goal: bool = False, **mp_kwargs): - # self.duration = duration # seconds + self.duration = duration # seconds super().__init__(env, num_dof, dt, duration, post_traj_time, policy_type, weights_scale, num_basis=num_basis, - width=width, start_pos=start_pos, zero_start=zero_start, zero_goal=zero_goal, **mp_kwargs) + width=width, zero_start=zero_start, zero_goal=zero_goal, **mp_kwargs) + + self.dt = dt action_bounds = np.inf * np.ones((self.mp.n_basis * self.mp.n_dof)) self.action_space = gym.spaces.Box(low=-action_bounds, high=action_bounds, dtype=np.float32) - self.start_pos = start_pos - self.dt = dt def initialize_mp(self, num_dof: int, duration: int, dt: float, num_basis: int = 5, width: float = None, - start_pos: np.ndarray = None, zero_start: bool = False, zero_goal: bool = False): + zero_start: bool = False, zero_goal: bool = False): pmp = det_promp.DeterministicProMP(n_basis=num_basis, n_dof=num_dof, width=width, off=0.01, zero_start=zero_start, zero_goal=zero_goal) diff --git a/alr_envs/utils/mps/dmp_wrapper.py b/alr_envs/utils/mps/dmp_wrapper.py index e42205e..6bc5bc6 100644 --- a/alr_envs/utils/mps/dmp_wrapper.py +++ b/alr_envs/utils/mps/dmp_wrapper.py @@ -63,7 +63,7 @@ class DmpWrapper(MPWrapper): goal_pos = params[0, -self.mp.num_dimensions:] # [num_dof] params = params[:, :-self.mp.num_dimensions] # [1,num_dof] else: - goal_pos = self.env.goal_pos # self.mp.dmp_goal_pos.flatten() + goal_pos = self.env.goal_pos assert goal_pos is not None weight_matrix = np.reshape(params, self.mp.dmp_weights.shape) # [num_basis, num_dof] diff --git a/alr_envs/utils/mps/mp_environments.py b/alr_envs/utils/mps/mp_environments.py index f720f2f..948e4ba 100644 --- a/alr_envs/utils/mps/mp_environments.py +++ b/alr_envs/utils/mps/mp_environments.py @@ -9,7 +9,7 @@ class MPEnv(gym.Env): @property @abstractmethod - def corrected_obs_index(self): + def active_obs(self): """Returns boolean value for each observation entry whether the observation is returned by the DMP for the contextual case or not. This effectively allows to filter unwanted or unnecessary observations from the full step-based case. 
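The renamed active_obs property is the hook the episodic wrappers rely on: it marks which entries of the full step-based observation still carry task (context) information, and everything else is filtered out, as the mp_wrapper.py hunk below shows. A small worked example of the mask, using the fixed-geometry HoleReacher-v2 configuration registered earlier in this patch; the snippet is an illustrative sketch and not part of the series.

import numpy as np

from alr_envs.classic_control.hole_reacher import HoleReacherEnv

# HoleReacher-v2 settings: fixed start posture and fixed hole geometry, so only the
# x-y distance between end effector and goal remains an active observation.
env = HoleReacherEnv(n_links=5, random_start=False, allow_self_collision=False,
                     allow_wall_collision=False, hole_width=0.25, hole_depth=1,
                     hole_x=2, collision_penalty=100)

obs = env.reset()
mask = np.asarray(env.active_obs, dtype=bool)

print(obs.shape)   # (20,): 3 * n_links joint features + hole width/depth + goal distance + step counter
print(mask.sum())  # 2: everything except the goal distance is constant for this configuration
print(obs[mask])   # the reduced observation an episodic (DMP) policy is conditioned on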
diff --git a/alr_envs/utils/mps/mp_wrapper.py b/alr_envs/utils/mps/mp_wrapper.py index 621de00..4c92806 100644 --- a/alr_envs/utils/mps/mp_wrapper.py +++ b/alr_envs/utils/mps/mp_wrapper.py @@ -13,6 +13,12 @@ class MPWrapper(gym.Wrapper, ABC): policy_type: str = None, weights_scale: float = 1., render_mode: str = None, **mp_kwargs): super().__init__(env) + # adjust observation space to reduce version + obs_sp = self.env.observation_space + self.observation_space = gym.spaces.Box(low=obs_sp.low[self.env.active_obs], + high=obs_sp.high[self.env.active_obs], + dtype=obs_sp.dtype) + assert dt is not None # this should never happen as MPWrapper is a base class self.post_traj_steps = int(post_traj_time / dt) @@ -51,8 +57,7 @@ class MPWrapper(gym.Wrapper, ABC): self.env.configure(context) def reset(self): - obs = self.env.reset() - return obs[self.env] + return self.env.reset()[self.env.active_obs] def step(self, action: np.ndarray): """ This function generates a trajectory based on a DMP and then does the usual loop over reset and step""" @@ -82,7 +87,7 @@ class MPWrapper(gym.Wrapper, ABC): break done = True - return obs, rewards, done, info + return obs[self.env.active_obs], rewards, done, info def render(self, mode='human', **kwargs): """Only set render options here, such that they can be used during the rollout. diff --git a/example.py b/example.py index 2d32ad8..166f38f 100644 --- a/example.py +++ b/example.py @@ -46,7 +46,7 @@ def example_dmp(): obs = env.reset() -def example_async(n_cpu=4, seed=int('533D', 16)): +def example_async(env_id="alr_envs:HoleReacherDMP-v0", n_cpu=4, seed=int('533D', 16)): def make_env(env_id, seed, rank): env = gym.make(env_id) env.seed(seed + rank) @@ -73,7 +73,7 @@ def example_async(n_cpu=4, seed=int('533D', 16)): # do not return values above threshold return (*map(lambda v: np.stack(v)[:n_samples], vals.values()),) - envs = gym.vector.AsyncVectorEnv([make_env("alr_envs:HoleReacherDMP-v0", seed, i) for i in range(n_cpu)]) + envs = gym.vector.AsyncVectorEnv([make_env(env_id, seed, i) for i in range(n_cpu)]) obs = envs.reset() print(sample(envs, 16)) @@ -82,7 +82,6 @@ def example_async(n_cpu=4, seed=int('533D', 16)): if __name__ == '__main__': # example_mujoco() # example_dmp() - # example_async() + example_async("alr_envs:LongSimpleReacherDMP-v0", 4) # env = gym.make("alr_envs:HoleReacherDMP-v0", context=0.1) - env = gym.make("alr_envs:SimpleReacherDMP-v1") - print() \ No newline at end of file + # env = gym.make("alr_envs:HoleReacherDMP-v1") From 7f512068c96c3e075b720e9a3a193028d9c5e211 Mon Sep 17 00:00:00 2001 From: Maximilian Huettenrauch Date: Mon, 17 May 2021 09:32:51 +0200 Subject: [PATCH 05/11] context wip --- .../episodic_simple_reacher.py | 2 +- alr_envs/classic_control/hole_reacher.py | 11 +++++++++-- alr_envs/classic_control/simple_reacher.py | 19 +++++++++++-------- alr_envs/utils/mp_env_async_sampler.py | 9 ++++++--- alr_envs/utils/wrapper/dmp_wrapper.py | 2 +- alr_envs/utils/wrapper/mp_wrapper.py | 3 ++- example.py | 4 ++-- 7 files changed, 32 insertions(+), 18 deletions(-) diff --git a/alr_envs/classic_control/episodic_simple_reacher.py b/alr_envs/classic_control/episodic_simple_reacher.py index b02efe8..f6d828e 100644 --- a/alr_envs/classic_control/episodic_simple_reacher.py +++ b/alr_envs/classic_control/episodic_simple_reacher.py @@ -26,7 +26,7 @@ class EpisodicSimpleReacherEnv(SimpleReacherEnv): self.observation_space = spaces.Box(low=-state_bound, high=state_bound, shape=state_bound.shape) @property - def start_pos(self): + def 
init_qpos(self): return self._start_pos # @property diff --git a/alr_envs/classic_control/hole_reacher.py b/alr_envs/classic_control/hole_reacher.py index 3b382f9..3ddc360 100644 --- a/alr_envs/classic_control/hole_reacher.py +++ b/alr_envs/classic_control/hole_reacher.py @@ -62,6 +62,10 @@ class HoleReacher(gym.Env): self.patches = [rect_1, rect_2, rect_3] + @property + def init_qpos(self): + return self.start_pos + @property def end_effector(self): return self._joints[self.n_links].T @@ -82,7 +86,7 @@ class HoleReacher(gym.Env): """ a single step with an action in joint velocity space """ - vel = action # + 0.01 * np.random.randn(self.num_links) + vel = action # + 0.05 * np.random.randn(self.n_links) acc = (vel - self._angle_velocity) / self.dt self._angle_velocity = vel self._joint_angles = self._joint_angles + self.dt * self._angle_velocity @@ -237,7 +241,10 @@ class HoleReacher(gym.Env): if self._steps == 1: # fig, ax = plt.subplots() # Add the patch to the Axes - [plt.gca().add_patch(rect) for rect in self.patches] + try: + [plt.gca().add_patch(rect) for rect in self.patches] + except RuntimeError: + pass # plt.pause(0.01) if self._steps % 20 == 0 or self._steps in [1, 199] or self._is_collided: diff --git a/alr_envs/classic_control/simple_reacher.py b/alr_envs/classic_control/simple_reacher.py index 7ca4ead..3f4a2a9 100644 --- a/alr_envs/classic_control/simple_reacher.py +++ b/alr_envs/classic_control/simple_reacher.py @@ -22,7 +22,7 @@ class SimpleReacherEnv(gym.Env): super().__init__() self.link_lengths = np.ones(n_links) self.n_links = n_links - self.dt = 0.1 + self.dt = 0.01 self.random_start = random_start @@ -56,10 +56,13 @@ class SimpleReacherEnv(gym.Env): def step(self, action: np.ndarray): # action = self._add_action_noise(action) - action = np.clip(action, -self.max_torque, self.max_torque) + # action = np.clip(action, -self.max_torque, self.max_torque) + vel = action - self._angle_velocity = self._angle_velocity + self.dt * action - self._joint_angle = angle_normalize(self._joint_angle + self.dt * self._angle_velocity) + # self._angle_velocity = self._angle_velocity + self.dt * action + # self._joint_angle = angle_normalize(self._joint_angle + self.dt * self._angle_velocity) + self._angle_velocity = vel + self._joint_angle = self._joint_angle + self.dt * self._angle_velocity self._update_joints() self._steps += 1 @@ -111,7 +114,7 @@ class SimpleReacherEnv(gym.Env): # reward_dist = np.exp(-0.1 * diff ** 2).mean() # reward_dist = - (diff ** 2).mean() - reward_ctrl = (action ** 2).sum() + reward_ctrl = 1e-5 * (action ** 2).sum() reward = reward_dist - reward_ctrl return reward, dict(reward_dist=reward_dist, reward_ctrl=reward_ctrl) @@ -139,7 +142,7 @@ class SimpleReacherEnv(gym.Env): # Sample uniformly in circle with radius R around center of reacher. R = np.sum(self.link_lengths) r = R * np.sqrt(self.np_random.uniform()) - theta = self.np_random.uniform() * 2 * np.pi + theta = np.pi/2 + 0.001 * np.random.randn() # self.np_random.uniform() * 2 * np.pi return center + r * np.stack([np.cos(theta), np.sin(theta)]) def seed(self, seed=None): @@ -170,8 +173,8 @@ class SimpleReacherEnv(gym.Env): plt.xlim([-lim, lim]) plt.ylim([-lim, lim]) # plt.draw() - # plt.pause(1e-4) pushes window to foreground, which is annoying. - self.fig.canvas.flush_events() + plt.pause(1e-4) # pushes window to foreground, which is annoying. 
+ # self.fig.canvas.flush_events() def close(self): del self.fig diff --git a/alr_envs/utils/mp_env_async_sampler.py b/alr_envs/utils/mp_env_async_sampler.py index 59cf594..2fb3645 100644 --- a/alr_envs/utils/mp_env_async_sampler.py +++ b/alr_envs/utils/mp_env_async_sampler.py @@ -56,6 +56,7 @@ class AlrMpEnvSampler: vals = defaultdict(list) for p in split_params: + self.env.reset() obs, reward, done, info = self.env.step(p) vals['obs'].append(obs) vals['reward'].append(reward) @@ -82,8 +83,9 @@ class AlrContextualMpEnvSampler: vals = defaultdict(list) for i in range(repeat): new_contexts = self.env.reset() - - new_samples = dist.sample(new_contexts) + vals['new_contexts'].append(new_contexts) + new_samples, new_contexts = dist.sample(new_contexts) + vals['new_samples'].append(new_samples) obs, reward, done, info = self.env.step(new_samples) vals['obs'].append(obs) @@ -92,7 +94,8 @@ class AlrContextualMpEnvSampler: vals['info'].append(info) # do not return values above threshold - return np.vstack(vals['obs'])[:n_samples], np.hstack(vals['reward'])[:n_samples],\ + return np.vstack(vals['new_samples'])[:n_samples], np.vstack(vals['new_contexts'])[:n_samples], \ + np.vstack(vals['obs'])[:n_samples], np.hstack(vals['reward'])[:n_samples], \ _flatten_list(vals['done'])[:n_samples], _flatten_list(vals['info'])[:n_samples] diff --git a/alr_envs/utils/wrapper/dmp_wrapper.py b/alr_envs/utils/wrapper/dmp_wrapper.py index 2a198db..36a8c92 100644 --- a/alr_envs/utils/wrapper/dmp_wrapper.py +++ b/alr_envs/utils/wrapper/dmp_wrapper.py @@ -98,7 +98,7 @@ class DmpWrapper(MPWrapper): def mp_rollout(self, action): # if self.mp.start_pos is None: - self.mp.dmp_start_pos = self.env.init_qpos # start_pos + self.mp.dmp_start_pos = self.env.init_qpos.reshape((1, self.num_dof)) # start_pos goal_pos, weight_matrix = self.goal_and_weights(action) self.mp.set_weights(weight_matrix, goal_pos) return self.mp.reference_trajectory(self.t) diff --git a/alr_envs/utils/wrapper/mp_wrapper.py b/alr_envs/utils/wrapper/mp_wrapper.py index adeba55..43f127c 100644 --- a/alr_envs/utils/wrapper/mp_wrapper.py +++ b/alr_envs/utils/wrapper/mp_wrapper.py @@ -22,7 +22,7 @@ class MPWrapper(gym.Wrapper, ABC): ): super().__init__(env) - # self.num_dof = num_dof + self.num_dof = num_dof # self.num_basis = num_basis # self.duration = duration # seconds @@ -50,6 +50,7 @@ class MPWrapper(gym.Wrapper, ABC): # for p, c in zip(params, contexts): for p in params: # self.configure(c) + # context = self.reset() ob, reward, done, info = self.step(p) obs.append(ob) rewards.append(reward) diff --git a/example.py b/example.py index 2d32ad8..1718e46 100644 --- a/example.py +++ b/example.py @@ -83,6 +83,6 @@ if __name__ == '__main__': # example_mujoco() # example_dmp() # example_async() - # env = gym.make("alr_envs:HoleReacherDMP-v0", context=0.1) - env = gym.make("alr_envs:SimpleReacherDMP-v1") + env = gym.make("alr_envs:HoleReacherDMP-v0") + # env = gym.make("alr_envs:SimpleReacherDMP-v1") print() \ No newline at end of file From 14c60766c24ce5170a9c267253059e5896850346 Mon Sep 17 00:00:00 2001 From: ottofabian Date: Mon, 17 May 2021 17:58:33 +0200 Subject: [PATCH 06/11] fixed open issues --- alr_envs/__init__.py | 96 +++++++------------- alr_envs/classic_control/simple_reacher.py | 6 -- alr_envs/classic_control/viapoint_reacher.py | 25 ++++- alr_envs/utils/mp_env_async_sampler.py | 2 +- 4 files changed, 56 insertions(+), 73 deletions(-) diff --git a/alr_envs/__init__.py b/alr_envs/__init__.py index 17d5541..0007982 100644 --- a/alr_envs/__init__.py 
+++ b/alr_envs/__init__.py @@ -214,16 +214,17 @@ register( ) # MP environments -reacher_envs = ["SimpleReacher-v0", "SimpleReacher-v1", "LongSimpleReacher-v0", "LongSimpleReacher-v1"] -for env in reacher_envs: - name = env.split("-") +## Simple Reacher +versions = ["SimpleReacher-v0", "SimpleReacher-v1", "LongSimpleReacher-v0", "LongSimpleReacher-v1"] +for v in versions: + name = v.split("-") register( id=f'{name[0]}DMP-{name[1]}', entry_point='alr_envs.utils.make_env_helpers:make_dmp_env', # max_episode_steps=1, kwargs={ - "name": f"alr_envs:{env}", - "num_dof": 2 if "long" not in env.lower() else 5 , + "name": f"alr_envs:{v}", + "num_dof": 2 if "long" not in v.lower() else 5, "num_basis": 5, "duration": 2, "alpha_phase": 2, @@ -249,59 +250,33 @@ register( } ) -register( - id='HoleReacherDMP-v0', - entry_point='alr_envs.utils.make_env_helpers:make_dmp_env', - # max_episode_steps=1, - kwargs={ - "name": "alr_envs:HoleReacher-v0", - "num_dof": 5, - "num_basis": 5, - "duration": 2, - "learn_goal": True, - "alpha_phase": 2, - "bandwidth_factor": 2, - "policy_type": "velocity", - "weights_scale": 50, - "goal_scale": 0.1 - } -) +## Hole Reacher +versions = ["v0", "v1", "v2"] +for v in versions: + register( + id=f'HoleReacherDMP-{v}', + entry_point='alr_envs.utils.make_env_helpers:make_dmp_env', + # max_episode_steps=1, + kwargs={ + "name": f"alr_envs:HoleReacher-{v}", + "num_dof": 5, + "num_basis": 5, + "duration": 2, + "learn_goal": True, + "alpha_phase": 2, + "bandwidth_factor": 2, + "policy_type": "velocity", + "weights_scale": 50, + "goal_scale": 0.1 + } + ) -register( - id='HoleReacherDMP-v1', - entry_point='alr_envs.utils.make_env_helpers:make_dmp_env', - # max_episode_steps=1, - kwargs={ - "name": "alr_envs:HoleReacher-v1", - "num_dof": 5, - "num_basis": 5, - "duration": 2, - "learn_goal": True, - "alpha_phase": 2, - "bandwidth_factor": 2, - "policy_type": "velocity", - "weights_scale": 50, - "goal_scale": 0.1 - } -) - -register( - id='HoleReacherDMP-v2', - entry_point='alr_envs.utils.make_env_helpers:make_dmp_env', - # max_episode_steps=1, - kwargs={ - "name": "alr_envs:HoleReacher-v2", - "num_dof": 5, - "num_basis": 5, - "duration": 2, - "learn_goal": True, - "alpha_phase": 2, - "bandwidth_factor": 2, - "policy_type": "velocity", - "weights_scale": 50, - "goal_scale": 0.1 - } -) +# register( +# id='HoleReacherDetPMP-v0', +# entry_point='alr_envs.classic_control.hole_reacher:holereacher_detpmp', +# # max_episode_steps=1, +# # TODO: add mp kwargs +# ) # TODO: properly add final_pos register( @@ -321,12 +296,7 @@ register( } ) -# register( -# id='HoleReacherDetPMP-v0', -# entry_point='alr_envs.classic_control.hole_reacher:holereacher_detpmp', -# # max_episode_steps=1, -# # TODO: add mp kwargs -# ) +## Ball in Cup register( id='ALRBallInACupSimpleDMP-v0', diff --git a/alr_envs/classic_control/simple_reacher.py b/alr_envs/classic_control/simple_reacher.py index 4e99ff1..425134d 100644 --- a/alr_envs/classic_control/simple_reacher.py +++ b/alr_envs/classic_control/simple_reacher.py @@ -127,12 +127,6 @@ class SimpleReacherEnv(MPEnv): def _generate_goal(self): if self._target is None: - # center = self._joints[0] - # # Sample uniformly in circle with radius R around center of reacher. 
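Since the DMP variants are now generated in a loop, a quick check that the derived IDs actually landed in gym's registry can catch typos in the id template early. A sketch, assuming the classic gym registry layout with an env_specs dict (this attribute differs in later gym releases):

    import gym
    import alr_envs  # noqa: F401  (importing the package runs the register() calls)

    versions = ["SimpleReacher-v0", "SimpleReacher-v1", "LongSimpleReacher-v0", "LongSimpleReacher-v1"]
    expected = [f"{v.split('-')[0]}DMP-{v.split('-')[1]}" for v in versions]

    registered = set(gym.envs.registry.env_specs)  # id -> EnvSpec mapping in classic gym
    missing = [env_id for env_id in expected if env_id not in registered]
    assert not missing, f"not registered: {missing}"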
- # R = np.sum(self.link_lengths) - # r = R * np.sqrt(self.np_random.uniform()) - # theta = self.np_random.uniform() * 2 * np.pi - # goal = center + r * np.stack([np.cos(theta), np.sin(theta)]) total_length = np.sum(self.link_lengths) goal = np.array([total_length, total_length]) diff --git a/alr_envs/classic_control/viapoint_reacher.py b/alr_envs/classic_control/viapoint_reacher.py index ac36360..2897f31 100644 --- a/alr_envs/classic_control/viapoint_reacher.py +++ b/alr_envs/classic_control/viapoint_reacher.py @@ -99,9 +99,28 @@ class ViaPointReacher(MPEnv): return self._get_obs().copy() def _generate_goal(self): - self._via_point = self.np_random.uniform(0.5, 3.5, 2) if self._via_target is None else np.copy(self._via_target) - self._goal = self.np_random.uniform(0.5, 0.1, 2) if self._target is None else np.copy(self._target) - # raise NotImplementedError("How to properly sample points??") + # TODO: Maybe improve this later, this can yield quite a lot of invalid settings + + total_length = np.sum(self.link_lengths) + + # rejection sampled point in inner circle with 0.5*Radius + if self._via_target is None: + via_target = np.array([total_length, total_length]) + while np.linalg.norm(via_target) >= 0.5 * total_length: + via_target = self.np_random.uniform(low=-0.5 * total_length, high=0.5 * total_length, size=2) + else: + via_target = np.copy(self._via_target) + + # rejection sampled point in outer circle + if self._target is None: + goal = np.array([total_length, total_length]) + while np.linalg.norm(goal) >= total_length or np.linalg.norm(goal) <= 0.5 * total_length: + goal = self.np_random.uniform(low=-total_length, high=total_length, size=2) + else: + goal = np.copy(self._target) + + self._via_target = via_target + self._goal = goal def _update_joints(self): """ diff --git a/alr_envs/utils/mp_env_async_sampler.py b/alr_envs/utils/mp_env_async_sampler.py index 2fb3645..e935ba6 100644 --- a/alr_envs/utils/mp_env_async_sampler.py +++ b/alr_envs/utils/mp_env_async_sampler.py @@ -70,7 +70,7 @@ class AlrMpEnvSampler: class AlrContextualMpEnvSampler: """ - An asynchronous sampler for non contextual MPWrapper environments. A sampler object can be called with a set of + An asynchronous sampler for contextual MPWrapper environments. A sampler object can be called with a set of parameters and returns the corresponding final obs, rewards, dones and info dicts. 
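The rejection loops above are perfectly fine in two dimensions (the acceptance rates are roughly 79 % for the inner disc and 59 % for the outer ring), but the same distributions can also be drawn directly in polar coordinates without looping. A sketch of that alternative, not what the code above does, just an equivalent sampler under the same geometry:

    import numpy as np

    def sample_annulus(rng, r_min, r_max):
        """Uniform-by-area point with r_min <= ||x|| <= r_max, drawn without rejection."""
        theta = rng.uniform(0.0, 2.0 * np.pi)
        r = np.sqrt(rng.uniform(r_min ** 2, r_max ** 2))  # sqrt keeps the density uniform in area
        return r * np.array([np.cos(theta), np.sin(theta)])

    rng = np.random.default_rng(0)
    total_length = 5.0                                             # np.sum(self.link_lengths) for 5 unit links
    via_target = sample_annulus(rng, 0.0, 0.5 * total_length)      # inner disc
    goal = sample_annulus(rng, 0.5 * total_length, total_length)   # outer ring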
""" def __init__(self, env_id, num_envs, seed=0, **env_kwargs): From 528b7521b63ce8493b0fdc10d2056eb7f32b51a3 Mon Sep 17 00:00:00 2001 From: ottofabian Date: Mon, 17 May 2021 17:59:28 +0200 Subject: [PATCH 07/11] removed legacy wrappers --- alr_envs/utils/legacy/__init__.py | 0 alr_envs/utils/legacy/detpmp_env_wrapper.py | 88 --------- alr_envs/utils/legacy/dmp_async_vec_env.py | 173 ------------------ alr_envs/utils/legacy/dmp_env_wrapper.py | 125 ------------- .../utils/legacy/dmp_env_wrapper_example.py | 28 --- .../utils/legacy/dmp_pd_control_example.py | 28 --- alr_envs/utils/legacy/utils.py | 145 --------------- 7 files changed, 587 deletions(-) delete mode 100644 alr_envs/utils/legacy/__init__.py delete mode 100644 alr_envs/utils/legacy/detpmp_env_wrapper.py delete mode 100644 alr_envs/utils/legacy/dmp_async_vec_env.py delete mode 100644 alr_envs/utils/legacy/dmp_env_wrapper.py delete mode 100644 alr_envs/utils/legacy/dmp_env_wrapper_example.py delete mode 100644 alr_envs/utils/legacy/dmp_pd_control_example.py delete mode 100644 alr_envs/utils/legacy/utils.py diff --git a/alr_envs/utils/legacy/__init__.py b/alr_envs/utils/legacy/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/alr_envs/utils/legacy/detpmp_env_wrapper.py b/alr_envs/utils/legacy/detpmp_env_wrapper.py deleted file mode 100644 index c667abf..0000000 --- a/alr_envs/utils/legacy/detpmp_env_wrapper.py +++ /dev/null @@ -1,88 +0,0 @@ -from alr_envs.utils.policies import get_policy_class -from mp_lib import det_promp -import numpy as np -import gym - - -class DetPMPEnvWrapper(gym.Wrapper): - def __init__(self, - env, - num_dof, - num_basis, - width, - off=0.01, - start_pos=None, - duration=1, - dt=0.01, - post_traj_time=0., - policy_type=None, - weights_scale=1, - zero_start=False, - zero_goal=False, - ): - super(DetPMPEnvWrapper, self).__init__(env) - self.num_dof = num_dof - self.num_basis = num_basis - self.dim = num_dof * num_basis - self.pmp = det_promp.DeterministicProMP(n_basis=num_basis, n_dof=num_dof, width=width, off=off, - zero_start=zero_start, zero_goal=zero_goal) - weights = np.zeros(shape=(num_basis, num_dof)) - self.pmp.set_weights(duration, weights) - self.weights_scale = weights_scale - - self.duration = duration - self.dt = dt - self.post_traj_steps = int(post_traj_time / dt) - - self.start_pos = start_pos - self.zero_start = zero_start - - policy_class = get_policy_class(policy_type) - self.policy = policy_class(env) - - def __call__(self, params, contexts=None): - params = np.atleast_2d(params) - rewards = [] - infos = [] - for p, c in zip(params, contexts): - reward, info = self.rollout(p, c) - rewards.append(reward) - infos.append(info) - - return np.array(rewards), infos - - def rollout(self, params, context=None, render=False): - """ This function generates a trajectory based on a DMP and then does the usual loop over reset and step""" - params = np.reshape(params, newshape=(self.num_basis, self.num_dof)) * self.weights_scale - self.pmp.set_weights(self.duration, params) - t, des_pos, des_vel, des_acc = self.pmp.compute_trajectory(1 / self.dt, 1.) 
- if self.zero_start: - des_pos += self.start_pos[None, :] - - if self.post_traj_steps > 0: - des_pos = np.vstack([des_pos, np.tile(des_pos[-1, :], [self.post_traj_steps, 1])]) - des_vel = np.vstack([des_vel, np.zeros(shape=(self.post_traj_steps, self.num_dof))]) - - self._trajectory = des_pos - self._velocity = des_vel - - rews = [] - infos = [] - - self.env.configure(context) - self.env.reset() - - for t, pos_vel in enumerate(zip(des_pos, des_vel)): - ac = self.policy.get_action(pos_vel[0], pos_vel[1]) - obs, rew, done, info = self.env.step(ac) - rews.append(rew) - infos.append(info) - if render: - self.env.render(mode="human") - if done: - break - - reward = np.sum(rews) - - return reward, info - diff --git a/alr_envs/utils/legacy/dmp_async_vec_env.py b/alr_envs/utils/legacy/dmp_async_vec_env.py deleted file mode 100644 index 641e770..0000000 --- a/alr_envs/utils/legacy/dmp_async_vec_env.py +++ /dev/null @@ -1,173 +0,0 @@ -import gym -from gym.error import (AlreadyPendingCallError, NoAsyncCallError) -from gym.vector.utils import concatenate, create_empty_array -from gym.vector.async_vector_env import AsyncState -import numpy as np -import multiprocessing as mp -import sys - - -def _worker(index, env_fn, pipe, parent_pipe, shared_memory, error_queue): - assert shared_memory is None - env = env_fn() - parent_pipe.close() - try: - while True: - command, data = pipe.recv() - if command == 'reset': - observation = env.reset() - pipe.send((observation, True)) - elif command == 'step': - observation, reward, done, info = env.step(data) - if done: - observation = env.reset() - pipe.send(((observation, reward, done, info), True)) - elif command == 'rollout': - rewards = [] - infos = [] - for p, c in zip(*data): - reward, info = env.rollout(p, c) - rewards.append(reward) - infos.append(info) - pipe.send(((rewards, infos), (True, ) * len(rewards))) - elif command == 'seed': - env.seed(data) - pipe.send((None, True)) - elif command == 'close': - env.close() - pipe.send((None, True)) - break - elif command == 'idle': - pipe.send((None, True)) - elif command == '_check_observation_space': - pipe.send((data == env.observation_space, True)) - else: - raise RuntimeError('Received unknown command `{0}`. Must ' - 'be one of {`reset`, `step`, `seed`, `close`, ' - '`_check_observation_space`}.'.format(command)) - except (KeyboardInterrupt, Exception): - error_queue.put((index,) + sys.exc_info()[:2]) - pipe.send((None, False)) - finally: - env.close() - - -class DmpAsyncVectorEnv(gym.vector.AsyncVectorEnv): - def __init__(self, env_fns, n_samples, observation_space=None, action_space=None, - shared_memory=False, copy=True, context="spawn", daemon=True, worker=_worker): - super(DmpAsyncVectorEnv, self).__init__(env_fns, - observation_space=observation_space, - action_space=action_space, - shared_memory=shared_memory, - copy=copy, - context=context, - daemon=daemon, - worker=worker) - - # we need to overwrite the number of samples as we may sample more than num_envs - self.observations = create_empty_array(self.single_observation_space, - n=n_samples, - fn=np.zeros) - - def __call__(self, params, contexts=None): - return self.rollout(params, contexts) - - def rollout_async(self, params, contexts): - """ - Parameters - ---------- - params : iterable of samples from `action_space` - List of actions. 
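The removed DmpAsyncVectorEnv was driven through its __call__, which splits a batch of parameter vectors across the worker pipes via the 'rollout' command. A sketch of that (now deprecated) usage, reconstructed from the commented-out calls in the deleted example scripts below; the 30-dimensional parameters correspond to the 5-DoF, 5-basis hole reacher with a learned goal:

    import numpy as np
    from alr_envs.utils.legacy.dmp_async_vec_env import DmpAsyncVectorEnv
    from alr_envs.utils.legacy.utils import make_holereacher_env

    n_cpus, n_samples, dim = 4, 16, 30
    env = DmpAsyncVectorEnv([make_holereacher_env(i) for i in range(n_cpus)], n_samples=n_samples)

    params = np.random.randn(n_samples, dim)   # one flat DMP parameter vector per rollout
    rewards, infos = env(params)               # __call__ -> rollout_async() + rollout_wait()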
- """ - self._assert_is_running() - if self._state != AsyncState.DEFAULT: - raise AlreadyPendingCallError('Calling `rollout_async` while waiting ' - 'for a pending call to `{0}` to complete.'.format( - self._state.value), self._state.value) - - params = np.atleast_2d(params) - split_params = np.array_split(params, np.minimum(len(params), self.num_envs)) - if contexts is None: - split_contexts = np.array_split([None, ] * len(params), np.minimum(len(params), self.num_envs)) - else: - split_contexts = np.array_split(contexts, np.minimum(len(contexts), self.num_envs)) - - assert np.all([len(p) == len(c) for p, c in zip(split_params, split_contexts)]) - for pipe, param, context in zip(self.parent_pipes, split_params, split_contexts): - pipe.send(('rollout', (param, context))) - for pipe in self.parent_pipes[len(split_params):]: - pipe.send(('idle', None)) - self._state = AsyncState.WAITING_ROLLOUT - - def rollout_wait(self, timeout=None): - """ - Parameters - ---------- - timeout : int or float, optional - Number of seconds before the call to `step_wait` times out. If - `None`, the call to `step_wait` never times out. - - Returns - ------- - observations : sample from `observation_space` - A batch of observations from the vectorized environment. - - rewards : `np.ndarray` instance (dtype `np.float_`) - A vector of rewards from the vectorized environment. - - dones : `np.ndarray` instance (dtype `np.bool_`) - A vector whose entries indicate whether the episode has ended. - - infos : list of dict - A list of auxiliary diagnostic information. - """ - self._assert_is_running() - if self._state != AsyncState.WAITING_ROLLOUT: - raise NoAsyncCallError('Calling `rollout_wait` without any prior call ' - 'to `rollout_async`.', AsyncState.WAITING_ROLLOUT.value) - - if not self._poll(timeout): - self._state = AsyncState.DEFAULT - raise mp.TimeoutError('The call to `rollout_wait` has timed out after ' - '{0} second{1}.'.format(timeout, 's' if timeout > 1 else '')) - - results, successes = zip(*[pipe.recv() for pipe in self.parent_pipes]) - results = [r for r in results if r is not None] - self._raise_if_errors(successes) - self._state = AsyncState.DEFAULT - - rewards, infos = [_flatten_list(r) for r in zip(*results)] - - # for now, we ignore the observations and only return the rewards - - # if not self.shared_memory: - # self.observations = concatenate(observations_list, self.observations, - # self.single_observation_space) - - # return (deepcopy(self.observations) if self.copy else self.observations, - # np.array(rewards), np.array(dones, dtype=np.bool_), infos) - - return np.array(rewards), infos - - def rollout(self, actions, contexts): - self.rollout_async(actions, contexts) - return self.rollout_wait() - - -def _flatten_obs(obs): - assert isinstance(obs, (list, tuple)) - assert len(obs) > 0 - - if isinstance(obs[0], dict): - keys = obs[0].keys() - return {k: np.stack([o[k] for o in obs]) for k in keys} - else: - return np.stack(obs) - - -def _flatten_list(l): - assert isinstance(l, (list, tuple)) - assert len(l) > 0 - assert all([len(l_) > 0 for l_ in l]) - - return [l__ for l_ in l for l__ in l_] diff --git a/alr_envs/utils/legacy/dmp_env_wrapper.py b/alr_envs/utils/legacy/dmp_env_wrapper.py deleted file mode 100644 index 6835d80..0000000 --- a/alr_envs/utils/legacy/dmp_env_wrapper.py +++ /dev/null @@ -1,125 +0,0 @@ -from alr_envs.utils.policies import get_policy_class -from mp_lib.phase import ExpDecayPhaseGenerator -from mp_lib.basis import DMPBasisGenerator -from mp_lib import dmps -import numpy 
as np -import gym - - -class DmpEnvWrapper(gym.Wrapper): - def __init__(self, - env, - num_dof, - num_basis, - start_pos=None, - final_pos=None, - duration=1, - dt=0.01, - alpha_phase=2, - bandwidth_factor=3, - learn_goal=False, - post_traj_time=0., - policy_type=None, - weights_scale=1., - goal_scale=1., - ): - super(DmpEnvWrapper, self).__init__(env) - self.num_dof = num_dof - self.num_basis = num_basis - self.dim = num_dof * num_basis - if learn_goal: - self.dim += num_dof - self.learn_goal = learn_goal - self.duration = duration # seconds - time_steps = int(duration / dt) - self.t = np.linspace(0, duration, time_steps) - self.post_traj_steps = int(post_traj_time / dt) - - phase_generator = ExpDecayPhaseGenerator(alpha_phase=alpha_phase, duration=duration) - basis_generator = DMPBasisGenerator(phase_generator, - duration=duration, - num_basis=self.num_basis, - basis_bandwidth_factor=bandwidth_factor) - - self.dmp = dmps.DMP(num_dof=num_dof, - basis_generator=basis_generator, - phase_generator=phase_generator, - num_time_steps=time_steps, - dt=dt - ) - - self.dmp.dmp_start_pos = start_pos.reshape((1, num_dof)) - - dmp_weights = np.zeros((num_basis, num_dof)) - if learn_goal: - dmp_goal_pos = np.zeros(num_dof) - else: - dmp_goal_pos = final_pos - - self.dmp.set_weights(dmp_weights, dmp_goal_pos) - self.weights_scale = weights_scale - self.goal_scale = goal_scale - - policy_class = get_policy_class(policy_type) - self.policy = policy_class(env) - - def __call__(self, params, contexts=None): - params = np.atleast_2d(params) - rewards = [] - infos = [] - for p, c in zip(params, contexts): - reward, info = self.rollout(p, c) - rewards.append(reward) - infos.append(info) - - return np.array(rewards), infos - - def goal_and_weights(self, params): - if len(params.shape) > 1: - assert params.shape[1] == self.dim - else: - assert len(params) == self.dim - params = np.reshape(params, [1, self.dim]) - - if self.learn_goal: - goal_pos = params[0, -self.num_dof:] - weight_matrix = np.reshape(params[:, :-self.num_dof], [self.num_basis, self.num_dof]) - else: - goal_pos = self.dmp.dmp_goal_pos.flatten() - assert goal_pos is not None - weight_matrix = np.reshape(params, [self.num_basis, self.num_dof]) - - return goal_pos * self.goal_scale, weight_matrix * self.weights_scale - - def rollout(self, params, context=None, render=False): - """ This function generates a trajectory based on a DMP and then does the usual loop over reset and step""" - goal_pos, weight_matrix = self.goal_and_weights(params) - self.dmp.set_weights(weight_matrix, goal_pos) - trajectory, velocity = self.dmp.reference_trajectory(self.t) - - if self.post_traj_steps > 0: - trajectory = np.vstack([trajectory, np.tile(trajectory[-1, :], [self.post_traj_steps, 1])]) - velocity = np.vstack([velocity, np.zeros(shape=(self.post_traj_steps, self.num_dof))]) - - self._trajectory = trajectory - self._velocity = velocity - - rews = [] - infos = [] - - self.env.configure(context) - self.env.reset() - - for t, pos_vel in enumerate(zip(trajectory, velocity)): - ac = self.policy.get_action(pos_vel[0], pos_vel[1]) - obs, rew, done, info = self.env.step(ac) - rews.append(rew) - infos.append(info) - if render: - self.env.render(mode="human") - if done: - break - - reward = np.sum(rews) - - return reward, info diff --git a/alr_envs/utils/legacy/dmp_env_wrapper_example.py b/alr_envs/utils/legacy/dmp_env_wrapper_example.py deleted file mode 100644 index d2edae5..0000000 --- a/alr_envs/utils/legacy/dmp_env_wrapper_example.py +++ /dev/null @@ -1,28 +0,0 @@ 
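The goal_and_weights split above is the core of the DMP parameterization and the same layout is used by the non-legacy DmpWrapper. A standalone sketch of that layout for the 5-link hole reacher (5 DoF x 5 basis functions plus 5 learned goal entries = 30 parameters); the scaling constants are the ones from the HoleReacherDMP registrations:

    import numpy as np

    num_dof, num_basis, learn_goal = 5, 5, True
    dim = num_dof * num_basis + (num_dof if learn_goal else 0)   # 30

    params = np.random.randn(dim)
    if learn_goal:
        goal_pos = params[-num_dof:]                              # last num_dof entries are the goal
        weight_matrix = params[:-num_dof].reshape(num_basis, num_dof)
    else:
        goal_pos = None                                           # the DMP's fixed goal is used instead
        weight_matrix = params.reshape(num_basis, num_dof)

    weight_matrix = weight_matrix * 50                            # weights_scale
    goal_pos = goal_pos * 0.1 if goal_pos is not None else None   # goal_scale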
-from alr_envs.utils.legacy.utils import make_holereacher_env -import numpy as np - -if __name__ == "__main__": - n_samples = 1 - n_cpus = 4 - dim = 30 - - # env = DmpAsyncVectorEnv([make_viapointreacher_env(i) for i in range(n_cpus)], - # n_samples=n_samples) - - test_env = make_holereacher_env(0)() - - # params = np.random.randn(n_samples, dim) - params = np.array([[1.386102, -3.29980525, 4.70402733, 1.3966668, 0.73774902, - 3.14676681, -4.98644416, 6.20303193, 1.30502127, -0.09330522, - 7.62656797, -5.76893033, 3.4706711, -0.6944142, -3.33442788, - 12.31421548, -0.72760271, -6.9090723, 7.02903814, -8.7236836, - 1.4805914, 0.53185824, -5.46626893, 0.69692163, 13.58472666, - 0.77199316, 2.02906724, -3.0203244, -1.00533159, -0.57417351]]) - - # params = np.hstack([50 * np.random.randn(n_samples, 25), np.tile(np.array([np.pi/2, -np.pi/4, -np.pi/4, -np.pi/4, -np.pi/4]), [n_samples, 1])]) - - rew, info = test_env.rollout(params, render=True) - print(rew) - - # out = env(params) - # print(out) diff --git a/alr_envs/utils/legacy/dmp_pd_control_example.py b/alr_envs/utils/legacy/dmp_pd_control_example.py deleted file mode 100644 index 3b713f3..0000000 --- a/alr_envs/utils/legacy/dmp_pd_control_example.py +++ /dev/null @@ -1,28 +0,0 @@ -from alr_envs.mujoco.ball_in_a_cup.utils import make_simple_dmp_env -import numpy as np - -if __name__ == "__main__": - - dim = 15 - n_cpus = 4 - - # n_samples = 10 - # - # vec_env = DmpAsyncVectorEnv([make_simple_env(i) for i in range(n_cpus)], - # n_samples=n_samples) - # - # params = np.tile(1 * np.random.randn(n_samples, dim), (10, 1)) - # - # rewards, infos = vec_env(params) - # print(rewards) - # - non_vec_env = make_simple_dmp_env(0, 0)() - - # params = 0.5 * np.random.randn(dim) - params = np.array([-2.63357598, -1.04950296, -0.44330737, 0.52950017, 4.29247739, - 4.52473661, -0.05685977, -0.76796851, 3.71540499, 1.22631059, - 2.20412438, 3.91588129, -0.12652723, -3.0788211 , 0.56204464]) - - out2 = non_vec_env.rollout(params, render=True ) - - print(out2) diff --git a/alr_envs/utils/legacy/utils.py b/alr_envs/utils/legacy/utils.py deleted file mode 100644 index e96eee6..0000000 --- a/alr_envs/utils/legacy/utils.py +++ /dev/null @@ -1,145 +0,0 @@ -import alr_envs.classic_control.hole_reacher as hr -import alr_envs.classic_control.viapoint_reacher as vpr -from alr_envs.utils.mps.dmp_wrapper import DmpWrapper -from alr_envs.utils.mps.detpmp_wrapper import DetPMPWrapper -import numpy as np - - -def make_viapointreacher_env(rank, seed=0): - """ - Utility function for multiprocessed env. - - :param env_id: (str) the environment ID - :param num_env: (int) the number of environments you wish to have in subprocesses - :param seed: (int) the initial seed for RNG - :param rank: (int) index of the subprocess - :returns a function that generates an environment - """ - - def _init(): - _env = vpr.ViaPointReacher(n_links=5, - allow_self_collision=False, - collision_penalty=1000) - - _env = DmpWrapper(_env, - num_dof=5, - num_basis=5, - duration=2, - alpha_phase=2.5, - dt=_env.dt, - start_pos=_env.start_pos, - learn_goal=False, - policy_type="velocity", - weights_scale=50) - _env.seed(seed + rank) - return _env - - return _init - - -def make_holereacher_env(rank, seed=0): - """ - Utility function for multiprocessed env. 
- - :param env_id: (str) the environment ID - :param num_env: (int) the number of environments you wish to have in subprocesses - :param seed: (int) the initial seed for RNG - :param rank: (int) index of the subprocess - :returns a function that generates an environment - """ - - def _init(): - _env = hr.HoleReacherEnv(n_links=5, - allow_self_collision=False, - allow_wall_collision=False, - hole_width=0.25, - hole_depth=1, - hole_x=2, - collision_penalty=100) - - _env = DmpWrapper(_env, - num_dof=5, - num_basis=5, - duration=2, - bandwidth_factor=2, - dt=_env.dt, - learn_goal=True, - alpha_phase=2, - start_pos=_env._start_pos, - policy_type="velocity", - weights_scale=50, - goal_scale=0.1 - ) - - _env.seed(seed + rank) - return _env - - return _init - - -def make_holereacher_fix_goal_env(rank, seed=0): - """ - Utility function for multiprocessed env. - - :param env_id: (str) the environment ID - :param num_env: (int) the number of environments you wish to have in subprocesses - :param seed: (int) the initial seed for RNG - :param rank: (int) index of the subprocess - :returns a function that generates an environment - """ - - def _init(): - _env = hr.HoleReacherEnv(n_links=5, - allow_self_collision=False, - allow_wall_collision=False, - hole_width=0.15, - hole_depth=1, - hole_x=1, - collision_penalty=100) - - _env = DmpWrapper(_env, - num_dof=5, - num_basis=5, - duration=2, - dt=_env.dt, - learn_goal=False, - final_pos=np.array([2.02669572, -1.25966385, -1.51618198, -0.80946476, 0.02012344]), - alpha_phase=2, - start_pos=_env._start_pos, - policy_type="velocity", - weights_scale=50, - goal_scale=1 - ) - - _env.seed(seed + rank) - return _env - - return _init - - -def make_holereacher_env_pmp(rank, seed=0): - """ - Utility function for multiprocessed env. - - :param env_id: (str) the environment ID - :param num_env: (int) the number of environments you wish to have in subprocesses - :param seed: (int) the initial seed for RNG - :param rank: (int) index of the subprocess - :returns a function that generates an environment - """ - - def _init(): - _env = hr.HoleReacherEnv(n_links=5, - allow_self_collision=False, - allow_wall_collision=False, - hole_width=0.15, - hole_depth=1, - hole_x=1, - collision_penalty=1000) - - _env = DetPMPWrapper(_env, num_dof=5, num_basis=5, width=0.02, duration=2, dt=_env.dt, post_traj_time=0, - policy_type="velocity", weights_scale=0.2, zero_start=True, zero_goal=False) - _env.seed(seed + rank) - return _env - - return _init From 7695cae07694915136b7051f965e9d84593378de Mon Sep 17 00:00:00 2001 From: ottofabian Date: Tue, 18 May 2021 10:39:30 +0200 Subject: [PATCH 08/11] default observation selection for MPs --- alr_envs/utils/mps/mp_environments.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/alr_envs/utils/mps/mp_environments.py b/alr_envs/utils/mps/mp_environments.py index 948e4ba..f397491 100644 --- a/alr_envs/utils/mps/mp_environments.py +++ b/alr_envs/utils/mps/mp_environments.py @@ -10,17 +10,17 @@ class MPEnv(gym.Env): @property @abstractmethod def active_obs(self): - """Returns boolean value for each observation entry - whether the observation is returned by the DMP for the contextual case or not. + """Returns boolean mask for each observation entry + whether the observation is returned for the contextual case or not. This effectively allows to filter unwanted or unnecessary observations from the full step-based case. 
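With the new default below, active_obs simply keeps every entry; contextual subclasses are expected to override it with a boolean mask so that only context-relevant entries survive. A sketch of how such a mask would be applied on the wrapper side (the filtering call itself is an assumption here, only the property is part of this patch):

    import numpy as np

    full_obs = np.array([0.54, 0.84, 0.1, -0.3, 1.2, 0.7, 0.0])     # e.g. cos, sin, velocities, target
    mask = np.array([True, True, False, False, False, True, True])  # a subclass override of active_obs

    context_obs = full_obs[mask]  # what a contextual MP wrapper would expose instead of the full state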
""" - raise NotImplementedError() + return np.ones(self.observation_space.shape, dtype=bool) @property @abstractmethod def start_pos(self) -> Union[float, int, np.ndarray]: """ - Returns the current position of the joints + Returns the starting position of the joints """ raise NotImplementedError() From e7dcdf38f1a65abccf8ec48355663d97217b8995 Mon Sep 17 00:00:00 2001 From: ottofabian Date: Tue, 18 May 2021 10:49:08 +0200 Subject: [PATCH 09/11] removed hole_reacher_v2.py --- alr_envs/classic_control/hole_reacher_v2.py | 307 -------------------- 1 file changed, 307 deletions(-) delete mode 100644 alr_envs/classic_control/hole_reacher_v2.py diff --git a/alr_envs/classic_control/hole_reacher_v2.py b/alr_envs/classic_control/hole_reacher_v2.py deleted file mode 100644 index 028ab26..0000000 --- a/alr_envs/classic_control/hole_reacher_v2.py +++ /dev/null @@ -1,307 +0,0 @@ -import gym -import numpy as np -import matplotlib.pyplot as plt -from matplotlib import patches -from alr_envs.classic_control.utils import check_self_collision - - -class HoleReacher(gym.Env): - - def __init__(self, n_links, hole_x, hole_width, hole_depth, allow_self_collision=False, - allow_wall_collision=False, collision_penalty=1000): - - self.n_links = n_links - self.link_lengths = np.ones((n_links, 1)) - - # task - self.hole_x = hole_x # x-position of center of hole - self.hole_width = hole_width # width of hole - self.hole_depth = hole_depth # depth of hole - - self.bottom_center_of_hole = np.hstack([hole_x, -hole_depth]) - self.top_center_of_hole = np.hstack([hole_x, 0]) - self.left_wall_edge = np.hstack([hole_x - self.hole_width / 2, 0]) - self.right_wall_edge = np.hstack([hole_x + self.hole_width / 2, 0]) - - # collision - self.allow_self_collision = allow_self_collision - self.allow_wall_collision = allow_wall_collision - self.collision_penalty = collision_penalty - - # state - self._joints = None - self._joint_angles = None - self._angle_velocity = None - self.start_pos = np.hstack([[np.pi / 2], np.zeros(self.n_links - 1)]) - self.start_vel = np.zeros(self.n_links) - - self.dt = 0.01 - # self.time_limit = 2 - - action_bound = np.pi * np.ones((self.n_links,)) - state_bound = np.hstack([ - [np.pi] * self.n_links, # cos - [np.pi] * self.n_links, # sin - [np.inf] * self.n_links, # velocity - [np.inf] * 2, # x-y coordinates of target distance - [np.inf] # env steps, because reward start after n steps TODO: Maybe - ]) - self.action_space = gym.spaces.Box(low=-action_bound, high=action_bound, shape=action_bound.shape) - self.observation_space = gym.spaces.Box(low=-state_bound, high=state_bound, shape=state_bound.shape) - - self.fig = None - rect_1 = patches.Rectangle((-self.n_links, -1), - self.n_links + self.hole_x - self.hole_width / 2, 1, - fill=True, edgecolor='k', facecolor='k') - rect_2 = patches.Rectangle((self.hole_x + self.hole_width / 2, -1), - self.n_links - self.hole_x + self.hole_width / 2, 1, - fill=True, edgecolor='k', facecolor='k') - rect_3 = patches.Rectangle((self.hole_x - self.hole_width / 2, -1), self.hole_width, - 1 - self.hole_depth, - fill=True, edgecolor='k', facecolor='k') - - rect_4 = patches.Rectangle((-1, 0), # south west corner - 0.5, # width - self.n_links, # height - fill=True, edgecolor='k', facecolor='k') - - self.patches = [rect_1, rect_2, rect_3, rect_4] - - @property - def end_effector(self): - return self._joints[self.n_links].T - - def configure(self, context): - pass - - def reset(self): - self._joint_angles = self.start_pos - self._angle_velocity = self.start_vel - self._joints = 
np.zeros((self.n_links + 1, 2)) - self._update_joints() - self._steps = 0 - - return self._get_obs().copy() - - def step(self, action: np.ndarray): - """ - a single step with an action in joint velocity space - """ - vel = action # + 0.01 * np.random.randn(self.num_links) - acc = (vel - self._angle_velocity) / self.dt - self._angle_velocity = vel - self._joint_angles = self._joint_angles + self.dt * self._angle_velocity - - self._update_joints() - - # rew = self._reward() - - # compute reward directly in step function - - success = False - reward = 0 - if not self._is_collided: - if self._steps == 199: - dist = np.linalg.norm(self.end_effector - self.bottom_center_of_hole) - reward = - dist ** 2 - success = dist < 0.005 - else: - dist = np.linalg.norm(self.end_effector - self.bottom_center_of_hole) - # if self.collision_penalty != 0: - # reward = -self.collision_penalty - # else: - reward = - dist ** 2 - self.collision_penalty - - reward -= 5e-8 * np.sum(acc ** 2) - - info = {"is_collided": self._is_collided, "is_success": success} - - self._steps += 1 - - # done = self._steps * self.dt > self.time_limit or self._is_collided - done = self._is_collided - - return self._get_obs().copy(), reward, done, info - - def _update_joints(self): - """ - update _joints to get new end effector position. The other links are only required for rendering. - Returns: - - """ - line_points_in_taskspace = self.get_forward_kinematics(num_points_per_link=20) - - self._joints[1:, 0] = self._joints[0, 0] + line_points_in_taskspace[:, -1, 0] - self._joints[1:, 1] = self._joints[0, 1] + line_points_in_taskspace[:, -1, 1] - - self_collision = False - wall_collision = False - - if not self.allow_self_collision: - self_collision = check_self_collision(line_points_in_taskspace) - if np.any(np.abs(self._joint_angles) > np.pi) and not self.allow_self_collision: - self_collision = True - - if not self.allow_wall_collision: - wall_collision = self.check_wall_collision(line_points_in_taskspace) - - self._is_collided = self_collision or wall_collision - - def _get_obs(self): - theta = self._joint_angles - return np.hstack([ - np.cos(theta), - np.sin(theta), - self._angle_velocity, - self.end_effector - self.bottom_center_of_hole, - self._steps - ]) - - def get_forward_kinematics(self, num_points_per_link=1): - theta = self._joint_angles[:, None] - - if num_points_per_link > 1: - intermediate_points = np.linspace(0, 1, num_points_per_link) - else: - intermediate_points = 1 - - accumulated_theta = np.cumsum(theta, axis=0) - - endeffector = np.zeros(shape=(self.n_links, num_points_per_link, 2)) - - x = np.cos(accumulated_theta) * self.link_lengths * intermediate_points - y = np.sin(accumulated_theta) * self.link_lengths * intermediate_points - - endeffector[0, :, 0] = x[0, :] - endeffector[0, :, 1] = y[0, :] - - for i in range(1, self.n_links): - endeffector[i, :, 0] = x[i, :] + endeffector[i - 1, -1, 0] - endeffector[i, :, 1] = y[i, :] + endeffector[i - 1, -1, 1] - - return np.squeeze(endeffector + self._joints[0, :]) - - def check_wall_collision(self, line_points): - - # all points that are before the hole in x - r, c = np.where((line_points[:, :, 0] > -1) & (line_points[:, :, 0] < -0.5) & - (line_points[:, :, 1] > 0) & (line_points[:, :, 1] < self.n_links)) - - if len(r) > 0: - return True - - # all points that are before the hole in x - r, c = np.where(line_points[:, :, 0] < (self.hole_x - self.hole_width / 2)) - - # check if any of those points are below surface - nr_line_points_below_surface_before_hole = 
np.sum(line_points[r, c, 1] < 0) - - if nr_line_points_below_surface_before_hole > 0: - return True - - # all points that are after the hole in x - r, c = np.where(line_points[:, :, 0] > (self.hole_x + self.hole_width / 2)) - - # check if any of those points are below surface - nr_line_points_below_surface_after_hole = np.sum(line_points[r, c, 1] < 0) - - if nr_line_points_below_surface_after_hole > 0: - return True - - # all points that are above the hole - r, c = np.where((line_points[:, :, 0] > (self.hole_x - self.hole_width / 2)) & ( - line_points[:, :, 0] < (self.hole_x + self.hole_width / 2))) - - # check if any of those points are below surface - nr_line_points_below_surface_in_hole = np.sum(line_points[r, c, 1] < -self.hole_depth) - - if nr_line_points_below_surface_in_hole > 0: - return True - - return False - - def render(self, mode='human'): - if self.fig is None: - self.fig = plt.figure() - # plt.ion() - # plt.pause(0.01) - else: - plt.figure(self.fig.number) - - if mode == "human": - plt.cla() - plt.title(f"Iteration: {self._steps}, distance: {self.end_effector - self.bottom_center_of_hole}") - - # Arm - plt.plot(self._joints[:, 0], self._joints[:, 1], 'ro-', markerfacecolor='k') - - # Add the patch to the Axes - [plt.gca().add_patch(rect) for rect in self.patches] - - lim = np.sum(self.link_lengths) + 0.5 - plt.xlim([-lim, lim]) - plt.ylim([-1.1, lim]) - # plt.draw() - plt.pause(1e-4) # pushes window to foreground, which is annoying. - # self.fig.canvas.flush_events() - - elif mode == "partial": - if self._steps == 1: - # fig, ax = plt.subplots() - # Add the patch to the Axes - [plt.gca().add_patch(rect) for rect in self.patches] - # plt.pause(0.01) - - if self._steps % 20 == 0 or self._steps in [1, 199] or self._is_collided: - # Arm - plt.plot(self._joints[:, 0], self._joints[:, 1], 'ro-', markerfacecolor='k', alpha=self._steps / 200) - # ax.plot(line_points_in_taskspace[:, 0, 0], - # line_points_in_taskspace[:, 0, 1], - # line_points_in_taskspace[:, -1, 0], - # line_points_in_taskspace[:, -1, 1], marker='o', color='k', alpha=t / 200) - - lim = np.sum(self.link_lengths) + 0.5 - plt.xlim([-lim, lim]) - plt.ylim([-1.1, lim]) - plt.pause(0.01) - - elif mode == "final": - if self._steps == 199 or self._is_collided: - # fig, ax = plt.subplots() - - # Add the patch to the Axes - [plt.gca().add_patch(rect) for rect in self.patches] - - plt.xlim(-self.n_links, self.n_links), plt.ylim(-1, self.n_links) - # Arm - plt.plot(self._joints[:, 0], self._joints[:, 1], 'ro-', markerfacecolor='k') - - plt.pause(0.01) - - def close(self): - if self.fig is not None: - plt.close(self.fig) - - -if __name__ == '__main__': - nl = 5 - render_mode = "human" # "human" or "partial" or "final" - env = HoleReacher(n_links=nl, allow_self_collision=False, allow_wall_collision=False, hole_width=0.15, - hole_depth=1, hole_x=1) - env.reset() - # env.render(mode=render_mode) - - for i in range(200): - # objective.load_result("/tmp/cma") - # test with random actions - ac = 2 * env.action_space.sample() - # ac[0] += np.pi/2 - obs, rew, d, info = env.step(ac) - env.render(mode=render_mode) - - print(rew) - - if d: - break - - env.close() From 6169ede44943ee6f6c5f037aa3f0c648963a1cfe Mon Sep 17 00:00:00 2001 From: ottofabian Date: Tue, 18 May 2021 10:50:30 +0200 Subject: [PATCH 10/11] removed episodic_simple_reacher.py --- alr_envs/classic_control/__init__.py | 1 - .../episodic_simple_reacher.py | 46 ------------------- 2 files changed, 47 deletions(-) delete mode 100644 
alr_envs/classic_control/episodic_simple_reacher.py diff --git a/alr_envs/classic_control/__init__.py b/alr_envs/classic_control/__init__.py index fde30fc..4a26eaa 100644 --- a/alr_envs/classic_control/__init__.py +++ b/alr_envs/classic_control/__init__.py @@ -1,4 +1,3 @@ from alr_envs.classic_control.simple_reacher import SimpleReacherEnv -from alr_envs.classic_control.episodic_simple_reacher import EpisodicSimpleReacherEnv from alr_envs.classic_control.viapoint_reacher import ViaPointReacher from alr_envs.classic_control.hole_reacher import HoleReacherEnv diff --git a/alr_envs/classic_control/episodic_simple_reacher.py b/alr_envs/classic_control/episodic_simple_reacher.py deleted file mode 100644 index ed498ce..0000000 --- a/alr_envs/classic_control/episodic_simple_reacher.py +++ /dev/null @@ -1,46 +0,0 @@ -from alr_envs.classic_control.simple_reacher import SimpleReacherEnv -from gym import spaces -import numpy as np - - -class EpisodicSimpleReacherEnv(SimpleReacherEnv): - def __init__(self, n_links, random_start=True): - super(EpisodicSimpleReacherEnv, self).__init__(n_links, random_start) - - # self._goal_pos = None - - if random_start: - state_bound = np.hstack([ - [np.pi] * self.n_links, # cos - [np.pi] * self.n_links, # sin - [np.inf] * self.n_links, # velocity - ]) - else: - state_bound = np.empty(0, ) - - state_bound = np.hstack([ - state_bound, - [np.inf] * 2, # x-y coordinates of goal - ]) - - self.observation_space = spaces.Box(low=-state_bound, high=state_bound, shape=state_bound.shape) - - @property - def init_qpos(self): - return self._start_pos - - # @property - # def goal_pos(self): - # return self._goal_pos - - def _get_obs(self): - if self.random_start: - theta = self._joint_angles - return np.hstack([ - np.cos(theta), - np.sin(theta), - self._angle_velocity, - self._goal, - ]) - else: - return self._goal From e331803230341bba2ae5d25f167b052e16b98b6d Mon Sep 17 00:00:00 2001 From: ottofabian Date: Tue, 18 May 2021 10:53:30 +0200 Subject: [PATCH 11/11] changed import --- alr_envs/mujoco/reacher/alr_reacher.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/alr_envs/mujoco/reacher/alr_reacher.py b/alr_envs/mujoco/reacher/alr_reacher.py index c6cca16..e27e069 100644 --- a/alr_envs/mujoco/reacher/alr_reacher.py +++ b/alr_envs/mujoco/reacher/alr_reacher.py @@ -2,12 +2,12 @@ import os import numpy as np from gym import utils -from gym.envs.mujoco import mujoco_env +from gym.envs.mujoco import MujocoEnv import alr_envs.utils.utils as alr_utils -class ALRReacherEnv(mujoco_env.MujocoEnv, utils.EzPickle): +class ALRReacherEnv(MujocoEnv, utils.EzPickle): def __init__(self, steps_before_reward=200, n_links=5, balance=False): utils.EzPickle.__init__(**locals()) @@ -31,7 +31,7 @@ class ALRReacherEnv(mujoco_env.MujocoEnv, utils.EzPickle): else: raise ValueError(f"Invalid number of links {n_links}, only 5 or 7 allowed.") - mujoco_env.MujocoEnv.__init__(self, os.path.join(os.path.dirname(__file__), "assets", file_name), 2) + MujocoEnv.__init__(self, os.path.join(os.path.dirname(__file__), "assets", file_name), 2) def step(self, a): self._steps += 1
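The import change above relies on gym re-exporting MujocoEnv from gym.envs.mujoco, which holds for the classic gym releases this package targets; both spellings resolve to the same class. If compatibility across layouts were a concern, a defensive variant could look like this (a sketch, not part of the patch):

    try:
        from gym.envs.mujoco import MujocoEnv             # re-export used by the change above
    except ImportError:
        from gym.envs.mujoco.mujoco_env import MujocoEnv  # fall back to the submodule spelling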