From 104281fe16b561a3e203f9a7445fe3bc3038a3a3 Mon Sep 17 00:00:00 2001 From: Maximilian Huettenrauch Date: Tue, 12 Jan 2021 10:52:08 +0100 Subject: [PATCH] changed from step to rollout method --- alr_envs/classic_control/hole_reacher.py | 139 +++++++++++++---------- alr_envs/utils/dmp_async_vec_env.py | 7 +- alr_envs/utils/dmp_env_wrapper.py | 42 +++++-- setup.py | 2 +- 4 files changed, 117 insertions(+), 73 deletions(-) diff --git a/alr_envs/classic_control/hole_reacher.py b/alr_envs/classic_control/hole_reacher.py index ddf754b..441e0cf 100644 --- a/alr_envs/classic_control/hole_reacher.py +++ b/alr_envs/classic_control/hole_reacher.py @@ -1,5 +1,7 @@ import gym import numpy as np +import matplotlib +matplotlib.use('TkAgg') import matplotlib.pyplot as plt from matplotlib import patches @@ -35,6 +37,7 @@ class HoleReacher(gym.Env): self._angle_velocity = None self.start_pos = np.hstack([[np.pi/2], np.zeros(self.num_links - 1)]) self.start_vel = np.zeros(self.num_links) + self.weight_matrix_scale = 50 # for the holereacher, the dmp weights become quite large compared to the values of the goal attractor. this scaling is to ensure they are on similar scale for the optimizer self._dt = 0.01 @@ -66,37 +69,6 @@ class HoleReacher(gym.Env): def end_effector(self): return self._joints[self.num_links].T - def step(self, action): - # vel = (action - self._joint_angles) / self._dt - # acc = (vel - self._angle_velocity) / self._dt - # self._angle_velocity = vel - # self._joint_angles = action - - vel = action - acc = (vel - self._angle_velocity) / self._dt - self._angle_velocity = vel - self._joint_angles = self._joint_angles + self._dt * self._angle_velocity - - self._update_joints() - - rew = self._reward() - - rew -= 1e-6 * np.sum(acc**2) - - if self._steps == 180: - rew -= (0.1 * np.sum(vel**2) ** 2 - + 1e-3 * np.sum(action**2) - ) - - if self._is_collided: - rew -= self.collision_penalty - - info = {} - - self._steps += 1 - - return self._get_obs().copy(), rew, self._is_collided, info - def reset(self): self._joint_angles = self.start_pos self._angle_velocity = self.start_vel @@ -106,6 +78,46 @@ class HoleReacher(gym.Env): return self._get_obs().copy() + def step(self, action): + """ + a single step with an action in joint velocity space + """ + vel = action + acc = (vel - self._angle_velocity) / self._dt + self._angle_velocity = vel + self._joint_angles = self._joint_angles + self._dt * self._angle_velocity + + self._update_joints() + + # rew = self._reward() + + # compute reward directly in step function + + dist_reward = 0 + if not self._is_collided: + if self._steps == 180: + dist_reward = np.linalg.norm(self.end_effector - self.bottom_center_of_hole) + else: + dist_reward = np.linalg.norm(self.end_effector - self.bottom_center_of_hole) + + reward = - dist_reward ** 2 + + reward -= 1e-6 * np.sum(acc**2) + + if self._steps == 180: + reward -= (0.1 * np.sum(vel**2) ** 2 + + 1e-3 * np.sum(action**2) + ) + + if self._is_collided: + reward -= self.collision_penalty + + info = {} + + self._steps += 1 + + return self._get_obs().copy(), reward, self._is_collided, info + def _update_joints(self): """ update _joints to get new end effector position. The other links are only required for rendering. @@ -140,18 +152,17 @@ class HoleReacher(gym.Env): self._steps ]) - def _reward(self): - dist_reward = 0 - if not self._is_collided: - if self._steps == 180: - dist_reward = np.linalg.norm(self.end_effector - self.bottom_center_of_hole) - else: - dist_reward = np.linalg.norm(self.end_effector - self.bottom_center_of_hole) - - # TODO: make negative - out = - dist_reward ** 2 - - return out + # def _reward(self): + # dist_reward = 0 + # if not self._is_collided: + # if self._steps == 180: + # dist_reward = np.linalg.norm(self.end_effector - self.bottom_center_of_hole) + # else: + # dist_reward = np.linalg.norm(self.end_effector - self.bottom_center_of_hole) + # + # out = - dist_reward ** 2 + # + # return out def get_forward_kinematics(self, num_points_per_link=1): theta = self._joint_angles[:, None] @@ -220,8 +231,8 @@ class HoleReacher(gym.Env): def render(self, mode='human'): if self.fig is None: self.fig = plt.figure() - plt.ion() - plt.pause(0.01) + # plt.ion() + # plt.pause(0.01) else: plt.figure(self.fig.number) @@ -242,25 +253,29 @@ class HoleReacher(gym.Env): plt.pause(1e-4) # pushes window to foreground, which is annoying. # self.fig.canvas.flush_events() - elif render_partial: - if t == 0: - fig, ax = plt.subplots() + elif mode == "partial": + if self._steps == 1: + # fig, ax = plt.subplots() # Add the patch to the Axes [plt.gca().add_patch(rect) for rect in self.patches] + # plt.pause(0.01) - plt.xlim(-self.num_links, self.num_links), plt.ylim(-1, self.num_links) - - if t % 20 == 0 or t == 199 or is_collided: - ax.plot(line_points_in_taskspace[:, 0, 0], - line_points_in_taskspace[:, 0, 1], - line_points_in_taskspace[:, -1, 0], - line_points_in_taskspace[:, -1, 1], marker='o', color='k', alpha=t / 200) + if self._steps % 20 == 0 or self._steps in [1, 199] or self._is_collided: + # Arm + plt.plot(self._joints[:, 0], self._joints[:, 1], 'ro-', markerfacecolor='k', alpha=self._steps / 200) + # ax.plot(line_points_in_taskspace[:, 0, 0], + # line_points_in_taskspace[:, 0, 1], + # line_points_in_taskspace[:, -1, 0], + # line_points_in_taskspace[:, -1, 1], marker='o', color='k', alpha=t / 200) + lim = np.sum(self.link_lengths) + 0.5 + plt.xlim([-lim, lim]) + plt.ylim([-1.1, lim]) plt.pause(0.01) - elif render_final: - if self._steps == 0 or self._is_collided: - fig, ax = plt.subplots() + elif mode == "final": + if self._steps == 199 or self._is_collided: + # fig, ax = plt.subplots() # Add the patch to the Axes [plt.gca().add_patch(rect) for rect in self.patches] @@ -274,16 +289,18 @@ class HoleReacher(gym.Env): if __name__ == '__main__': nl = 5 + render_mode = "human" # "human" or "partial" or "final" env = HoleReacher(num_links=nl, allow_self_collision=False, allow_wall_collision=False, hole_width=0.15, hole_depth=1, hole_x=1) env.reset() + # env.render(mode=render_mode) for i in range(200): # objective.load_result("/tmp/cma") - # explore = np.random.multivariate_normal(mean=np.zeros(30), cov=1 * np.eye(30)) - ac = 11 * env.action_space.sample() + # test with random actions + ac = 2 * env.action_space.sample() # ac[0] += np.pi/2 obs, rew, done, info = env.step(ac) - env.render(mode="render_full") + env.render(mode=render_mode) print(rew) diff --git a/alr_envs/utils/dmp_async_vec_env.py b/alr_envs/utils/dmp_async_vec_env.py index 3cf48af..f576277 100644 --- a/alr_envs/utils/dmp_async_vec_env.py +++ b/alr_envs/utils/dmp_async_vec_env.py @@ -4,7 +4,6 @@ from gym.vector.utils import concatenate, create_empty_array from gym.vector.async_vector_env import AsyncState import numpy as np import multiprocessing as mp -from copy import deepcopy import sys @@ -41,7 +40,6 @@ class DmpAsyncVectorEnv(gym.vector.AsyncVectorEnv): 'for a pending call to `{0}` to complete.'.format( self._state.value), self._state.value) - # split_actions = np.array_split(actions, self.num_envs) actions = np.atleast_2d(actions) split_actions = np.array_split(actions, np.minimum(len(actions), self.num_envs)) for pipe, action in zip(self.parent_pipes, split_actions): @@ -89,6 +87,8 @@ class DmpAsyncVectorEnv(gym.vector.AsyncVectorEnv): observations_list, rewards, dones, infos = [_flatten_list(r) for r in zip(*results)] + # for now, we ignore the observations and only return the rewards + # if not self.shared_memory: # self.observations = concatenate(observations_list, self.observations, # self.single_observation_space) @@ -124,8 +124,7 @@ def _worker(index, env_fn, pipe, parent_pipe, shared_memory, error_queue): dones = [] infos = [] for d in data: - env.reset() - observation, reward, done, info = env.step(d) + observation, reward, done, info = env.rollout(d) observations.append(observation) rewards.append(reward) dones.append(done) diff --git a/alr_envs/utils/dmp_env_wrapper.py b/alr_envs/utils/dmp_env_wrapper.py index 63d09ce..cf73ae5 100644 --- a/alr_envs/utils/dmp_env_wrapper.py +++ b/alr_envs/utils/dmp_env_wrapper.py @@ -14,7 +14,7 @@ class DmpEnvWrapperBase(gym.Wrapper): if learn_goal: self.dim += num_dof self.learn_goal = True - duration = duration # seconds + self.duration = duration # seconds time_steps = int(duration / dt) self.t = np.linspace(0, duration, time_steps) @@ -35,6 +35,21 @@ class DmpEnvWrapperBase(gym.Wrapper): self.dmp.set_weights(dmp_weights, dmp_goal_pos) + def __call__(self, params): + params = np.atleast_2d(params) + observations = [] + rewards = [] + dones = [] + infos = [] + for p in params: + observation, reward, done, info = self.rollout(p) + observations.append(observation) + rewards.append(reward) + dones.append(done) + infos.append(info) + + return np.array(rewards) + def goal_and_weights(self, params): if len(params.shape) > 1: assert params.shape[1] == self.dim @@ -51,19 +66,26 @@ class DmpEnvWrapperBase(gym.Wrapper): return goal_pos, weight_matrix - def step(self, action, render=False): - """ overwrite step function where action now is the weights and possible goal position""" + def rollout(self, params, render=False): + """ This function generates a trajectory based on a DMP and then does the usual loop over reset and step""" raise NotImplementedError class DmpEnvWrapperAngle(DmpEnvWrapperBase): - def step(self, action, render=False): + """ + Wrapper for gym environments which creates a trajectory in joint angle space + """ + def rollout(self, action, render=False): goal_pos, weight_matrix = self.goal_and_weights(action) + if hasattr(self.env, "weight_matrix_scale"): + weight_matrix = weight_matrix * self.env.weight_matrix_scale self.dmp.set_weights(weight_matrix, goal_pos) trajectory, velocities = self.dmp.reference_trajectory(self.t) rews = [] + self.env.reset() + for t, traj in enumerate(trajectory): obs, rew, done, info = self.env.step(traj) rews.append(rew) @@ -73,21 +95,27 @@ class DmpEnvWrapperAngle(DmpEnvWrapperBase): break reward = np.sum(rews) - done = True + # done = True info = {} return obs, reward, done, info class DmpEnvWrapperVel(DmpEnvWrapperBase): - def step(self, action, render=False): + """ + Wrapper for gym environments which creates a trajectory in joint velocity space + """ + def rollout(self, action, render=False): goal_pos, weight_matrix = self.goal_and_weights(action) - weight_matrix *= 50 + if hasattr(self.env, "weight_matrix_scale"): + weight_matrix = weight_matrix * self.env.weight_matrix_scale self.dmp.set_weights(weight_matrix, goal_pos) trajectory, velocities = self.dmp.reference_trajectory(self.t) rews = [] + self.env.reset() + for t, vel in enumerate(velocities): obs, rew, done, info = self.env.step(vel) rews.append(rew) diff --git a/setup.py b/setup.py index 81f949d..6f30786 100644 --- a/setup.py +++ b/setup.py @@ -2,5 +2,5 @@ from setuptools import setup setup(name='alr_envs', version='0.0.1', - install_requires=['gym', 'PyQt5'] # And any other dependencies foo needs + install_requires=['gym', 'PyQt5', 'matplotlib'] # And any other dependencies foo needs )