changed from step to rollout method
parent a8fcbd6fb0
commit 104281fe16
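Summary of the change: the DMP wrappers and the async vector env now expose a whole episode through a single rollout(params) call instead of the gym-style step(action) interface. rollout resets the wrapped environment itself, plays back the trajectory generated from the DMP parameters step by step, and returns the summed episode reward; the worker and the new __call__ method simply loop over parameter vectors. A minimal sketch of the new calling pattern (wrapped_env and param_batch are placeholder names, not identifiers from this commit):

import numpy as np

def evaluate(wrapped_env, param_batch):
    # one rollout per DMP parameter vector; reset and the per-step loop
    # happen inside rollout, so no explicit reset/step calls are needed here
    returns = []
    for params in np.atleast_2d(param_batch):
        obs, reward, done, info = wrapped_env.rollout(params)
        returns.append(reward)
    return np.array(returns)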
@@ -1,5 +1,7 @@
import gym
import numpy as np
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
from matplotlib import patches

@@ -35,6 +37,7 @@ class HoleReacher(gym.Env):
        self._angle_velocity = None
        self.start_pos = np.hstack([[np.pi/2], np.zeros(self.num_links - 1)])
        self.start_vel = np.zeros(self.num_links)
        self.weight_matrix_scale = 50  # for the holereacher, the dmp weights become quite large compared to the values of the goal attractor. this scaling is to ensure they are on similar scale for the optimizer

        self._dt = 0.01

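Note: weight_matrix_scale is consumed by the DMP wrappers changed further down in this commit; if the wrapped environment defines the attribute, the sampled DMP weights are multiplied by it before being passed to the DMP, so that weights and goal parameters end up on a similar scale for the optimizer. A small standalone restatement of the hasattr check used in the rollout methods below:

def scale_weights(env, weight_matrix):
    # mirrors the hasattr check in DmpEnvWrapperAngle/DmpEnvWrapperVel.rollout
    if hasattr(env, "weight_matrix_scale"):
        return weight_matrix * env.weight_matrix_scale
    return weight_matrix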
@@ -66,37 +69,6 @@ class HoleReacher(gym.Env):
    def end_effector(self):
        return self._joints[self.num_links].T

    def step(self, action):
        # vel = (action - self._joint_angles) / self._dt
        # acc = (vel - self._angle_velocity) / self._dt
        # self._angle_velocity = vel
        # self._joint_angles = action

        vel = action
        acc = (vel - self._angle_velocity) / self._dt
        self._angle_velocity = vel
        self._joint_angles = self._joint_angles + self._dt * self._angle_velocity

        self._update_joints()

        rew = self._reward()

        rew -= 1e-6 * np.sum(acc**2)

        if self._steps == 180:
            rew -= (0.1 * np.sum(vel**2) ** 2
                    + 1e-3 * np.sum(action**2)
                    )

        if self._is_collided:
            rew -= self.collision_penalty

        info = {}

        self._steps += 1

        return self._get_obs().copy(), rew, self._is_collided, info

    def reset(self):
        self._joint_angles = self.start_pos
        self._angle_velocity = self.start_vel
@@ -106,6 +78,46 @@ class HoleReacher(gym.Env):

        return self._get_obs().copy()

    def step(self, action):
        """
        a single step with an action in joint velocity space
        """
        vel = action
        acc = (vel - self._angle_velocity) / self._dt
        self._angle_velocity = vel
        self._joint_angles = self._joint_angles + self._dt * self._angle_velocity

        self._update_joints()

        # rew = self._reward()

        # compute reward directly in step function

        dist_reward = 0
        if not self._is_collided:
            if self._steps == 180:
                dist_reward = np.linalg.norm(self.end_effector - self.bottom_center_of_hole)
            else:
                dist_reward = np.linalg.norm(self.end_effector - self.bottom_center_of_hole)

        reward = - dist_reward ** 2

        reward -= 1e-6 * np.sum(acc**2)

        if self._steps == 180:
            reward -= (0.1 * np.sum(vel**2) ** 2
                       + 1e-3 * np.sum(action**2)
                       )

        if self._is_collided:
            reward -= self.collision_penalty

        info = {}

        self._steps += 1

        return self._get_obs().copy(), reward, self._is_collided, info

    def _update_joints(self):
        """
        update _joints to get new end effector position. The other links are only required for rendering.
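For reference, the reward that is now computed inline in the new step method above reduces to the following per-step rule (a condensed restatement of the code in this hunk, nothing new; note that the if self._steps == 180 / else branches compute the same distance, so they collapse to a single term here):

import numpy as np

def step_reward(env, vel, acc, action, final_step=180):
    # squared distance of the end effector to the bottom of the hole, unless collided
    reward = 0.0
    if not env._is_collided:
        dist = np.linalg.norm(env.end_effector - env.bottom_center_of_hole)
        reward = -dist ** 2
    # small penalty on joint accelerations every step
    reward -= 1e-6 * np.sum(acc ** 2)
    # extra velocity/action penalty at the final step
    if env._steps == final_step:
        reward -= 0.1 * np.sum(vel ** 2) ** 2 + 1e-3 * np.sum(action ** 2)
    # large penalty if the arm hit a wall or itself
    if env._is_collided:
        reward -= env.collision_penalty
    return reward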
@@ -140,18 +152,17 @@ class HoleReacher(gym.Env):
            self._steps
        ])

    def _reward(self):
        dist_reward = 0
        if not self._is_collided:
            if self._steps == 180:
                dist_reward = np.linalg.norm(self.end_effector - self.bottom_center_of_hole)
            else:
                dist_reward = np.linalg.norm(self.end_effector - self.bottom_center_of_hole)

        # TODO: make negative
        out = - dist_reward ** 2

        return out
    # def _reward(self):
    #     dist_reward = 0
    #     if not self._is_collided:
    #         if self._steps == 180:
    #             dist_reward = np.linalg.norm(self.end_effector - self.bottom_center_of_hole)
    #         else:
    #             dist_reward = np.linalg.norm(self.end_effector - self.bottom_center_of_hole)
    #
    #     out = - dist_reward ** 2
    #
    #     return out

    def get_forward_kinematics(self, num_points_per_link=1):
        theta = self._joint_angles[:, None]
@@ -220,8 +231,8 @@ class HoleReacher(gym.Env):
    def render(self, mode='human'):
        if self.fig is None:
            self.fig = plt.figure()
            plt.ion()
            plt.pause(0.01)
            # plt.ion()
            # plt.pause(0.01)
        else:
            plt.figure(self.fig.number)

@@ -242,25 +253,29 @@ class HoleReacher(gym.Env):
            plt.pause(1e-4)  # pushes window to foreground, which is annoying.
            # self.fig.canvas.flush_events()

        elif render_partial:
            if t == 0:
                fig, ax = plt.subplots()
        elif mode == "partial":
            if self._steps == 1:
                # fig, ax = plt.subplots()
                # Add the patch to the Axes
                [plt.gca().add_patch(rect) for rect in self.patches]
                # plt.pause(0.01)

                plt.xlim(-self.num_links, self.num_links), plt.ylim(-1, self.num_links)

            if t % 20 == 0 or t == 199 or is_collided:
                ax.plot(line_points_in_taskspace[:, 0, 0],
                        line_points_in_taskspace[:, 0, 1],
                        line_points_in_taskspace[:, -1, 0],
                        line_points_in_taskspace[:, -1, 1], marker='o', color='k', alpha=t / 200)
            if self._steps % 20 == 0 or self._steps in [1, 199] or self._is_collided:
                # Arm
                plt.plot(self._joints[:, 0], self._joints[:, 1], 'ro-', markerfacecolor='k', alpha=self._steps / 200)
                # ax.plot(line_points_in_taskspace[:, 0, 0],
                #         line_points_in_taskspace[:, 0, 1],
                #         line_points_in_taskspace[:, -1, 0],
                #         line_points_in_taskspace[:, -1, 1], marker='o', color='k', alpha=t / 200)

                lim = np.sum(self.link_lengths) + 0.5
                plt.xlim([-lim, lim])
                plt.ylim([-1.1, lim])
                plt.pause(0.01)

        elif render_final:
            if self._steps == 0 or self._is_collided:
                fig, ax = plt.subplots()
        elif mode == "final":
            if self._steps == 199 or self._is_collided:
                # fig, ax = plt.subplots()

                # Add the patch to the Axes
                [plt.gca().add_patch(rect) for rect in self.patches]
@@ -274,16 +289,18 @@ class HoleReacher(gym.Env):

if __name__ == '__main__':
    nl = 5
    render_mode = "human"  # "human" or "partial" or "final"
    env = HoleReacher(num_links=nl, allow_self_collision=False, allow_wall_collision=False, hole_width=0.15, hole_depth=1, hole_x=1)
    env.reset()
    # env.render(mode=render_mode)

    for i in range(200):
        # objective.load_result("/tmp/cma")
        # explore = np.random.multivariate_normal(mean=np.zeros(30), cov=1 * np.eye(30))
        ac = 11 * env.action_space.sample()
        # test with random actions
        ac = 2 * env.action_space.sample()
        # ac[0] += np.pi/2
        obs, rew, done, info = env.step(ac)
        env.render(mode="render_full")
        env.render(mode=render_mode)

        print(rew)

@@ -4,7 +4,6 @@ from gym.vector.utils import concatenate, create_empty_array
from gym.vector.async_vector_env import AsyncState
import numpy as np
import multiprocessing as mp
from copy import deepcopy
import sys


@@ -41,7 +40,6 @@ class DmpAsyncVectorEnv(gym.vector.AsyncVectorEnv):
                'for a pending call to `{0}` to complete.'.format(
                    self._state.value), self._state.value)

        # split_actions = np.array_split(actions, self.num_envs)
        actions = np.atleast_2d(actions)
        split_actions = np.array_split(actions, np.minimum(len(actions), self.num_envs))
        for pipe, action in zip(self.parent_pipes, split_actions):
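The np.minimum(len(actions), self.num_envs) argument keeps np.array_split from producing empty chunks when fewer parameter vectors than worker environments are submitted. A quick illustration with plain numpy (the batch size 3 and worker count 8 are made-up example values):

import numpy as np

actions = np.atleast_2d(np.zeros((3, 25)))                      # 3 parameter vectors
chunks = np.array_split(actions, np.minimum(len(actions), 8))   # 8 would be num_envs
print([c.shape for c in chunks])                                # [(1, 25), (1, 25), (1, 25)], no empty chunks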
@@ -89,6 +87,8 @@ class DmpAsyncVectorEnv(gym.vector.AsyncVectorEnv):

        observations_list, rewards, dones, infos = [_flatten_list(r) for r in zip(*results)]

        # for now, we ignore the observations and only return the rewards

        # if not self.shared_memory:
        #     self.observations = concatenate(observations_list, self.observations,
        #                                     self.single_observation_space)
@@ -124,8 +124,7 @@ def _worker(index, env_fn, pipe, parent_pipe, shared_memory, error_queue):
            dones = []
            infos = []
            for d in data:
                env.reset()
                observation, reward, done, info = env.step(d)
                observation, reward, done, info = env.rollout(d)
                observations.append(observation)
                rewards.append(reward)
                dones.append(done)

@@ -14,7 +14,7 @@ class DmpEnvWrapperBase(gym.Wrapper):
        if learn_goal:
            self.dim += num_dof
            self.learn_goal = True
        duration = duration  # seconds
        self.duration = duration  # seconds
        time_steps = int(duration / dt)
        self.t = np.linspace(0, duration, time_steps)

@@ -35,6 +35,21 @@ class DmpEnvWrapperBase(gym.Wrapper):

        self.dmp.set_weights(dmp_weights, dmp_goal_pos)

    def __call__(self, params):
        params = np.atleast_2d(params)
        observations = []
        rewards = []
        dones = []
        infos = []
        for p in params:
            observation, reward, done, info = self.rollout(p)
            observations.append(observation)
            rewards.append(reward)
            dones.append(done)
            infos.append(info)

        return np.array(rewards)

    def goal_and_weights(self, params):
        if len(params.shape) > 1:
            assert params.shape[1] == self.dim
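With __call__ in place, a wrapper instance can be used directly as a black-box objective over DMP parameters: a single vector is broadcast to 2-D, one rollout is run per row, and only the array of summed episode rewards is returned (observations, dones and infos are collected but not used). A hedged usage sketch, assuming wrapper is an already constructed DmpEnvWrapperVel (its constructor arguments are not shown in this diff):

import numpy as np

# wrapper = DmpEnvWrapperVel(...)                 # construction not shown in this diff
candidates = np.random.randn(10, wrapper.dim)     # e.g. a population of 10 parameter vectors
rewards = wrapper(candidates)                     # shape (10,): one episode return per row
best = candidates[np.argmax(rewards)]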
@@ -51,19 +66,26 @@ class DmpEnvWrapperBase(gym.Wrapper):

        return goal_pos, weight_matrix

    def step(self, action, render=False):
        """ overwrite step function where action now is the weights and possible goal position"""
    def rollout(self, params, render=False):
        """ This function generates a trajectory based on a DMP and then does the usual loop over reset and step"""
        raise NotImplementedError


class DmpEnvWrapperAngle(DmpEnvWrapperBase):
    def step(self, action, render=False):
    """
    Wrapper for gym environments which creates a trajectory in joint angle space
    """
    def rollout(self, action, render=False):
        goal_pos, weight_matrix = self.goal_and_weights(action)
        if hasattr(self.env, "weight_matrix_scale"):
            weight_matrix = weight_matrix * self.env.weight_matrix_scale
        self.dmp.set_weights(weight_matrix, goal_pos)
        trajectory, velocities = self.dmp.reference_trajectory(self.t)

        rews = []

        self.env.reset()

        for t, traj in enumerate(trajectory):
            obs, rew, done, info = self.env.step(traj)
            rews.append(rew)
@@ -73,21 +95,27 @@ class DmpEnvWrapperAngle(DmpEnvWrapperBase):
                break

        reward = np.sum(rews)
        done = True
        # done = True
        info = {}

        return obs, reward, done, info


class DmpEnvWrapperVel(DmpEnvWrapperBase):
    def step(self, action, render=False):
    """
    Wrapper for gym environments which creates a trajectory in joint velocity space
    """
    def rollout(self, action, render=False):
        goal_pos, weight_matrix = self.goal_and_weights(action)
        weight_matrix *= 50
        if hasattr(self.env, "weight_matrix_scale"):
            weight_matrix = weight_matrix * self.env.weight_matrix_scale
        self.dmp.set_weights(weight_matrix, goal_pos)
        trajectory, velocities = self.dmp.reference_trajectory(self.t)

        rews = []

        self.env.reset()

        for t, vel in enumerate(velocities):
            obs, rew, done, info = self.env.step(vel)
            rews.append(rew)