changed from step to rollout method

This commit is contained in:
Maximilian Huettenrauch 2021-01-12 10:52:08 +01:00
parent a8fcbd6fb0
commit 104281fe16
4 changed files with 117 additions and 73 deletions
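In short: the DMP wrappers no longer overload step to consume a whole parameter vector; an episode is now executed through a dedicated rollout method that decodes the DMP parameters, resets the wrapped environment internally, and plays the generated trajectory to completion, while __call__ evaluates a batch of parameter vectors. A minimal usage sketch of the new calling convention (assuming an already constructed DmpEnvWrapperVel instance named wrapper; the constructor arguments are not part of this commit):

    import numpy as np
    # hypothetical driver code; only wrapper.dim, rollout() and __call__ come from this commit
    params = np.random.randn(wrapper.dim)               # flat DMP weights (+ goal position, if learned)
    obs, reward, done, info = wrapper.rollout(params)   # one complete episode, reset handled inside
    rewards = wrapper(np.stack([params, params]))       # batch evaluation, returns only the episode rewards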

View File

@@ -1,5 +1,7 @@
 import gym
 import numpy as np
+import matplotlib
+matplotlib.use('TkAgg')
 import matplotlib.pyplot as plt
 from matplotlib import patches
@@ -35,6 +37,7 @@ class HoleReacher(gym.Env):
         self._angle_velocity = None
         self.start_pos = np.hstack([[np.pi/2], np.zeros(self.num_links - 1)])
         self.start_vel = np.zeros(self.num_links)
+        self.weight_matrix_scale = 50  # for the holereacher, the dmp weights become quite large compared to the values of the goal attractor. this scaling is to ensure they are on similar scale for the optimizer
         self._dt = 0.01
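The new weight_matrix_scale attribute is consumed by the DMP wrappers further down in this commit: if the wrapped environment defines it, the decoded weight matrix is multiplied by it before being handed to the DMP, so that the weights and the goal attractor end up on a similar scale for the optimizer. The pattern, mirroring the wrapper code later in this diff:

    goal_pos, weight_matrix = self.goal_and_weights(action)
    if hasattr(self.env, "weight_matrix_scale"):
        weight_matrix = weight_matrix * self.env.weight_matrix_scale  # 50 for the HoleReacher
    self.dmp.set_weights(weight_matrix, goal_pos)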
@@ -66,37 +69,6 @@ class HoleReacher(gym.Env):
     def end_effector(self):
         return self._joints[self.num_links].T

-    def step(self, action):
-        # vel = (action - self._joint_angles) / self._dt
-        # acc = (vel - self._angle_velocity) / self._dt
-        # self._angle_velocity = vel
-        # self._joint_angles = action
-
-        vel = action
-        acc = (vel - self._angle_velocity) / self._dt
-        self._angle_velocity = vel
-        self._joint_angles = self._joint_angles + self._dt * self._angle_velocity
-
-        self._update_joints()
-
-        rew = self._reward()
-
-        rew -= 1e-6 * np.sum(acc**2)
-
-        if self._steps == 180:
-            rew -= (0.1 * np.sum(vel**2) ** 2
-                    + 1e-3 * np.sum(action**2)
-                    )
-
-        if self._is_collided:
-            rew -= self.collision_penalty
-
-        info = {}
-
-        self._steps += 1
-
-        return self._get_obs().copy(), rew, self._is_collided, info
-
     def reset(self):
         self._joint_angles = self.start_pos
         self._angle_velocity = self.start_vel
@@ -106,6 +78,46 @@ class HoleReacher(gym.Env):

         return self._get_obs().copy()

+    def step(self, action):
+        """
+        a single step with an action in joint velocity space
+        """
+        vel = action
+        acc = (vel - self._angle_velocity) / self._dt
+        self._angle_velocity = vel
+        self._joint_angles = self._joint_angles + self._dt * self._angle_velocity
+
+        self._update_joints()
+
+        # rew = self._reward()
+
+        # compute reward directly in step function
+        dist_reward = 0
+        if not self._is_collided:
+            if self._steps == 180:
+                dist_reward = np.linalg.norm(self.end_effector - self.bottom_center_of_hole)
+        else:
+            dist_reward = np.linalg.norm(self.end_effector - self.bottom_center_of_hole)
+
+        reward = - dist_reward ** 2
+
+        reward -= 1e-6 * np.sum(acc**2)
+
+        if self._steps == 180:
+            reward -= (0.1 * np.sum(vel**2) ** 2
+                       + 1e-3 * np.sum(action**2)
+                       )
+
+        if self._is_collided:
+            reward -= self.collision_penalty
+
+        info = {}
+
+        self._steps += 1
+
+        return self._get_obs().copy(), reward, self._is_collided, info
+
     def _update_joints(self):
         """
         update _joints to get new end effector position. The other links are only required for rendering.
@@ -140,18 +152,17 @@ class HoleReacher(gym.Env):
             self._steps
         ])

-    def _reward(self):
-        dist_reward = 0
-        if not self._is_collided:
-            if self._steps == 180:
-                dist_reward = np.linalg.norm(self.end_effector - self.bottom_center_of_hole)
-        else:
-            dist_reward = np.linalg.norm(self.end_effector - self.bottom_center_of_hole)
-
-        # TODO: make negative
-        out = - dist_reward ** 2
-
-        return out
+    # def _reward(self):
+    #     dist_reward = 0
+    #     if not self._is_collided:
+    #         if self._steps == 180:
+    #             dist_reward = np.linalg.norm(self.end_effector - self.bottom_center_of_hole)
+    #     else:
+    #         dist_reward = np.linalg.norm(self.end_effector - self.bottom_center_of_hole)
+    #
+    #     out = - dist_reward ** 2
+    #
+    #     return out

     def get_forward_kinematics(self, num_points_per_link=1):
         theta = self._joint_angles[:, None]
@@ -220,8 +231,8 @@ class HoleReacher(gym.Env):
     def render(self, mode='human'):
         if self.fig is None:
             self.fig = plt.figure()
-            plt.ion()
-            plt.pause(0.01)
+            # plt.ion()
+            # plt.pause(0.01)
         else:
             plt.figure(self.fig.number)
@ -242,25 +253,29 @@ class HoleReacher(gym.Env):
plt.pause(1e-4) # pushes window to foreground, which is annoying. plt.pause(1e-4) # pushes window to foreground, which is annoying.
# self.fig.canvas.flush_events() # self.fig.canvas.flush_events()
elif render_partial: elif mode == "partial":
if t == 0: if self._steps == 1:
fig, ax = plt.subplots() # fig, ax = plt.subplots()
# Add the patch to the Axes # Add the patch to the Axes
[plt.gca().add_patch(rect) for rect in self.patches] [plt.gca().add_patch(rect) for rect in self.patches]
# plt.pause(0.01)
plt.xlim(-self.num_links, self.num_links), plt.ylim(-1, self.num_links) if self._steps % 20 == 0 or self._steps in [1, 199] or self._is_collided:
# Arm
if t % 20 == 0 or t == 199 or is_collided: plt.plot(self._joints[:, 0], self._joints[:, 1], 'ro-', markerfacecolor='k', alpha=self._steps / 200)
ax.plot(line_points_in_taskspace[:, 0, 0], # ax.plot(line_points_in_taskspace[:, 0, 0],
line_points_in_taskspace[:, 0, 1], # line_points_in_taskspace[:, 0, 1],
line_points_in_taskspace[:, -1, 0], # line_points_in_taskspace[:, -1, 0],
line_points_in_taskspace[:, -1, 1], marker='o', color='k', alpha=t / 200) # line_points_in_taskspace[:, -1, 1], marker='o', color='k', alpha=t / 200)
lim = np.sum(self.link_lengths) + 0.5
plt.xlim([-lim, lim])
plt.ylim([-1.1, lim])
plt.pause(0.01) plt.pause(0.01)
elif render_final: elif mode == "final":
if self._steps == 0 or self._is_collided: if self._steps == 199 or self._is_collided:
fig, ax = plt.subplots() # fig, ax = plt.subplots()
# Add the patch to the Axes # Add the patch to the Axes
[plt.gca().add_patch(rect) for rect in self.patches] [plt.gca().add_patch(rect) for rect in self.patches]
@@ -274,16 +289,18 @@ class HoleReacher(gym.Env):

 if __name__ == '__main__':
     nl = 5
+    render_mode = "human"  # "human" or "partial" or "final"
     env = HoleReacher(num_links=nl, allow_self_collision=False, allow_wall_collision=False, hole_width=0.15, hole_depth=1, hole_x=1)
     env.reset()
+    # env.render(mode=render_mode)

     for i in range(200):
         # objective.load_result("/tmp/cma")
-        # explore = np.random.multivariate_normal(mean=np.zeros(30), cov=1 * np.eye(30))
-        ac = 11 * env.action_space.sample()
+        # test with random actions
+        ac = 2 * env.action_space.sample()
         # ac[0] += np.pi/2
         obs, rew, done, info = env.step(ac)
-        env.render(mode="render_full")
+        env.render(mode=render_mode)

         print(rew)

View File

@@ -4,7 +4,6 @@ from gym.vector.utils import concatenate, create_empty_array
 from gym.vector.async_vector_env import AsyncState
 import numpy as np
 import multiprocessing as mp
-from copy import deepcopy
 import sys
@@ -41,7 +40,6 @@ class DmpAsyncVectorEnv(gym.vector.AsyncVectorEnv):
                 'for a pending call to `{0}` to complete.'.format(
                     self._state.value), self._state.value)

-        # split_actions = np.array_split(actions, self.num_envs)
         actions = np.atleast_2d(actions)
         split_actions = np.array_split(actions, np.minimum(len(actions), self.num_envs))
         for pipe, action in zip(self.parent_pipes, split_actions):
@@ -89,6 +87,8 @@ class DmpAsyncVectorEnv(gym.vector.AsyncVectorEnv):

         observations_list, rewards, dones, infos = [_flatten_list(r) for r in zip(*results)]

+        # for now, we ignore the observations and only return the rewards
+
         # if not self.shared_memory:
         #     self.observations = concatenate(observations_list, self.observations,
         #                                     self.single_observation_space)
@@ -124,8 +124,7 @@ def _worker(index, env_fn, pipe, parent_pipe, shared_memory, error_queue):
                 dones = []
                 infos = []
                 for d in data:
-                    env.reset()
-                    observation, reward, done, info = env.step(d)
+                    observation, reward, done, info = env.rollout(d)
                     observations.append(observation)
                     rewards.append(reward)
                     dones.append(done)
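With the worker delegating to env.rollout, every element of a dispatched parameter batch now yields one complete episode, and per the comment above only the rewards are of interest for now. A rough sketch of how the vectorized env might be driven (the factory function and the exact return layout are assumptions, not part of this diff):

    # hypothetical driver code
    env_fns = [make_wrapped_env for _ in range(4)]   # callables returning DMP-wrapped envs exposing rollout()
    vec_env = DmpAsyncVectorEnv(env_fns)
    param_batch = np.random.randn(8, dim)            # 8 flat DMP parameter vectors
    results = vec_env.step(param_batch)              # each worker calls env.rollout(p) per parameter vector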

View File

@@ -14,7 +14,7 @@ class DmpEnvWrapperBase(gym.Wrapper):
         if learn_goal:
             self.dim += num_dof
             self.learn_goal = True
-        duration = duration  # seconds
+        self.duration = duration  # seconds
         time_steps = int(duration / dt)
         self.t = np.linspace(0, duration, time_steps)
@@ -35,6 +35,21 @@ class DmpEnvWrapperBase(gym.Wrapper):

         self.dmp.set_weights(dmp_weights, dmp_goal_pos)

+    def __call__(self, params):
+        params = np.atleast_2d(params)
+        observations = []
+        rewards = []
+        dones = []
+        infos = []
+        for p in params:
+            observation, reward, done, info = self.rollout(p)
+            observations.append(observation)
+            rewards.append(reward)
+            dones.append(done)
+            infos.append(info)
+
+        return np.array(rewards)
+
     def goal_and_weights(self, params):
         if len(params.shape) > 1:
             assert params.shape[1] == self.dim
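Because __call__ evaluates a batch of parameter vectors and returns only the rewards, a wrapped env can be used directly as a black-box objective for an episodic optimizer. An illustrative random-search loop (only wrapper(...) and wrapper.dim come from this commit; the loop itself is a sketch):

    mean = np.zeros(wrapper.dim)
    for it in range(100):
        candidates = mean + 0.1 * np.random.randn(16, wrapper.dim)  # 16 perturbed parameter vectors
        rewards = wrapper(candidates)                               # one rollout per candidate
        mean = candidates[np.argmax(rewards)]                       # keep the best candidate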
@@ -51,19 +66,26 @@

         return goal_pos, weight_matrix

-    def step(self, action, render=False):
-        """ overwrite step function where action now is the weights and possible goal position"""
+    def rollout(self, params, render=False):
+        """ This function generates a trajectory based on a DMP and then does the usual loop over reset and step"""
         raise NotImplementedError


 class DmpEnvWrapperAngle(DmpEnvWrapperBase):
-    def step(self, action, render=False):
+    """
+    Wrapper for gym environments which creates a trajectory in joint angle space
+    """
+    def rollout(self, action, render=False):
         goal_pos, weight_matrix = self.goal_and_weights(action)
+        if hasattr(self.env, "weight_matrix_scale"):
+            weight_matrix = weight_matrix * self.env.weight_matrix_scale
         self.dmp.set_weights(weight_matrix, goal_pos)
         trajectory, velocities = self.dmp.reference_trajectory(self.t)

         rews = []

+        self.env.reset()
+
         for t, traj in enumerate(trajectory):
             obs, rew, done, info = self.env.step(traj)
             rews.append(rew)
@ -73,21 +95,27 @@ class DmpEnvWrapperAngle(DmpEnvWrapperBase):
break break
reward = np.sum(rews) reward = np.sum(rews)
done = True # done = True
info = {} info = {}
return obs, reward, done, info return obs, reward, done, info
class DmpEnvWrapperVel(DmpEnvWrapperBase): class DmpEnvWrapperVel(DmpEnvWrapperBase):
def step(self, action, render=False): """
Wrapper for gym environments which creates a trajectory in joint velocity space
"""
def rollout(self, action, render=False):
goal_pos, weight_matrix = self.goal_and_weights(action) goal_pos, weight_matrix = self.goal_and_weights(action)
weight_matrix *= 50 if hasattr(self.env, "weight_matrix_scale"):
weight_matrix = weight_matrix * self.env.weight_matrix_scale
self.dmp.set_weights(weight_matrix, goal_pos) self.dmp.set_weights(weight_matrix, goal_pos)
trajectory, velocities = self.dmp.reference_trajectory(self.t) trajectory, velocities = self.dmp.reference_trajectory(self.t)
rews = [] rews = []
self.env.reset()
for t, vel in enumerate(velocities): for t, vel in enumerate(velocities):
obs, rew, done, info = self.env.step(vel) obs, rew, done, info = self.env.step(vel)
rews.append(rew) rews.append(rew)

View File

@@ -2,5 +2,5 @@ from setuptools import setup

 setup(name='alr_envs',
       version='0.0.1',
-      install_requires=['gym', 'PyQt5']  # And any other dependencies foo needs
+      install_requires=['gym', 'PyQt5', 'matplotlib']  # And any other dependencies foo needs
       )