# fancy_gym/alr_envs/classic_control/simple_reacher.py
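"""Simple reaching task without any physics simulation: a torque-controlled planar reacher."""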

from typing import Iterable, Union

import matplotlib.pyplot as plt
import numpy as np
from gym import spaces
from gym.utils import seeding

from alr_envs.utils.mps.mp_environments import MPEnv


class SimpleReacherEnv(MPEnv):
    """
    Simple Reaching Task without any physics simulation.
    No reward is returned until `steps_before_reward` time steps have passed. This allows the agent to explore the
    space, but requires precise actions towards the end of the trajectory.
    """

    def __init__(self, n_links: int, target: Union[None, Iterable] = None, random_start: bool = True):
        super().__init__()
        self.link_lengths = np.ones(n_links)
        self.n_links = n_links
        self.dt = 0.1

        self.random_start = random_start

        self._joints = None
        self._joint_angles = None
        self._angle_velocity = None
        self._start_pos = np.zeros(self.n_links)
        self._start_vel = np.zeros(self.n_links)

        self._target = target  # provided target value
        self._goal = None  # updated goal value, does not change when target != None

        self.max_torque = 1
        self.steps_before_reward = 199

        action_bound = np.ones((self.n_links,)) * self.max_torque
        state_bound = np.hstack([
            [np.pi] * self.n_links,  # cos
            [np.pi] * self.n_links,  # sin
            [np.inf] * self.n_links,  # velocity
            [np.inf] * 2,  # x-y coordinates of target distance
            [np.inf]  # env steps, because the reward starts only after n steps  TODO: Maybe
        ])
        self.action_space = spaces.Box(low=-action_bound, high=action_bound, shape=action_bound.shape)
        self.observation_space = spaces.Box(low=-state_bound, high=state_bound, shape=state_bound.shape)

        # containers for plotting
        self.metadata = {'render.modes': ["human"]}
        self.fig = None

        self._steps = 0
        self.seed()

    def step(self, action: np.ndarray):
        """
        A single step with an action in torque space.
        """
        # action = self._add_action_noise(action)
        ac = np.clip(action, -self.max_torque, self.max_torque)

        # Semi-implicit Euler integration: update the velocity first, then use the new velocity for the position.
        self._angle_velocity = self._angle_velocity + self.dt * ac
        self._joint_angles = self._joint_angles + self.dt * self._angle_velocity
        self._update_joints()

        reward, info = self._get_reward(action)

        self._steps += 1
        done = False

        return self._get_obs().copy(), reward, done, info

    def reset(self):
        # TODO: maybe do initialisation more random?
        # Sample only the orientation of the first link, i.e. the arm is always straight.
        if self.random_start:
            self._joint_angles = np.hstack([[self.np_random.uniform(-np.pi, np.pi)], np.zeros(self.n_links - 1)])
            self._start_pos = self._joint_angles.copy()
        else:
            self._joint_angles = self._start_pos

        self._generate_goal()

        self._angle_velocity = self._start_vel
        self._joints = np.zeros((self.n_links + 1, 2))
        self._update_joints()
        self._steps = 0

        return self._get_obs().copy()

    def _update_joints(self):
        """
        Update the joint positions to compute the new end-effector position. The other links are only required for
        rendering.
        """
        angles = np.cumsum(self._joint_angles)
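        # Forward kinematics of the planar chain: `angles` holds each link's absolute orientation (cumulative sum of
        # the joint angles), and each joint position is the running sum of the link vectors below. For two unit links
        # with angles [a, b], joint 1 sits at [cos(a), sin(a)] and the end effector adds [cos(a + b), sin(a + b)].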
        x = self.link_lengths * np.vstack([np.cos(angles), np.sin(angles)])
        self._joints[1:] = self._joints[0] + np.cumsum(x.T, axis=0)

    def _get_reward(self, action: np.ndarray):
        diff = self.end_effector - self._goal
        reward_dist = 0
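
        # The distance to the goal is only penalised during the final steps (after `steps_before_reward`), while the
        # quadratic action cost below is applied at every step.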
        if self._steps >= self.steps_before_reward:
            reward_dist -= np.linalg.norm(diff)
            # reward_dist = np.exp(-0.1 * diff ** 2).mean()
            # reward_dist = - (diff ** 2).mean()

        reward_ctrl = (action ** 2).sum()
        reward = reward_dist - reward_ctrl
        return reward, dict(reward_dist=reward_dist, reward_ctrl=reward_ctrl)

    def _get_obs(self):
        theta = self._joint_angles
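        # Observation layout: cos/sin of the joint angles, the joint velocities, the planar offset between end
        # effector and goal, and the current step count.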
        return np.hstack([
            np.cos(theta),
            np.sin(theta),
            self._angle_velocity,
            self.end_effector - self._goal,
            self._steps
        ])

    def _generate_goal(self):
        if self._target is None:
            # center = self._joints[0]
            # # Sample uniformly in circle with radius R around center of reacher.
            # R = np.sum(self.link_lengths)
            # r = R * np.sqrt(self.np_random.uniform())
            # theta = self.np_random.uniform() * 2 * np.pi
            # goal = center + r * np.stack([np.cos(theta), np.sin(theta)])
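
            # Rejection sampling: draw from the bounding square until the goal lies strictly inside the reachable
            # disc of radius sum(link_lengths). The initial dummy goal lies outside, so the loop runs at least once.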
            total_length = np.sum(self.link_lengths)
            goal = np.array([total_length, total_length])
            while np.linalg.norm(goal) >= total_length:
                goal = self.np_random.uniform(low=-total_length, high=total_length, size=2)
        else:
            goal = np.copy(self._target)

        self._goal = goal

    def render(self, mode='human'):  # pragma: no cover
        if self.fig is None:
            # Create the base figure once at the beginning. Afterwards only update the artists.
            plt.ion()
            self.fig = plt.figure()
            ax = self.fig.add_subplot(1, 1, 1)

            # limits
            lim = np.sum(self.link_lengths) + 0.5
            ax.set_xlim([-lim, lim])
            ax.set_ylim([-lim, lim])

            self.line, = ax.plot(self._joints[:, 0], self._joints[:, 1], 'ro-', markerfacecolor='k')
            goal_pos = self._goal.T
            self.goal_point, = ax.plot(goal_pos[0], goal_pos[1], 'gx')
            self.goal_dist, = ax.plot([self.end_effector[0], goal_pos[0]], [self.end_effector[1], goal_pos[1]], 'g--')

            self.fig.show()

        self.fig.gca().set_title(f"Iteration: {self._steps}, distance: {self.end_effector - self._goal}")

        # goal
        goal_pos = self._goal.T
        if self._steps == 1:
            self.goal_point.set_data(goal_pos[0], goal_pos[1])

        # arm
        self.line.set_data(self._joints[:, 0], self._joints[:, 1])

        # distance between end effector and goal
        self.goal_dist.set_data([self.end_effector[0], goal_pos[0]], [self.end_effector[1], goal_pos[1]])

        self.fig.canvas.draw()
        self.fig.canvas.flush_events()
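
    # Boolean mask over the observation vector marking which entries are treated as active; the demo at the bottom
    # of this file indexes the observation with it, i.e. `obs[env.active_obs]`.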
    @property
    def active_obs(self):
        return np.hstack([
            [self.random_start] * self.n_links,  # cos
            [self.random_start] * self.n_links,  # sin
            [self.random_start] * self.n_links,  # velocity
            [True] * 2,  # x-y coordinates of target distance
            [False]  # env steps
        ])

    @property
    def start_pos(self):
        return self._start_pos

    @property
    def goal_pos(self):
        raise ValueError("Goal position is not available and has to be learnt based on the environment.")

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def close(self):
        if self.fig is not None:
            plt.close(self.fig)
        self.fig = None

    @property
    def end_effector(self):
        return self._joints[self.n_links].T
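

# Demo / smoke test: roll out random torques and render the arm when this file is executed directly.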
if __name__ == '__main__':
    nl = 5
    render_mode = "human"  # "human" or "partial" or "final"
    env = SimpleReacherEnv(n_links=nl)
    obs = env.reset()
    print("First", obs)

    for i in range(2000):
        # objective.load_result("/tmp/cma")
        # test with random actions
        ac = 2 * env.action_space.sample()
        # ac = np.ones(env.action_space.shape)
        obs, rew, d, info = env.step(ac)
        env.render(mode=render_mode)

        print(obs[env.active_obs].shape)

        if d or i % 200 == 0:
            env.reset()

    env.close()