fancy_gym/alr_envs/classic_control/simple_reacher/simple_reacher.py

from typing import Iterable, Union

import gym
import matplotlib.pyplot as plt
import numpy as np
from gym import spaces
from gym.utils import seeding


class SimpleReacherEnv(gym.Env):
    """
    Simple Reaching Task without any physics simulation.
    Returns no reward until 150 time steps. This allows the agent to explore the space, but requires precise actions
    towards the end of the trajectory.
    """

    def __init__(self, n_links: int, target: Union[None, Iterable] = None, random_start: bool = True):
        super().__init__()
        self.link_lengths = np.ones(n_links)
        self.n_links = n_links
        self._dt = 0.1

        self.random_start = random_start

        # provided initial parameters
        self.inital_target = target

        # temp container for current env state
        self._goal = None

        self._joints = None
        self._joint_angles = None
        self._angle_velocity = None
        self._start_pos = np.zeros(self.n_links)
        self._start_vel = np.zeros(self.n_links)

        self.max_torque = 1
        self.steps_before_reward = 199

        action_bound = np.ones((self.n_links,)) * self.max_torque
        state_bound = np.hstack([
            [np.pi] * self.n_links,  # cos
            [np.pi] * self.n_links,  # sin
            [np.inf] * self.n_links,  # velocity
            [np.inf] * 2,  # x-y coordinates of target distance
            [np.inf]  # env steps, because reward start after n steps TODO: Maybe
        ])
        self.action_space = spaces.Box(low=-action_bound, high=action_bound, shape=action_bound.shape)
        self.observation_space = spaces.Box(low=-state_bound, high=state_bound, shape=state_bound.shape)

        # containers for plotting
        self.metadata = {'render.modes': ["human"]}
        self.fig = None

        self._steps = 0
        self.seed()

    @property
    def dt(self) -> Union[float, int]:
        return self._dt

    @property
    def start_pos(self):
        return self._start_pos

    def step(self, action: np.ndarray):
        """
        A single step with action in torque space
        """

        # action = self._add_action_noise(action)
        ac = np.clip(action, -self.max_torque, self.max_torque)

        self._angle_velocity = self._angle_velocity + self.dt * ac
        self._joint_angles = self._joint_angles + self.dt * self._angle_velocity
        self._update_joints()

        reward, info = self._get_reward(action)

        self._steps += 1
        done = False

        return self._get_obs().copy(), reward, done, info

    def reset(self):

        # TODO: maybe do initialisation more random?
        # Sample only orientation of first link, i.e. the arm is always straight.
        if self.random_start:
            self._joint_angles = np.hstack([[self.np_random.uniform(-np.pi, np.pi)], np.zeros(self.n_links - 1)])
            self._start_pos = self._joint_angles.copy()
        else:
            self._joint_angles = self._start_pos

        self._generate_goal()

        self._angle_velocity = self._start_vel
        self._joints = np.zeros((self.n_links + 1, 2))
        self._update_joints()
        self._steps = 0

        return self._get_obs().copy()

    def _update_joints(self):
        """
        update joints to get new end-effector position. The other links are only required for rendering.
        Returns:

        """
        angles = np.cumsum(self._joint_angles)
        x = self.link_lengths * np.vstack([np.cos(angles), np.sin(angles)])
        self._joints[1:] = self._joints[0] + np.cumsum(x.T, axis=0)

    def _get_reward(self, action: np.ndarray):
        diff = self.end_effector - self._goal
        reward_dist = 0

        if self._steps >= self.steps_before_reward:
            reward_dist -= np.linalg.norm(diff)
            # reward_dist = np.exp(-0.1 * diff ** 2).mean()
            # reward_dist = - (diff ** 2).mean()

        reward_ctrl = (action ** 2).sum()
        reward = reward_dist - reward_ctrl
        return reward, dict(reward_dist=reward_dist, reward_ctrl=reward_ctrl)

    def _get_obs(self):
        theta = self._joint_angles
        return np.hstack([
            np.cos(theta),
            np.sin(theta),
            self._angle_velocity,
            self.end_effector - self._goal,
            self._steps
        ])

    def _generate_goal(self):

        if self.inital_target is None:

            total_length = np.sum(self.link_lengths)
            goal = np.array([total_length, total_length])
            while np.linalg.norm(goal) >= total_length:
                goal = self.np_random.uniform(low=-total_length, high=total_length, size=2)
        else:
            goal = np.copy(self.inital_target)

        self._goal = goal

    def render(self, mode='human'):  # pragma: no cover
        if self.fig is None:
            # Create base figure once on the beginning. Afterwards only update
            plt.ion()
            self.fig = plt.figure()
            ax = self.fig.add_subplot(1, 1, 1)

            # limits
            lim = np.sum(self.link_lengths) + 0.5
            ax.set_xlim([-lim, lim])
            ax.set_ylim([-lim, lim])

            self.line, = ax.plot(self._joints[:, 0], self._joints[:, 1], 'ro-', markerfacecolor='k')
            goal_pos = self._goal.T
            self.goal_point, = ax.plot(goal_pos[0], goal_pos[1], 'gx')
            self.goal_dist, = ax.plot([self.end_effector[0], goal_pos[0]], [self.end_effector[1], goal_pos[1]], 'g--')

            self.fig.show()

        self.fig.gca().set_title(f"Iteration: {self._steps}, distance: {self.end_effector - self._goal}")

        # goal
        goal_pos = self._goal.T
        if self._steps == 1:
            self.goal_point.set_data(goal_pos[0], goal_pos[1])

        # arm
        self.line.set_data(self._joints[:, 0], self._joints[:, 1])

        # distance between end effector and goal
        self.goal_dist.set_data([self.end_effector[0], goal_pos[0]], [self.end_effector[1], goal_pos[1]])

        self.fig.canvas.draw()
        self.fig.canvas.flush_events()

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def close(self):
        del self.fig

    @property
    def end_effector(self):
        return self._joints[self.n_links].T
adjusted classic control environments to new interface 2021-05-12 17:48:57 +02:00			`from typing import Iterable, Union`

updated simple reacher example to new structure 2021-06-24 15:26:20 +02:00			`import gym`
added simple reacher task 2020-08-28 18:31:06 +02:00			`import matplotlib.pyplot as plt`
removed EZPickle from SimpleReacher 2020-09-01 17:57:51 +02:00			`import numpy as np`
			`from gym import spaces`
			`from gym.utils import seeding`
refractoring of DMP environmets to fit gym interface better. 2021-03-26 14:05:16 +01:00
added simple reacher task 2020-08-28 18:31:06 +02:00
updated simple reacher example to new structure 2021-06-24 15:26:20 +02:00			`class SimpleReacherEnv(gym.Env):`
added simple reacher task 2020-08-28 18:31:06 +02:00			`"""`
			`Simple Reaching Task without any physics simulation.`
			`Returns no reward until 150 time steps. This allows the agent to explore the space, but requires precise actions`
			`towards the end of the trajectory.`
			`"""`

adjusted classic control environments to new interface 2021-05-12 17:48:57 +02:00			`def __init__(self, n_links: int, target: Union[None, Iterable] = None, random_start: bool = True):`
added simple reacher task 2020-08-28 18:31:06 +02:00			`super().__init__()`
			`self.link_lengths = np.ones(n_links)`
			`self.n_links = n_links`
updated simple reacher example to new structure 2021-06-24 15:24:54 +02:00			`self._dt = 0.1`
added simple reacher task 2020-08-28 18:31:06 +02:00
start contextual dmp wrapper 2021-05-07 09:51:53 +02:00			`self.random_start = random_start`

updated envs and registering 2021-06-25 16:16:56 +02:00			`# provided initial parameters`
			`self.inital_target = target`

			`# temp container for current env state`
			`self._goal = None`

fixed some issues with SimpleReacher state space 2020-08-31 10:33:11 +02:00			`self._joints = None`
adjusted classic control environments to new interface 2021-05-12 17:48:57 +02:00			`self._joint_angles = None`
added simple reacher task 2020-08-28 18:31:06 +02:00			`self._angle_velocity = None`
adjusted classic control environments to new interface 2021-05-12 17:48:57 +02:00			`self._start_pos = np.zeros(self.n_links)`
			`self._start_vel = np.zeros(self.n_links)`

			`self.max_torque = 1`
refractoring of DMP environmets to fit gym interface better. 2021-03-26 14:05:16 +01:00			`self.steps_before_reward = 199`
added simple reacher task 2020-08-28 18:31:06 +02:00
adjusted classic control environments to new interface 2021-05-12 17:48:57 +02:00			`action_bound = np.ones((self.n_links,)) * self.max_torque`
added simple reacher task 2020-08-28 18:31:06 +02:00			`state_bound = np.hstack([`
SimpleReacher state space changed 2020-08-31 11:26:32 +02:00			`[np.pi] * self.n_links, # cos`
			`[np.pi] * self.n_links, # sin`
			`[np.inf] * self.n_links, # velocity`
fixed some issues with SimpleReacher state space 2020-08-31 10:33:11 +02:00			`[np.inf] * 2, # x-y coordinates of target distance`
SimpleReacher state space changed 2020-08-31 11:26:32 +02:00			`[np.inf] # env steps, because reward start after n steps TODO: Maybe`
added simple reacher task 2020-08-28 18:31:06 +02:00			`])`
			`self.action_space = spaces.Box(low=-action_bound, high=action_bound, shape=action_bound.shape)`
			`self.observation_space = spaces.Box(low=-state_bound, high=state_bound, shape=state_bound.shape)`

adjusted classic control environments to new interface 2021-05-12 17:48:57 +02:00			`# containers for plotting`
added simple reacher task 2020-08-28 18:31:06 +02:00			`self.metadata = {'render.modes': ["human"]}`
adjusted classic control environments to new interface 2021-05-12 17:48:57 +02:00			`self.fig = None`
added simple reacher task 2020-08-28 18:31:06 +02:00
			`self._steps = 0`
			`self.seed()`

updated simple reacher example to new structure 2021-06-24 15:24:54 +02:00			`@property`
			`def dt(self) -> Union[float, int]:`
			`return self._dt`

updated envs and registering 2021-06-25 16:16:56 +02:00			`@property`
			`def start_pos(self):`
			`return self._start_pos`

removed action scaling 2020-09-08 12:43:14 +02:00			`def step(self, action: np.ndarray):`
adjusted classic control environments to new interface 2021-05-12 17:48:57 +02:00			`"""`
			`A single step with action in torque space`
			`"""`
added simple reacher task 2020-08-28 18:31:06 +02:00
removed action scaling 2020-09-08 12:43:14 +02:00			`# action = self._add_action_noise(action)`
adjusted classic control environments to new interface 2021-05-12 17:48:57 +02:00			`ac = np.clip(action, -self.max_torque, self.max_torque)`
added simple reacher task 2020-08-28 18:31:06 +02:00
adjusted classic control environments to new interface 2021-05-12 17:48:57 +02:00			`self._angle_velocity = self._angle_velocity + self.dt * ac`
			`self._joint_angles = self._joint_angles + self.dt * self._angle_velocity`
added simple reacher task 2020-08-28 18:31:06 +02:00			`self._update_joints()`

some changes to reward 2020-08-31 15:51:47 +02:00			`reward, info = self._get_reward(action)`
added simple reacher task 2020-08-28 18:31:06 +02:00
adjusted classic control environments to new interface 2021-05-12 17:48:57 +02:00			`self._steps += 1`
added simple reacher task 2020-08-28 18:31:06 +02:00			`done = False`

some changes to reward 2020-08-31 15:51:47 +02:00			`return self._get_obs().copy(), reward, done, info`
added simple reacher task 2020-08-28 18:31:06 +02:00
adjusted classic control environments to new interface 2021-05-12 17:48:57 +02:00			`def reset(self):`
added simple reacher task 2020-08-28 18:31:06 +02:00
adjusted classic control environments to new interface 2021-05-12 17:48:57 +02:00			`# TODO: maybe do initialisation more random?`
			`# Sample only orientation of first link, i.e. the arm is always straight.`
			`if self.random_start:`
			`self._joint_angles = np.hstack([[self.np_random.uniform(-np.pi, np.pi)], np.zeros(self.n_links - 1)])`
			`self._start_pos = self._joint_angles.copy()`
			`else:`
			`self._joint_angles = self._start_pos`
added simple reacher task 2020-08-28 18:31:06 +02:00
adjusted classic control environments to new interface 2021-05-12 17:48:57 +02:00			`self._generate_goal()`
added simple reacher task 2020-08-28 18:31:06 +02:00
adjusted classic control environments to new interface 2021-05-12 17:48:57 +02:00			`self._angle_velocity = self._start_vel`
			`self._joints = np.zeros((self.n_links + 1, 2))`
			`self._update_joints()`
			`self._steps = 0`

			`return self._get_obs().copy()`
added simple reacher task 2020-08-28 18:31:06 +02:00
			`def _update_joints(self):`
			`"""`
refractoring of DMP environmets to fit gym interface better. 2021-03-26 14:05:16 +01:00			`update joints to get new end-effector position. The other links are only required for rendering.`
added simple reacher task 2020-08-28 18:31:06 +02:00			`Returns:`

			`"""`
adjusted classic control environments to new interface 2021-05-12 17:48:57 +02:00			`angles = np.cumsum(self._joint_angles)`
added simple reacher task 2020-08-28 18:31:06 +02:00			`x = self.link_lengths * np.vstack([np.cos(angles), np.sin(angles)])`
fixed some issues with SimpleReacher state space 2020-08-31 10:33:11 +02:00			`self._joints[1:] = self._joints[0] + np.cumsum(x.T, axis=0)`
added simple reacher task 2020-08-28 18:31:06 +02:00
removed action scaling 2020-09-08 12:43:14 +02:00			`def _get_reward(self, action: np.ndarray):`
wip 2021-05-10 12:17:52 +02:00			`diff = self.end_effector - self._goal`
some changes to reward 2020-08-31 15:51:47 +02:00			`reward_dist = 0`
added simple reacher task 2020-08-28 18:31:06 +02:00
SimpleReacher state space changed 2020-08-31 11:26:32 +02:00			`if self._steps >= self.steps_before_reward:`
refractoring of DMP environmets to fit gym interface better. 2021-03-26 14:05:16 +01:00			`reward_dist -= np.linalg.norm(diff)`
some changes to reward 2020-08-31 15:51:47 +02:00			`# reward_dist = np.exp(-0.1 * diff ** 2).mean()`
			`# reward_dist = - (diff ** 2).mean()`
added simple reacher task 2020-08-28 18:31:06 +02:00
some changes to reward 2020-08-31 15:51:47 +02:00			`reward_ctrl = (action ** 2).sum()`
			`reward = reward_dist - reward_ctrl`
			`return reward, dict(reward_dist=reward_dist, reward_ctrl=reward_ctrl)`
added simple reacher task 2020-08-28 18:31:06 +02:00
adjusted classic control environments to new interface 2021-05-12 17:48:57 +02:00			`def _get_obs(self):`
			`theta = self._joint_angles`
			`return np.hstack([`
			`np.cos(theta),`
			`np.sin(theta),`
			`self._angle_velocity,`
			`self.end_effector - self._goal,`
			`self._steps`
			`])`
added simple reacher task 2020-08-28 18:31:06 +02:00
adjusted classic control environments to new interface 2021-05-12 17:48:57 +02:00			`def _generate_goal(self):`
added simple reacher task 2020-08-28 18:31:06 +02:00
updated envs and registering 2021-06-25 16:16:56 +02:00			`if self.inital_target is None:`
added simple reacher task 2020-08-28 18:31:06 +02:00
adjusted classic control environments to new interface 2021-05-12 17:48:57 +02:00			`total_length = np.sum(self.link_lengths)`
			`goal = np.array([total_length, total_length])`
			`while np.linalg.norm(goal) >= total_length:`
			`goal = self.np_random.uniform(low=-total_length, high=total_length, size=2)`
			`else:`
updated envs and registering 2021-06-25 16:16:56 +02:00			`goal = np.copy(self.inital_target)`
added simple reacher task 2020-08-28 18:31:06 +02:00
adjusted classic control environments to new interface 2021-05-12 17:48:57 +02:00			`self._goal = goal`
added simple reacher task 2020-08-28 18:31:06 +02:00
			`def render(self, mode='human'): # pragma: no cover`
			`if self.fig is None:`
adjusted classic control environments to new interface 2021-05-12 17:48:57 +02:00			`# Create base figure once on the beginning. Afterwards only update`
added simple reacher task 2020-08-28 18:31:06 +02:00			`plt.ion()`
adjusted classic control environments to new interface 2021-05-12 17:48:57 +02:00			`self.fig = plt.figure()`
			`ax = self.fig.add_subplot(1, 1, 1)`

			`# limits`
			`lim = np.sum(self.link_lengths) + 0.5`
			`ax.set_xlim([-lim, lim])`
			`ax.set_ylim([-lim, lim])`
added simple reacher task 2020-08-28 18:31:06 +02:00
adjusted classic control environments to new interface 2021-05-12 17:48:57 +02:00			`self.line, = ax.plot(self._joints[:, 0], self._joints[:, 1], 'ro-', markerfacecolor='k')`
			`goal_pos = self._goal.T`
			`self.goal_point, = ax.plot(goal_pos[0], goal_pos[1], 'gx')`
			`self.goal_dist, = ax.plot([self.end_effector[0], goal_pos[0]], [self.end_effector[1], goal_pos[1]], 'g--')`
added simple reacher task 2020-08-28 18:31:06 +02:00
adjusted classic control environments to new interface 2021-05-12 17:48:57 +02:00			`self.fig.show()`

			`self.fig.gca().set_title(f"Iteration: {self._steps}, distance: {self.end_effector - self._goal}")`
added simple reacher task 2020-08-28 18:31:06 +02:00
			`# goal`
wip 2021-05-10 12:17:52 +02:00			`goal_pos = self._goal.T`
adjusted classic control environments to new interface 2021-05-12 17:48:57 +02:00			`if self._steps == 1:`
			`self.goal_point.set_data(goal_pos[0], goal_pos[1])`

			`# arm`
			`self.line.set_data(self._joints[:, 0], self._joints[:, 1])`

added simple reacher task 2020-08-28 18:31:06 +02:00			`# distance between end effector and goal`
adjusted classic control environments to new interface 2021-05-12 17:48:57 +02:00			`self.goal_dist.set_data([self.end_effector[0], goal_pos[0]], [self.end_effector[1], goal_pos[1]])`
added simple reacher task 2020-08-28 18:31:06 +02:00
adjusted classic control environments to new interface 2021-05-12 17:48:57 +02:00			`self.fig.canvas.draw()`
SimpleReacher state space changed 2020-08-31 11:26:32 +02:00			`self.fig.canvas.flush_events()`
added simple reacher task 2020-08-28 18:31:06 +02:00
updated simple reacher example to new structure 2021-06-24 15:24:54 +02:00			`def seed(self, seed=None):`
			`self.np_random, seed = seeding.np_random(seed)`
			`return [seed]`

			`def close(self):`
			`del self.fig`

			`@property`
			`def end_effector(self):`
			`return self._joints[self.n_links].T`