# fancy_gym/alr_envs/classic_control/simple_reacher/simple_reacher.py


from typing import Iterable, Union

import gym
import matplotlib.pyplot as plt
import numpy as np
from gym import spaces
from gym.utils import seeding


class SimpleReacherEnv(gym.Env):
    """
    Simple reaching task without any physics simulation.
    Returns no distance reward until `steps_before_reward` time steps have passed. This allows the agent to
    explore the space, but requires precise actions towards the end of the trajectory.
    """

    def __init__(self, n_links: int, target: Union[None, Iterable] = None, random_start: bool = True):
        super().__init__()

        self.link_lengths = np.ones(n_links)
        self.n_links = n_links
        self._dt = 0.1

        self.random_start = random_start

        # provided initial parameters
        self.inital_target = target

        # temp containers for the current env state
        self._goal = None
        self._joints = None
        self._joint_angles = None
        self._angle_velocity = None
        self._start_pos = np.zeros(self.n_links)
        self._start_vel = np.zeros(self.n_links)

        self.max_torque = 1
        self.steps_before_reward = 199

        action_bound = np.ones((self.n_links,)) * self.max_torque
        state_bound = np.hstack([
            [np.pi] * self.n_links,  # cos
            [np.pi] * self.n_links,  # sin
            [np.inf] * self.n_links,  # velocity
            [np.inf] * 2,  # x-y coordinates of target distance
            [np.inf]  # env steps, because the reward only starts after n steps
        ])

        self.action_space = spaces.Box(low=-action_bound, high=action_bound, shape=action_bound.shape)
        self.observation_space = spaces.Box(low=-state_bound, high=state_bound, shape=state_bound.shape)

        # containers for plotting
        self.metadata = {'render.modes': ["human"]}
        self.fig = None

        self._steps = 0
        self.seed()

    @property
    def dt(self) -> Union[float, int]:
        return self._dt

    # @property
    # def start_pos(self):
    #     return self._start_pos

    @property
    def current_pos(self):
        return self._joint_angles

    @property
    def current_vel(self):
        return self._angle_velocity

    def step(self, action: np.ndarray):
        """
        A single step with an action in torque space.
        """
        # action = self._add_action_noise(action)
        ac = np.clip(action, -self.max_torque, self.max_torque)

        # semi-implicit Euler integration: torques act as accelerations on the joint velocities
        self._angle_velocity = self._angle_velocity + self.dt * ac
        self._joint_angles = self._joint_angles + self.dt * self._angle_velocity
        self._update_joints()

        reward, info = self._get_reward(action)

        self._steps += 1
        done = False

        return self._get_obs().copy(), reward, done, info

    def reset(self):
        # TODO: maybe do initialisation more random?
        # Sample only the orientation of the first link, i.e. the arm is always straight.
        if self.random_start:
            self._joint_angles = np.hstack([[self.np_random.uniform(-np.pi, np.pi)], np.zeros(self.n_links - 1)])
            self._start_pos = self._joint_angles.copy()
        else:
            self._joint_angles = self._start_pos

        self._generate_goal()

        self._angle_velocity = self._start_vel
        self._joints = np.zeros((self.n_links + 1, 2))
        self._update_joints()
        self._steps = 0

        return self._get_obs().copy()

    def _update_joints(self):
        """
        Update the joint positions via forward kinematics to obtain the new end-effector position.
        The intermediate links are only required for rendering.
        """
        angles = np.cumsum(self._joint_angles)
        x = self.link_lengths * np.vstack([np.cos(angles), np.sin(angles)])
        self._joints[1:] = self._joints[0] + np.cumsum(x.T, axis=0)

    def _get_reward(self, action: np.ndarray):
        diff = self.end_effector - self._goal
        reward_dist = 0

        # the distance penalty is only applied during the final steps of the episode
        if self._steps >= self.steps_before_reward:
            reward_dist -= np.linalg.norm(diff)
            # reward_dist = np.exp(-0.1 * diff ** 2).mean()
            # reward_dist = - (diff ** 2).mean()

        reward_ctrl = (action ** 2).sum()
        reward = reward_dist - reward_ctrl
        return reward, dict(reward_dist=reward_dist, reward_ctrl=reward_ctrl)

    def _get_obs(self):
        theta = self._joint_angles
        return np.hstack([
            np.cos(theta),
            np.sin(theta),
            self._angle_velocity,
            self.end_effector - self._goal,
            self._steps
        ])

    def _generate_goal(self):
        if self.inital_target is None:
            # rejection sampling: draw goals until one lies within reach of the arm
            total_length = np.sum(self.link_lengths)
            goal = np.array([total_length, total_length])
            while np.linalg.norm(goal) >= total_length:
                goal = self.np_random.uniform(low=-total_length, high=total_length, size=2)
        else:
            goal = np.copy(self.inital_target)

        self._goal = goal

    def render(self, mode='human'):  # pragma: no cover
        if self.fig is None:
            # create the base figure once at the beginning, afterwards only update it
            plt.ion()
            self.fig = plt.figure()
            ax = self.fig.add_subplot(1, 1, 1)

            # limits
            lim = np.sum(self.link_lengths) + 0.5
            ax.set_xlim([-lim, lim])
            ax.set_ylim([-lim, lim])

            self.line, = ax.plot(self._joints[:, 0], self._joints[:, 1], 'ro-', markerfacecolor='k')
            goal_pos = self._goal.T
            self.goal_point, = ax.plot(goal_pos[0], goal_pos[1], 'gx')
            self.goal_dist, = ax.plot([self.end_effector[0], goal_pos[0]], [self.end_effector[1], goal_pos[1]], 'g--')

            self.fig.show()

        self.fig.gca().set_title(f"Iteration: {self._steps}, distance: {self.end_effector - self._goal}")

        # goal
        goal_pos = self._goal.T
        if self._steps == 1:
            self.goal_point.set_data(goal_pos[0], goal_pos[1])

        # arm
        self.line.set_data(self._joints[:, 0], self._joints[:, 1])

        # distance between end effector and goal
        self.goal_dist.set_data([self.end_effector[0], goal_pos[0]], [self.end_effector[1], goal_pos[1]])

        self.fig.canvas.draw()
        self.fig.canvas.flush_events()

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def close(self):
        # properly close the render window instead of only dropping the reference
        if self.fig is not None:
            plt.close(self.fig)
            self.fig = None

    @property
    def end_effector(self):
        return self._joints[self.n_links].T
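

# Minimal usage sketch (not part of the original module): roll out one episode with random
# torques. The 200-step horizon is an assumption chosen to match steps_before_reward = 199,
# so the distance penalty only applies on the final step; adjust as needed.
if __name__ == "__main__":
    env = SimpleReacherEnv(n_links=2)
    obs = env.reset()
    for _ in range(200):
        action = env.action_space.sample()  # random torque within the action bounds
        obs, reward, done, info = env.step(action)
        env.render()
        if done:
            break
    env.close()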