# fancy_gym/alr_envs/classic_control/simple_reacher.py
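"""Simple reaching task without any physics simulation: a torque-controlled planar reacher."""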

from typing import Iterable, Union

import matplotlib.pyplot as plt
import numpy as np
from gym import spaces
from gym.utils import seeding

from alr_envs.utils.mps.mp_environments import MPEnv


class SimpleReacherEnv(MPEnv):
    """
    Simple Reaching Task without any physics simulation.
    No reward is returned until `steps_before_reward` time steps have passed. This allows the agent to explore the
    space, but requires precise actions towards the end of the trajectory.
    """

    def __init__(self, n_links: int, target: Union[None, Iterable] = None, random_start: bool = True):
        super().__init__()
        self.link_lengths = np.ones(n_links)
        self.n_links = n_links
        self.dt = 0.1

        self.random_start = random_start

        self._joints = None
        self._joint_angles = None
        self._angle_velocity = None
        self._start_pos = np.zeros(self.n_links)
        self._start_vel = np.zeros(self.n_links)

        self._target = target  # provided target value
        self._goal = None  # updated goal value, does not change when target != None

        self.max_torque = 1
        self.steps_before_reward = 199

        action_bound = np.ones((self.n_links,)) * self.max_torque
        state_bound = np.hstack([
            [np.pi] * self.n_links,  # cos
            [np.pi] * self.n_links,  # sin
            [np.inf] * self.n_links,  # velocity
            [np.inf] * 2,  # x-y coordinates of target distance
            [np.inf]  # env steps, because the reward starts only after n steps  TODO: Maybe
        ])
        self.action_space = spaces.Box(low=-action_bound, high=action_bound, shape=action_bound.shape)
        self.observation_space = spaces.Box(low=-state_bound, high=state_bound, shape=state_bound.shape)

        # containers for plotting
        self.metadata = {'render.modes': ["human"]}
        self.fig = None

        self._steps = 0
        self.seed()

    def step(self, action: np.ndarray):
        """
        A single step with an action in torque space.
        """
        # action = self._add_action_noise(action)
        ac = np.clip(action, -self.max_torque, self.max_torque)

        # Semi-implicit Euler integration: update the velocity first, then use the new velocity for the position.
        self._angle_velocity = self._angle_velocity + self.dt * ac
        self._joint_angles = self._joint_angles + self.dt * self._angle_velocity
        self._update_joints()

        reward, info = self._get_reward(action)

        self._steps += 1
        done = False

        return self._get_obs().copy(), reward, done, info

    def reset(self):
        # TODO: maybe do initialisation more random?
        # Sample only the orientation of the first link, i.e. the arm is always straight.
        if self.random_start:
            self._joint_angles = np.hstack([[self.np_random.uniform(-np.pi, np.pi)], np.zeros(self.n_links - 1)])
            self._start_pos = self._joint_angles.copy()
        else:
            self._joint_angles = self._start_pos

        self._generate_goal()

        self._angle_velocity = self._start_vel
        self._joints = np.zeros((self.n_links + 1, 2))
        self._update_joints()
        self._steps = 0

        return self._get_obs().copy()

    def _update_joints(self):
        """
        Update the joint positions to compute the new end-effector position. The other links are only required for
        rendering.
        """
        angles = np.cumsum(self._joint_angles)
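        # Forward kinematics of the planar chain: `angles` holds each link's absolute orientation (cumulative sum of
        # the joint angles), and each joint position is the running sum of the link vectors below. For two unit links
        # with angles [a, b], joint 1 sits at [cos(a), sin(a)] and the end effector adds [cos(a + b), sin(a + b)].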
        x = self.link_lengths * np.vstack([np.cos(angles), np.sin(angles)])
        self._joints[1:] = self._joints[0] + np.cumsum(x.T, axis=0)

    def _get_reward(self, action: np.ndarray):
        diff = self.end_effector - self._goal
        reward_dist = 0
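
        # The distance to the goal is only penalised during the final steps (after `steps_before_reward`), while the
        # quadratic action cost below is applied at every step.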
        if self._steps >= self.steps_before_reward:
            reward_dist -= np.linalg.norm(diff)
            # reward_dist = np.exp(-0.1 * diff ** 2).mean()
            # reward_dist = - (diff ** 2).mean()

        reward_ctrl = (action ** 2).sum()
        reward = reward_dist - reward_ctrl
        return reward, dict(reward_dist=reward_dist, reward_ctrl=reward_ctrl)

    def _get_obs(self):
        theta = self._joint_angles
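        # Observation layout: cos/sin of the joint angles, the joint velocities, the planar offset between end
        # effector and goal, and the current step count.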
        return np.hstack([
            np.cos(theta),
            np.sin(theta),
            self._angle_velocity,
            self.end_effector - self._goal,
            self._steps
        ])

    def _generate_goal(self):
        if self._target is None:
            # center = self._joints[0]
            # # Sample uniformly in circle with radius R around center of reacher.
            # R = np.sum(self.link_lengths)
            # r = R * np.sqrt(self.np_random.uniform())
            # theta = self.np_random.uniform() * 2 * np.pi
            # goal = center + r * np.stack([np.cos(theta), np.sin(theta)])
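
            # Rejection sampling: draw from the bounding square until the goal lies strictly inside the reachable
            # disc of radius sum(link_lengths). The initial dummy goal lies outside, so the loop runs at least once.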
            total_length = np.sum(self.link_lengths)
            goal = np.array([total_length, total_length])
            while np.linalg.norm(goal) >= total_length:
                goal = self.np_random.uniform(low=-total_length, high=total_length, size=2)
        else:
            goal = np.copy(self._target)

        self._goal = goal

    def render(self, mode='human'):  # pragma: no cover
        if self.fig is None:
            # Create the base figure once at the beginning. Afterwards only update the artists.
            plt.ion()
            self.fig = plt.figure()
            ax = self.fig.add_subplot(1, 1, 1)

            # limits
            lim = np.sum(self.link_lengths) + 0.5
            ax.set_xlim([-lim, lim])
            ax.set_ylim([-lim, lim])

            self.line, = ax.plot(self._joints[:, 0], self._joints[:, 1], 'ro-', markerfacecolor='k')
            goal_pos = self._goal.T
            self.goal_point, = ax.plot(goal_pos[0], goal_pos[1], 'gx')
            self.goal_dist, = ax.plot([self.end_effector[0], goal_pos[0]], [self.end_effector[1], goal_pos[1]], 'g--')

            self.fig.show()

        self.fig.gca().set_title(f"Iteration: {self._steps}, distance: {self.end_effector - self._goal}")

        # goal
        goal_pos = self._goal.T
        if self._steps == 1:
            self.goal_point.set_data(goal_pos[0], goal_pos[1])

        # arm
        self.line.set_data(self._joints[:, 0], self._joints[:, 1])

        # distance between end effector and goal
        self.goal_dist.set_data([self.end_effector[0], goal_pos[0]], [self.end_effector[1], goal_pos[1]])

        self.fig.canvas.draw()
        self.fig.canvas.flush_events()
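
    # Boolean mask over the observation vector marking which entries are treated as active; the demo at the bottom
    # of this file indexes the observation with it, i.e. `obs[env.active_obs]`.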
    @property
    def active_obs(self):
        return np.hstack([
            [self.random_start] * self.n_links,  # cos
            [self.random_start] * self.n_links,  # sin
            [self.random_start] * self.n_links,  # velocity
            [True] * 2,  # x-y coordinates of target distance
            [False]  # env steps
        ])

    @property
    def start_pos(self):
        return self._start_pos

    @property
    def goal_pos(self):
        raise ValueError("Goal position is not available and has to be learnt based on the environment.")

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def close(self):
        if self.fig is not None:
            plt.close(self.fig)
        self.fig = None

    @property
    def end_effector(self):
        return self._joints[self.n_links].T
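

# Demo / smoke test: roll out random torques and render the arm when this file is executed directly.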
if __name__ == '__main__':
    nl = 5
    render_mode = "human"  # "human" or "partial" or "final"
    env = SimpleReacherEnv(n_links=nl)
    obs = env.reset()
    print("First", obs)

    for i in range(2000):
        # objective.load_result("/tmp/cma")
        # test with random actions
        ac = 2 * env.action_space.sample()
        # ac = np.ones(env.action_space.shape)
        obs, rew, d, info = env.step(ac)
        env.render(mode=render_mode)

        print(obs[env.active_obs].shape)

        if d or i % 200 == 0:
            env.reset()

    env.close()