Merge pull request #6 from ALRhub/contextual_dmp_wrapper

Contextual dmp wrapper + environments
ottofabian 2021-05-18 10:54:19 +02:00 committed by GitHub
commit e0e4d6d41c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
23 changed files with 723 additions and 1097 deletions

View File

@@ -1,7 +1,8 @@
 from gym.envs.registration import register
 from alr_envs.stochastic_search.functions.f_rosenbrock import Rosenbrock
-# from alr_envs.utils.wrapper.dmp_wrapper import DmpWrapper
+# from alr_envs.utils.mps.dmp_wrapper import DmpWrapper
 # Mujoco
@@ -71,6 +72,17 @@ register(
     }
 )
+## Balancing Reacher
+register(
+    id='Balancing-v0',
+    entry_point='alr_envs.mujoco:BalancingEnv',
+    max_episode_steps=200,
+    kwargs={
+        "n_links": 5,
+    }
+)
 register(
     id='ALRBallInACupSimple-v0',
     entry_point='alr_envs.mujoco:ALRBallInACupEnv',
@@ -101,15 +113,7 @@ register(
 # Classic control
-register(
-    id='Balancing-v0',
-    entry_point='alr_envs.mujoco:BalancingEnv',
-    max_episode_steps=200,
-    kwargs={
-        "n_links": 5,
-    }
-)
+## Simple Reacher
 register(
     id='SimpleReacher-v0',
     entry_point='alr_envs.classic_control:SimpleReacherEnv',
@@ -119,6 +123,16 @@ register(
     }
 )
+register(
+    id='SimpleReacher-v1',
+    entry_point='alr_envs.classic_control:SimpleReacherEnv',
+    max_episode_steps=200,
+    kwargs={
+        "n_links": 2,
+        "random_start": False
+    }
+)
 register(
     id='LongSimpleReacher-v0',
     entry_point='alr_envs.classic_control:SimpleReacherEnv',
@@ -128,6 +142,18 @@ register(
     }
 )
+register(
+    id='LongSimpleReacher-v1',
+    entry_point='alr_envs.classic_control:SimpleReacherEnv',
+    max_episode_steps=200,
+    kwargs={
+        "n_links": 5,
+        "random_start": False
+    }
+)
+## Viapoint Reacher
 register(
     id='ViaPointReacher-v0',
     entry_point='alr_envs.classic_control.viapoint_reacher:ViaPointReacher',
@@ -139,14 +165,47 @@ register(
     }
 )
+## Hole Reacher
 register(
     id='HoleReacher-v0',
-    entry_point='alr_envs.classic_control.hole_reacher:HoleReacher',
+    entry_point='alr_envs.classic_control.hole_reacher:HoleReacherEnv',
     max_episode_steps=200,
     kwargs={
         "n_links": 5,
         "allow_self_collision": False,
         "allow_wall_collision": False,
+        "hole_width": None,
+        "hole_depth": 1,
+        "hole_x": None,
+        "collision_penalty": 100,
+    }
+)
+register(
+    id='HoleReacher-v1',
+    entry_point='alr_envs.classic_control.hole_reacher:HoleReacherEnv',
+    max_episode_steps=200,
+    kwargs={
+        "n_links": 5,
+        "random_start": False,
+        "allow_self_collision": False,
+        "allow_wall_collision": False,
+        "hole_width": None,
+        "hole_depth": 1,
+        "hole_x": None,
+        "collision_penalty": 100,
+    }
+)
+register(
+    id='HoleReacher-v2',
+    entry_point='alr_envs.classic_control.hole_reacher:HoleReacherEnv',
+    max_episode_steps=200,
+    kwargs={
+        "n_links": 5,
+        "random_start": False,
+        "allow_self_collision": False,
+        "allow_wall_collision": False,
         "hole_width": 0.25,
         "hole_depth": 1,
         "hole_x": 2,
@@ -155,6 +214,25 @@ register(
 )
 # MP environments
+## Simple Reacher
+versions = ["SimpleReacher-v0", "SimpleReacher-v1", "LongSimpleReacher-v0", "LongSimpleReacher-v1"]
+for v in versions:
+    name = v.split("-")
+    register(
+        id=f'{name[0]}DMP-{name[1]}',
+        entry_point='alr_envs.utils.make_env_helpers:make_dmp_env',
+        # max_episode_steps=1,
+        kwargs={
+            "name": f"alr_envs:{v}",
+            "num_dof": 2 if "long" not in v.lower() else 5,
+            "num_basis": 5,
+            "duration": 2,
+            "alpha_phase": 2,
+            "learn_goal": True,
+            "policy_type": "velocity",
+            "weights_scale": 50,
+        }
+    )
 register(
     id='ViaPointReacherDMP-v0',
@@ -172,12 +250,15 @@ register(
     }
 )
+## Hole Reacher
+versions = ["v0", "v1", "v2"]
+for v in versions:
     register(
-        id='HoleReacherDMP-v0',
+        id=f'HoleReacherDMP-{v}',
         entry_point='alr_envs.utils.make_env_helpers:make_dmp_env',
         # max_episode_steps=1,
         kwargs={
-            "name": "alr_envs:HoleReacher-v0",
+            "name": f"alr_envs:HoleReacher-{v}",
             "num_dof": 5,
             "num_basis": 5,
             "duration": 2,
@@ -190,6 +271,13 @@ register(
         }
     )
+# register(
+#     id='HoleReacherDetPMP-v0',
+#     entry_point='alr_envs.classic_control.hole_reacher:holereacher_detpmp',
+#     # max_episode_steps=1,
+#     # TODO: add mp kwargs
+# )
 # TODO: properly add final_pos
 register(
     id='HoleReacherFixedGoalDMP-v0',
@@ -208,12 +296,7 @@ register(
     }
 )
-# register(
-#     id='HoleReacherDetPMP-v0',
-#     entry_point='alr_envs.classic_control.hole_reacher:holereacher_detpmp',
-#     # max_episode_steps=1,
-#     # TODO: add mp kwargs
-# )
+## Ball in Cup
 register(
     id='ALRBallInACupSimpleDMP-v0',
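For reference, a minimal usage sketch for the environments registered above (assuming the alr_envs package is installed so that importing it executes these register() calls; the id and the 200-step limit are taken from the HoleReacher-v1 registration above):

import gym
import alr_envs  # noqa: F401  # importing the package runs the register() calls above

# step-based variant with a randomly sampled hole and a fixed start posture (random_start=False)
env = gym.make("HoleReacher-v1")
obs = env.reset()
for _ in range(200):
    obs, reward, done, info = env.step(env.action_space.sample())
    if done:
        break
env.close()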

View File

@@ -1,3 +1,3 @@
 from alr_envs.classic_control.simple_reacher import SimpleReacherEnv
 from alr_envs.classic_control.viapoint_reacher import ViaPointReacher
-from alr_envs.classic_control.hole_reacher import HoleReacher
+from alr_envs.classic_control.hole_reacher import HoleReacherEnv

View File

@@ -1,27 +1,36 @@
+from typing import Union
 import gym
-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np
+from gym.utils import seeding
 from matplotlib import patches
 from alr_envs.classic_control.utils import check_self_collision
+from alr_envs.utils.mps.mp_environments import MPEnv
-class HoleReacher(gym.Env):
+class HoleReacherEnv(MPEnv):
-    def __init__(self, n_links, hole_x, hole_width, hole_depth, allow_self_collision=False,
-                 allow_wall_collision=False, collision_penalty=1000):
+    def __init__(self, n_links: int, hole_x: Union[None, float] = None, hole_depth: Union[None, float] = None,
+                 hole_width: float = 1., random_start: bool = False, allow_self_collision: bool = False,
+                 allow_wall_collision: bool = False, collision_penalty: float = 1000):
         self.n_links = n_links
         self.link_lengths = np.ones((n_links, 1))
-        # task
-        self.hole_x = hole_x  # x-position of center of hole
-        self.hole_width = hole_width  # width of hole
-        self.hole_depth = hole_depth  # depth of hole
-        self.bottom_center_of_hole = np.hstack([hole_x, -hole_depth])
-        self.top_center_of_hole = np.hstack([hole_x, 0])
-        self.left_wall_edge = np.hstack([hole_x - self.hole_width / 2, 0])
-        self.right_wall_edge = np.hstack([hole_x + self.hole_width / 2, 0])
+        self.random_start = random_start
+        # provided initial parameters
+        self._hole_x = hole_x  # x-position of center of hole
+        self._hole_width = hole_width  # width of hole
+        self._hole_depth = hole_depth  # depth of hole
+        # temp container for current env state
+        self._tmp_hole_x = None
+        self._tmp_hole_width = None
+        self._tmp_hole_depth = None
+        self._goal = None  # x-y coordinates for reaching the center at the bottom of the hole
         # collision
         self.allow_self_collision = allow_self_collision
@@ -32,91 +41,77 @@ class HoleReacher(gym.Env):
         self._joints = None
         self._joint_angles = None
         self._angle_velocity = None
-        self.start_pos = np.hstack([[np.pi / 2], np.zeros(self.n_links - 1)])
-        self.start_vel = np.zeros(self.n_links)
+        self._start_pos = np.hstack([[np.pi / 2], np.zeros(self.n_links - 1)])
+        self._start_vel = np.zeros(self.n_links)
         self.dt = 0.01
-        # self.time_limit = 2
         action_bound = np.pi * np.ones((self.n_links,))
         state_bound = np.hstack([
             [np.pi] * self.n_links,  # cos
             [np.pi] * self.n_links,  # sin
             [np.inf] * self.n_links,  # velocity
+            [np.inf],  # hole width
+            [np.inf],  # hole depth
             [np.inf] * 2,  # x-y coordinates of target distance
             [np.inf]  # env steps, because reward start after n steps TODO: Maybe
         ])
         self.action_space = gym.spaces.Box(low=-action_bound, high=action_bound, shape=action_bound.shape)
         self.observation_space = gym.spaces.Box(low=-state_bound, high=state_bound, shape=state_bound.shape)
+        # containers for plotting
+        self.metadata = {'render.modes': ["human", "partial"]}
         self.fig = None
-        rect_1 = patches.Rectangle((-self.n_links, -1),
-                                   self.n_links + self.hole_x - self.hole_width / 2, 1,
-                                   fill=True, edgecolor='k', facecolor='k')
-        rect_2 = patches.Rectangle((self.hole_x + self.hole_width / 2, -1),
-                                   self.n_links - self.hole_x + self.hole_width / 2, 1,
-                                   fill=True, edgecolor='k', facecolor='k')
-        rect_3 = patches.Rectangle((self.hole_x - self.hole_width / 2, -1), self.hole_width,
-                                   1 - self.hole_depth,
-                                   fill=True, edgecolor='k', facecolor='k')
-        self.patches = [rect_1, rect_2, rect_3]
+        self._steps = 0
+        self.seed()
-    @property
-    def end_effector(self):
-        return self._joints[self.n_links].T
-    def configure(self, context):
-        pass
-    def reset(self):
-        self._joint_angles = self.start_pos
-        self._angle_velocity = self.start_vel
-        self._joints = np.zeros((self.n_links + 1, 2))
-        self._update_joints()
-        self._steps = 0
-        return self._get_obs().copy()
-    def step(self, action: np.ndarray):
-        """
-        a single step with an action in joint velocity space
-        """
-        vel = action  # + 0.01 * np.random.randn(self.num_links)
-        acc = (vel - self._angle_velocity) / self.dt
-        self._angle_velocity = vel
-        self._joint_angles = self._joint_angles + self.dt * self._angle_velocity
-        self._update_joints()
-        # rew = self._reward()
-        # compute reward directly in step function
-        success = False
-        reward = 0
-        if not self._is_collided:
-            if self._steps == 199:
-                dist = np.linalg.norm(self.end_effector - self.bottom_center_of_hole)
-                reward = - dist ** 2
-                success = dist < 0.005
-        else:
-            dist = np.linalg.norm(self.end_effector - self.bottom_center_of_hole)
-            # if self.collision_penalty != 0:
-            #     reward = -self.collision_penalty
-            # else:
-            reward = - dist ** 2 - self.collision_penalty
-        reward -= 5e-8 * np.sum(acc ** 2)
-        info = {"is_collided": self._is_collided, "is_success": success}
-        self._steps += 1
-        # done = self._steps * self.dt > self.time_limit or self._is_collided
-        done = self._is_collided
-        return self._get_obs().copy(), reward, done, info
+    def step(self, action: np.ndarray):
+        """
+        A single step with an action in joint velocity space
+        """
+        self._angle_velocity = action
+        self._joint_angles = self._joint_angles + self.dt * self._angle_velocity
+        self._update_joints()
+        acc = (action - self._angle_velocity) / self.dt
+        reward, info = self._get_reward(acc)
+        info.update({"is_collided": self._is_collided})
+        self._steps += 1
+        done = self._is_collided
+        return self._get_obs().copy(), reward, done, info
+    def reset(self):
+        if self.random_start:
+            # Maybe change more than first seed
+            first_joint = self.np_random.uniform(np.pi / 4, 3 * np.pi / 4)
+            self._joint_angles = np.hstack([[first_joint], np.zeros(self.n_links - 1)])
+            self._start_pos = self._joint_angles.copy()
+        else:
+            self._joint_angles = self._start_pos
+        self._generate_hole()
+        self._set_patches()
+        self._angle_velocity = self._start_vel
+        self._joints = np.zeros((self.n_links + 1, 2))
+        self._update_joints()
+        self._steps = 0
+        return self._get_obs().copy()
+    def _generate_hole(self):
+        self._tmp_hole_x = self.np_random.uniform(0.5, 3.5, 1) if self._hole_x is None else np.copy(self._hole_x)
+        self._tmp_hole_width = self.np_random.uniform(0.5, 0.1, 1) if self._hole_width is None else np.copy(
+            self._hole_width)
+        # TODO we do not want this right now.
+        self._tmp_hole_depth = self.np_random.uniform(1, 1, 1) if self._hole_depth is None else np.copy(
+            self._hole_depth)
+        self._goal = np.hstack([self._tmp_hole_x, -self._tmp_hole_depth])
@@ -124,7 +119,7 @@ class HoleReacher(gym.Env):
         Returns:
         """
-        line_points_in_taskspace = self.get_forward_kinematics(num_points_per_link=20)
+        line_points_in_taskspace = self._get_forward_kinematics(num_points_per_link=20)
         self._joints[1:, 0] = self._joints[0, 0] + line_points_in_taskspace[:, -1, 0]
         self._joints[1:, 1] = self._joints[0, 1] + line_points_in_taskspace[:, -1, 1]
@@ -138,48 +133,65 @@ class HoleReacher(gym.Env):
                 self_collision = True
         if not self.allow_wall_collision:
-            wall_collision = self.check_wall_collision(line_points_in_taskspace)
+            wall_collision = self._check_wall_collision(line_points_in_taskspace)
         self._is_collided = self_collision or wall_collision
+    def _get_reward(self, acc: np.ndarray):
+        success = False
+        reward = -np.inf
+        if not self._is_collided:
+            dist = 0
+            # return reward only in last time step
+            if self._steps == 199:
+                dist = np.linalg.norm(self.end_effector - self._goal)
+                success = dist < 0.005
+        else:
+            # Episode terminates when colliding, hence return reward
+            dist = np.linalg.norm(self.end_effector - self._goal)
+            reward = -self.collision_penalty
+        reward -= dist ** 2
+        reward -= 5e-8 * np.sum(acc ** 2)
+        info = {"is_success": success}
+        return reward, info
     def _get_obs(self):
         theta = self._joint_angles
         return np.hstack([
             np.cos(theta),
             np.sin(theta),
             self._angle_velocity,
-            self.end_effector - self.bottom_center_of_hole,
+            self._tmp_hole_width,
+            self._tmp_hole_depth,
+            self.end_effector - self._goal,
             self._steps
         ])
-    def get_forward_kinematics(self, num_points_per_link=1):
+    def _get_forward_kinematics(self, num_points_per_link=1):
         theta = self._joint_angles[:, None]
-        if num_points_per_link > 1:
-            intermediate_points = np.linspace(0, 1, num_points_per_link)
-        else:
-            intermediate_points = 1
+        intermediate_points = np.linspace(0, 1, num_points_per_link) if num_points_per_link > 1 else 1
         accumulated_theta = np.cumsum(theta, axis=0)
-        endeffector = np.zeros(shape=(self.n_links, num_points_per_link, 2))
+        end_effector = np.zeros(shape=(self.n_links, num_points_per_link, 2))
         x = np.cos(accumulated_theta) * self.link_lengths * intermediate_points
         y = np.sin(accumulated_theta) * self.link_lengths * intermediate_points
-        endeffector[0, :, 0] = x[0, :]
-        endeffector[0, :, 1] = y[0, :]
+        end_effector[0, :, 0] = x[0, :]
+        end_effector[0, :, 1] = y[0, :]
         for i in range(1, self.n_links):
-            endeffector[i, :, 0] = x[i, :] + endeffector[i - 1, -1, 0]
-            endeffector[i, :, 1] = y[i, :] + endeffector[i - 1, -1, 1]
-        return np.squeeze(endeffector + self._joints[0, :])
+            end_effector[i, :, 0] = x[i, :] + end_effector[i - 1, -1, 0]
+            end_effector[i, :, 1] = y[i, :] + end_effector[i - 1, -1, 1]
+        return np.squeeze(end_effector + self._joints[0, :])
-    def check_wall_collision(self, line_points):
+    def _check_wall_collision(self, line_points):
         # all points that are before the hole in x
-        r, c = np.where(line_points[:, :, 0] < (self.hole_x - self.hole_width / 2))
+        r, c = np.where(line_points[:, :, 0] < (self._tmp_hole_x - self._tmp_hole_width / 2))
         # check if any of those points are below surface
         nr_line_points_below_surface_before_hole = np.sum(line_points[r, c, 1] < 0)
@@ -188,7 +200,7 @@ class HoleReacher(gym.Env):
             return True
         # all points that are after the hole in x
-        r, c = np.where(line_points[:, :, 0] > (self.hole_x + self.hole_width / 2))
+        r, c = np.where(line_points[:, :, 0] > (self._tmp_hole_x + self._tmp_hole_width / 2))
         # check if any of those points are below surface
         nr_line_points_below_surface_after_hole = np.sum(line_points[r, c, 1] < 0)
@@ -197,11 +209,11 @@ class HoleReacher(gym.Env):
             return True
         # all points that are above the hole
-        r, c = np.where((line_points[:, :, 0] > (self.hole_x - self.hole_width / 2)) & (
-                line_points[:, :, 0] < (self.hole_x + self.hole_width / 2)))
+        r, c = np.where((line_points[:, :, 0] > (self._tmp_hole_x - self._tmp_hole_width / 2)) & (
+                line_points[:, :, 0] < (self._tmp_hole_x + self._tmp_hole_width / 2)))
         # check if any of those points are below surface
-        nr_line_points_below_surface_in_hole = np.sum(line_points[r, c, 1] < -self.hole_depth)
+        nr_line_points_below_surface_in_hole = np.sum(line_points[r, c, 1] < -self._tmp_hole_depth)
         if nr_line_points_below_surface_in_hole > 0:
             return True
@@ -210,61 +222,85 @@ class HoleReacher(gym.Env):
     def render(self, mode='human'):
         if self.fig is None:
+            # Create base figure once on the beginning. Afterwards only update
+            plt.ion()
             self.fig = plt.figure()
-            # plt.ion()
-            # plt.pause(0.01)
-        else:
-            plt.figure(self.fig.number)
+            ax = self.fig.add_subplot(1, 1, 1)
+            # limits
+            lim = np.sum(self.link_lengths) + 0.5
+            ax.set_xlim([-lim, lim])
+            ax.set_ylim([-1.1, lim])
+            self.line, = ax.plot(self._joints[:, 0], self._joints[:, 1], 'ro-', markerfacecolor='k')
+            self._set_patches()
+            self.fig.show()
+        self.fig.gca().set_title(
+            f"Iteration: {self._steps}, distance: {self.end_effector - self._goal}")
         if mode == "human":
-            plt.cla()
-            plt.title(f"Iteration: {self._steps}, distance: {self.end_effector - self.bottom_center_of_hole}")
-            # Arm
-            plt.plot(self._joints[:, 0], self._joints[:, 1], 'ro-', markerfacecolor='k')
-            # Add the patch to the Axes
-            [plt.gca().add_patch(rect) for rect in self.patches]
-            lim = np.sum(self.link_lengths) + 0.5
-            plt.xlim([-lim, lim])
-            plt.ylim([-1.1, lim])
-            # plt.draw()
-            plt.pause(1e-4)  # pushes window to foreground, which is annoying.
-            # self.fig.canvas.flush_events()
+            # arm
+            self.line.set_data(self._joints[:, 0], self._joints[:, 1])
+            self.fig.canvas.draw()
+            self.fig.canvas.flush_events()
         elif mode == "partial":
-            if self._steps == 1:
-                # fig, ax = plt.subplots()
-                # Add the patch to the Axes
-                [plt.gca().add_patch(rect) for rect in self.patches]
-                # plt.pause(0.01)
             if self._steps % 20 == 0 or self._steps in [1, 199] or self._is_collided:
                 # Arm
-                plt.plot(self._joints[:, 0], self._joints[:, 1], 'ro-', markerfacecolor='k', alpha=self._steps / 200)
-                # ax.plot(line_points_in_taskspace[:, 0, 0],
-                #         line_points_in_taskspace[:, 0, 1],
-                #         line_points_in_taskspace[:, -1, 0],
-                #         line_points_in_taskspace[:, -1, 1], marker='o', color='k', alpha=t / 200)
-                lim = np.sum(self.link_lengths) + 0.5
-                plt.xlim([-lim, lim])
-                plt.ylim([-1.1, lim])
-                plt.pause(0.01)
-        elif mode == "final":
-            if self._steps == 199 or self._is_collided:
-                # fig, ax = plt.subplots()
-                # Add the patch to the Axes
-                [plt.gca().add_patch(rect) for rect in self.patches]
-                plt.xlim(-self.n_links, self.n_links), plt.ylim(-1, self.n_links)
-                # Arm
-                plt.plot(self._joints[:, 0], self._joints[:, 1], 'ro-', markerfacecolor='k')
-                plt.pause(0.01)
+                plt.plot(self._joints[:, 0], self._joints[:, 1], 'ro-', markerfacecolor='k',
+                         alpha=self._steps / 200)
+    def _set_patches(self):
+        if self.fig is not None:
+            self.fig.gca().patches = []
+            left_block = patches.Rectangle((-self.n_links, -self._tmp_hole_depth),
+                                           self.n_links + self._tmp_hole_x - self._tmp_hole_width / 2,
+                                           self._tmp_hole_depth,
+                                           fill=True, edgecolor='k', facecolor='k')
+            right_block = patches.Rectangle((self._tmp_hole_x + self._tmp_hole_width / 2, -self._tmp_hole_depth),
+                                            self.n_links - self._tmp_hole_x + self._tmp_hole_width / 2,
+                                            self._tmp_hole_depth,
+                                            fill=True, edgecolor='k', facecolor='k')
+            hole_floor = patches.Rectangle((self._tmp_hole_x - self._tmp_hole_width / 2, -self._tmp_hole_depth),
+                                           self._tmp_hole_width,
+                                           1 - self._tmp_hole_depth,
+                                           fill=True, edgecolor='k', facecolor='k')
+            # Add the patch to the Axes
+            self.fig.gca().add_patch(left_block)
+            self.fig.gca().add_patch(right_block)
+            self.fig.gca().add_patch(hole_floor)
+    @property
+    def active_obs(self):
+        return np.hstack([
+            [self.random_start] * self.n_links,  # cos
+            [self.random_start] * self.n_links,  # sin
+            [self.random_start] * self.n_links,  # velocity
+            [self._hole_width is None],  # hole width
+            [self._hole_depth is None],  # hole depth
+            [True] * 2,  # x-y coordinates of target distance
+            [False]  # env steps
+        ])
+    @property
+    def start_pos(self) -> Union[float, int, np.ndarray]:
+        return self._start_pos
+    @property
+    def goal_pos(self) -> Union[float, int, np.ndarray]:
+        raise ValueError("Goal position is not available and has to be learnt based on the environment.")
+    def seed(self, seed=None):
+        self.np_random, seed = seeding.np_random(seed)
+        return [seed]
+    @property
+    def end_effector(self):
+        return self._joints[self.n_links].T
     def close(self):
         if self.fig is not None:
@@ -274,22 +310,20 @@ class HoleReacher(gym.Env):
 if __name__ == '__main__':
     nl = 5
     render_mode = "human"  # "human" or "partial" or "final"
-    env = HoleReacher(n_links=nl, allow_self_collision=False, allow_wall_collision=False, hole_width=0.15,
-                      hole_depth=1, hole_x=1)
-    env.reset()
-    # env.render(mode=render_mode)
+    env = HoleReacherEnv(n_links=nl, allow_self_collision=False, allow_wall_collision=False, hole_width=None,
+                         hole_depth=1, hole_x=None)
+    obs = env.reset()
     for i in range(200):
         # objective.load_result("/tmp/cma")
         # test with random actions
        ac = 2 * env.action_space.sample()
-        # ac[0] += np.pi/2
         obs, rew, d, info = env.step(ac)
         env.render(mode=render_mode)
         print(rew)
         if d:
-            break
+            env.reset()
     env.close()
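The new active_obs property exposes a boolean mask over the observation vector: entries that still vary between episodes (randomized start posture, sampled hole parameters, the distance to the goal) are True, while constant entries such as the step counter are False. A short sketch of how such a mask can be applied to strip an observation down to its task-relevant part, analogous to the obs[env.active_obs] usage in the SimpleReacher demo below (the wrapper that actually consumes this mask is not shown in this diff):

import gym
import numpy as np
import alr_envs  # noqa: F401  # registers HoleReacher-v1

env = gym.make("HoleReacher-v1")
obs = env.reset()
mask = np.asarray(env.active_obs, dtype=bool)  # gym's Wrapper forwards the attribute to the wrapped env
print(obs.shape, "->", obs[mask].shape)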

View File

@@ -1,39 +1,41 @@
-import gym
+from typing import Iterable, Union
 import matplotlib.pyplot as plt
 import numpy as np
 from gym import spaces
 from gym.utils import seeding
-from alr_envs.utils.utils import angle_normalize
+from alr_envs.utils.mps.mp_environments import MPEnv
-# if os.environ.get("DISPLAY", None):
-#     mpl.use('Qt5Agg')
-class SimpleReacherEnv(gym.Env):
+class SimpleReacherEnv(MPEnv):
     """
     Simple Reaching Task without any physics simulation.
     Returns no reward until 150 time steps. This allows the agent to explore the space, but requires precise actions
     towards the end of the trajectory.
     """
-    def __init__(self, n_links):
+    def __init__(self, n_links: int, target: Union[None, Iterable] = None, random_start: bool = True):
         super().__init__()
         self.link_lengths = np.ones(n_links)
         self.n_links = n_links
         self.dt = 0.1
-        self._goal_pos = None
+        self.random_start = random_start
         self._joints = None
-        self._joint_angle = None
+        self._joint_angles = None
         self._angle_velocity = None
+        self._start_pos = np.zeros(self.n_links)
+        self._start_vel = np.zeros(self.n_links)
-        self.max_torque = 1  # 10
+        self._target = target  # provided target value
+        self._goal = None  # updated goal value, does not change when target != None
+        self.max_torque = 1
         self.steps_before_reward = 199
-        action_bound = np.ones((self.n_links,))
+        action_bound = np.ones((self.n_links,)) * self.max_torque
         state_bound = np.hstack([
             [np.pi] * self.n_links,  # cos
             [np.pi] * self.n_links,  # sin
@@ -44,49 +46,50 @@ class SimpleReacherEnv(gym.Env):
         self.action_space = spaces.Box(low=-action_bound, high=action_bound, shape=action_bound.shape)
         self.observation_space = spaces.Box(low=-state_bound, high=state_bound, shape=state_bound.shape)
-        self.fig = None
+        # containers for plotting
         self.metadata = {'render.modes': ["human"]}
+        self.fig = None
         self._steps = 0
         self.seed()
     def step(self, action: np.ndarray):
+        """
+        A single step with action in torque space
+        """
         # action = self._add_action_noise(action)
-        action = np.clip(action, -self.max_torque, self.max_torque)
-        self._angle_velocity = self._angle_velocity + self.dt * action
-        self._joint_angle = angle_normalize(self._joint_angle + self.dt * self._angle_velocity)
+        ac = np.clip(action, -self.max_torque, self.max_torque)
+        self._angle_velocity = self._angle_velocity + self.dt * ac
+        self._joint_angles = self._joint_angles + self.dt * self._angle_velocity
         self._update_joints()
-        self._steps += 1
         reward, info = self._get_reward(action)
-        # done = np.abs(self.end_effector - self._goal_pos) < 0.1
+        self._steps += 1
         done = False
         return self._get_obs().copy(), reward, done, info
-    def _add_action_noise(self, action: np.ndarray):
-        """
-        add unobserved Gaussian Noise N(0,0.01) to the actions
-        Args:
-            action:
-        Returns: actions with noise
-        """
-        return self.np_random.normal(0, 0.1, *action.shape) + action
-    def _get_obs(self):
-        theta = self._joint_angle
-        return np.hstack([
-            np.cos(theta),
-            np.sin(theta),
-            self._angle_velocity,
-            self.end_effector - self._goal_pos,
-            self._steps
-        ])
+    def reset(self):
+        # TODO: maybe do initialisation more random?
+        # Sample only orientation of first link, i.e. the arm is always straight.
+        if self.random_start:
+            self._joint_angles = np.hstack([[self.np_random.uniform(-np.pi, np.pi)], np.zeros(self.n_links - 1)])
+            self._start_pos = self._joint_angles.copy()
+        else:
+            self._joint_angles = self._start_pos
+        self._generate_goal()
+        self._angle_velocity = self._start_vel
+        self._joints = np.zeros((self.n_links + 1, 2))
+        self._update_joints()
+        self._steps = 0
+        return self._get_obs().copy()
     def _update_joints(self):
         """
@@ -94,15 +97,14 @@ class SimpleReacherEnv(gym.Env):
         Returns:
         """
-        angles = np.cumsum(self._joint_angle)
+        angles = np.cumsum(self._joint_angles)
         x = self.link_lengths * np.vstack([np.cos(angles), np.sin(angles)])
         self._joints[1:] = self._joints[0] + np.cumsum(x.T, axis=0)
     def _get_reward(self, action: np.ndarray):
-        diff = self.end_effector - self._goal_pos
+        diff = self.end_effector - self._goal
         reward_dist = 0
-        # TODO: Is this the best option
         if self._steps >= self.steps_before_reward:
             reward_dist -= np.linalg.norm(diff)
             # reward_dist = np.exp(-0.1 * diff ** 2).mean()
@@ -112,62 +114,112 @@ class SimpleReacherEnv(gym.Env):
         reward = reward_dist - reward_ctrl
         return reward, dict(reward_dist=reward_dist, reward_ctrl=reward_ctrl)
-    def reset(self):
-        # TODO: maybe do initialisation more random?
-        # Sample only orientation of first link, i.e. the arm is always straight.
-        self._joint_angle = np.hstack([[self.np_random.uniform(-np.pi, np.pi)], np.zeros(self.n_links - 1)])
-        self._angle_velocity = np.zeros(self.n_links)
-        self._joints = np.zeros((self.n_links + 1, 2))
-        self._update_joints()
-        self._steps = 0
-        self._goal_pos = self._get_random_goal()
-        return self._get_obs().copy()
-    def _get_random_goal(self):
-        center = self._joints[0]
-        # Sample uniformly in circle with radius R around center of reacher.
-        R = np.sum(self.link_lengths)
-        r = R * np.sqrt(self.np_random.uniform())
-        theta = self.np_random.uniform() * 2 * np.pi
-        return center + r * np.stack([np.cos(theta), np.sin(theta)])
+    def _get_obs(self):
+        theta = self._joint_angles
+        return np.hstack([
+            np.cos(theta),
+            np.sin(theta),
+            self._angle_velocity,
+            self.end_effector - self._goal,
+            self._steps
+        ])
+    def _generate_goal(self):
+        if self._target is None:
+            total_length = np.sum(self.link_lengths)
+            goal = np.array([total_length, total_length])
+            while np.linalg.norm(goal) >= total_length:
+                goal = self.np_random.uniform(low=-total_length, high=total_length, size=2)
+        else:
+            goal = np.copy(self._target)
+        self._goal = goal
+    def render(self, mode='human'):  # pragma: no cover
+        if self.fig is None:
+            # Create base figure once on the beginning. Afterwards only update
+            plt.ion()
+            self.fig = plt.figure()
+            ax = self.fig.add_subplot(1, 1, 1)
+            # limits
+            lim = np.sum(self.link_lengths) + 0.5
+            ax.set_xlim([-lim, lim])
+            ax.set_ylim([-lim, lim])
+            self.line, = ax.plot(self._joints[:, 0], self._joints[:, 1], 'ro-', markerfacecolor='k')
+            goal_pos = self._goal.T
+            self.goal_point, = ax.plot(goal_pos[0], goal_pos[1], 'gx')
+            self.goal_dist, = ax.plot([self.end_effector[0], goal_pos[0]], [self.end_effector[1], goal_pos[1]], 'g--')
+            self.fig.show()
+        self.fig.gca().set_title(f"Iteration: {self._steps}, distance: {self.end_effector - self._goal}")
+        # goal
+        goal_pos = self._goal.T
+        if self._steps == 1:
+            self.goal_point.set_data(goal_pos[0], goal_pos[1])
+        # arm
+        self.line.set_data(self._joints[:, 0], self._joints[:, 1])
+        # distance between end effector and goal
+        self.goal_dist.set_data([self.end_effector[0], goal_pos[0]], [self.end_effector[1], goal_pos[1]])
+        self.fig.canvas.draw()
+        self.fig.canvas.flush_events()
+    @property
+    def active_obs(self):
+        return np.hstack([
+            [self.random_start] * self.n_links,  # cos
+            [self.random_start] * self.n_links,  # sin
+            [self.random_start] * self.n_links,  # velocity
+            [True] * 2,  # x-y coordinates of target distance
+            [False]  # env steps
+        ])
+    @property
+    def start_pos(self):
+        return self._start_pos
+    @property
+    def goal_pos(self):
+        raise ValueError("Goal position is not available and has to be learnt based on the environment.")
     def seed(self, seed=None):
         self.np_random, seed = seeding.np_random(seed)
         return [seed]
-    def render(self, mode='human'):  # pragma: no cover
-        if self.fig is None:
-            self.fig = plt.figure()
-            plt.ion()
-            plt.show()
-        else:
-            plt.figure(self.fig.number)
-        plt.cla()
-        plt.title(f"Iteration: {self._steps}, distance: {self.end_effector - self._goal_pos}")
-        # Arm
-        plt.plot(self._joints[:, 0], self._joints[:, 1], 'ro-', markerfacecolor='k')
-        # goal
-        goal_pos = self._goal_pos.T
-        plt.plot(goal_pos[0], goal_pos[1], 'gx')
-        # distance between end effector and goal
-        plt.plot([self.end_effector[0], goal_pos[0]], [self.end_effector[1], goal_pos[1]], 'g--')
-        lim = np.sum(self.link_lengths) + 0.5
-        plt.xlim([-lim, lim])
-        plt.ylim([-lim, lim])
-        # plt.draw()
-        # plt.pause(1e-4) pushes window to foreground, which is annoying.
-        self.fig.canvas.flush_events()
     def close(self):
         del self.fig
     @property
     def end_effector(self):
         return self._joints[self.n_links].T
+if __name__ == '__main__':
+    nl = 5
+    render_mode = "human"  # "human" or "partial" or "final"
+    env = SimpleReacherEnv(n_links=nl)
+    obs = env.reset()
+    print("First", obs)
+    for i in range(2000):
+        # objective.load_result("/tmp/cma")
+        # test with random actions
+        ac = 2 * env.action_space.sample()
+        # ac = np.ones(env.action_space.shape)
+        obs, rew, d, info = env.step(ac)
+        env.render(mode=render_mode)
+        print(obs[env.active_obs].shape)
+        if d or i % 200 == 0:
+            env.reset()
+    env.close()
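Since SimpleReacherEnv now takes an optional fixed target and a random_start flag, a contextual and a deterministic variant differ only in constructor arguments. A brief sketch (the concrete target value is just an illustration, not from this PR):

from alr_envs.classic_control.simple_reacher import SimpleReacherEnv

# randomized: start orientation and goal are resampled on every reset()
env_random = SimpleReacherEnv(n_links=2)

# deterministic: straight start posture (random_start=False) and a user-provided goal
env_fixed = SimpleReacherEnv(n_links=2, random_start=False, target=(1.0, 1.0))
obs = env_fixed.reset()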

View File

@@ -1,19 +1,31 @@
+from typing import Iterable, Union
 import gym
 import matplotlib.pyplot as plt
 import numpy as np
+from gym.utils import seeding
 from alr_envs.classic_control.utils import check_self_collision
+from alr_envs.utils.mps.mp_environments import MPEnv
-class ViaPointReacher(gym.Env):
+class ViaPointReacher(MPEnv):
-    def __init__(self, n_links, allow_self_collision=False, collision_penalty=1000):
-        self.num_links = n_links
+    def __init__(self, n_links, random_start: bool = True, via_target: Union[None, Iterable] = None,
+                 target: Union[None, Iterable] = None, allow_self_collision=False, collision_penalty=1000):
+        self.n_links = n_links
         self.link_lengths = np.ones((n_links, 1))
-        # task
-        self.via_point = np.ones(2)
-        self.goal_point = np.array((n_links, 0))
+        self.random_start = random_start
+        # provided initial parameters
+        self._target = target  # provided target value
+        self._via_target = via_target  # provided via point target value
+        # temp container for current env state
+        self._via_point = np.ones(2)
+        self._goal = np.array((n_links, 0))
         # collision
         self.allow_self_collision = allow_self_collision
@@ -23,78 +35,93 @@ class ViaPointReacher(gym.Env):
         self._joints = None
         self._joint_angles = None
         self._angle_velocity = None
-        self.start_pos = np.hstack([[np.pi / 2], np.zeros(self.num_links - 1)])
-        self.start_vel = np.zeros(self.num_links)
+        self._start_pos = np.hstack([[np.pi / 2], np.zeros(self.n_links - 1)])
+        self._start_vel = np.zeros(self.n_links)
         self.weight_matrix_scale = 1
-        self._steps = 0
         self.dt = 0.01
-        # self.time_limit = 2
-        action_bound = np.pi * np.ones((self.num_links,))
+        action_bound = np.pi * np.ones((self.n_links,))
         state_bound = np.hstack([
-            [np.pi] * self.num_links,  # cos
-            [np.pi] * self.num_links,  # sin
-            [np.inf] * self.num_links,  # velocity
+            [np.pi] * self.n_links,  # cos
+            [np.pi] * self.n_links,  # sin
+            [np.inf] * self.n_links,  # velocity
+            [np.inf] * 2,  # x-y coordinates of via point distance
             [np.inf] * 2,  # x-y coordinates of target distance
-            [np.inf]  # env steps, because reward start after n steps TODO: Maybe
+            [np.inf]  # env steps, because reward start after n steps
         ])
         self.action_space = gym.spaces.Box(low=-action_bound, high=action_bound, shape=action_bound.shape)
         self.observation_space = gym.spaces.Box(low=-state_bound, high=state_bound, shape=state_bound.shape)
+        # containers for plotting
+        self.metadata = {'render.modes': ["human", "partial"]}
         self.fig = None
-    @property
-    def end_effector(self):
-        return self._joints[self.num_links].T
-    def configure(self, context):
-        pass
-    def reset(self):
-        self._joint_angles = self.start_pos
-        self._angle_velocity = self.start_vel
-        self._joints = np.zeros((self.num_links + 1, 2))
-        self._update_joints()
         self._steps = 0
+        self.seed()
-        return self._get_obs().copy()
     def step(self, action: np.ndarray):
         """
         a single step with an action in joint velocity space
         """
         vel = action
-        acc = (vel - self._angle_velocity) / self.dt
         self._angle_velocity = vel
         self._joint_angles = self._joint_angles + self.dt * self._angle_velocity
         self._update_joints()
-        dist_reward = 0
-        if not self._is_collided:
-            if self._steps == 100:
-                dist_reward = np.linalg.norm(self.end_effector - self.via_point)
-            elif self._steps == 199:
-                dist_reward = np.linalg.norm(self.end_effector - self.goal_point)
-        # TODO: Do we need that?
-        reward = - dist_reward ** 2
-        reward -= 5e-8 * np.sum(acc**2)
-        if self._is_collided:
-            reward -= self.collision_penalty
-        info = {"is_collided": self._is_collided}
+        acc = (vel - self._angle_velocity) / self.dt
+        reward, info = self._get_reward(acc)
+        info.update({"is_collided": self._is_collided})
         self._steps += 1
-        # done = self._steps * self.dt > self.time_limit or self._is_collided
         done = self._is_collided
         return self._get_obs().copy(), reward, done, info
+    def reset(self):
+        if self.random_start:
+            # Maybe change more than first seed
+            first_joint = self.np_random.uniform(np.pi / 4, 3 * np.pi / 4)
+            self._joint_angles = np.hstack([[first_joint], np.zeros(self.n_links - 1)])
+            self._start_pos = self._joint_angles.copy()
+        else:
+            self._joint_angles = self._start_pos
+        self._generate_goal()
+        self._angle_velocity = self._start_vel
+        self._joints = np.zeros((self.n_links + 1, 2))
+        self._update_joints()
+        self._steps = 0
+        return self._get_obs().copy()
+    def _generate_goal(self):
+        # TODO: Maybe improve this later, this can yield quite a lot of invalid settings
+        total_length = np.sum(self.link_lengths)
+        # rejection sampled point in inner circle with 0.5*Radius
+        if self._via_target is None:
+            via_target = np.array([total_length, total_length])
+            while np.linalg.norm(via_target) >= 0.5 * total_length:
+                via_target = self.np_random.uniform(low=-0.5 * total_length, high=0.5 * total_length, size=2)
+        else:
+            via_target = np.copy(self._via_target)
+        # rejection sampled point in outer circle
+        if self._target is None:
+            goal = np.array([total_length, total_length])
+            while np.linalg.norm(goal) >= total_length or np.linalg.norm(goal) <= 0.5 * total_length:
+                goal = self.np_random.uniform(low=-total_length, high=total_length, size=2)
+        else:
+            goal = np.copy(self._target)
+        self._via_target = via_target
+        self._goal = goal
     def _update_joints(self):
         """
         update _joints to get new end effector position. The other links are only required for rendering.
@@ -115,14 +142,38 @@ class ViaPointReacher(gym.Env):
         self._is_collided = self_collision
+    def _get_reward(self, acc):
+        success = False
+        reward = -np.inf
+        if not self._is_collided:
+            dist = np.inf
+            # return intermediate reward for via point
+            if self._steps == 100:
+                dist = np.linalg.norm(self.end_effector - self._via_point)
+            # return reward in last time step for goal
+            elif self._steps == 199:
+                dist = np.linalg.norm(self.end_effector - self._goal)
+                success = dist < 0.005
+        else:
+            # Episode terminates when colliding, hence return reward
+            dist = np.linalg.norm(self.end_effector - self._goal)
+            reward = -self.collision_penalty
+        reward -= dist ** 2
+        reward -= 5e-8 * np.sum(acc ** 2)
+        info = {"is_success": success}
+        return reward, info
     def _get_obs(self):
         theta = self._joint_angles
         return np.hstack([
             np.cos(theta),
             np.sin(theta),
             self._angle_velocity,
-            self.end_effector - self.via_point,
-            self.end_effector - self.goal_point,
+            self.end_effector - self._via_point,
+            self.end_effector - self._goal,
             self._steps
         ])
@@ -133,7 +184,7 @@ class ViaPointReacher(gym.Env):
         accumulated_theta = np.cumsum(theta, axis=0)
-        endeffector = np.zeros(shape=(self.num_links, num_points_per_link, 2))
+        endeffector = np.zeros(shape=(self.n_links, num_points_per_link, 2))
         x = np.cos(accumulated_theta) * self.link_lengths * intermediate_points
         y = np.sin(accumulated_theta) * self.link_lengths * intermediate_points
@@ -141,33 +192,46 @@ class ViaPointReacher(gym.Env):
         endeffector[0, :, 0] = x[0, :]
         endeffector[0, :, 1] = y[0, :]
-        for i in range(1, self.num_links):
+        for i in range(1, self.n_links):
             endeffector[i, :, 0] = x[i, :] + endeffector[i - 1, -1, 0]
             endeffector[i, :, 1] = y[i, :] + endeffector[i - 1, -1, 1]
         return np.squeeze(endeffector + self._joints[0, :])
     def render(self, mode='human'):
+        goal_pos = self._goal.T
+        via_pos = self._via_point.T
         if self.fig is None:
+            # Create base figure once on the beginning. Afterwards only update
+            plt.ion()
             self.fig = plt.figure()
-            # plt.ion()
-            # plt.pause(0.01)
-        else:
-            plt.figure(self.fig.number)
+            ax = self.fig.add_subplot(1, 1, 1)
+            # limits
+            lim = np.sum(self.link_lengths) + 0.5
+            ax.set_xlim([-lim, lim])
+            ax.set_ylim([-lim, lim])
+            self.line, = ax.plot(self._joints[:, 0], self._joints[:, 1], 'ro-', markerfacecolor='k')
+            self.goal_point_plot, = ax.plot(goal_pos[0], goal_pos[1], 'go')
+            self.via_point_plot, = ax.plot(via_pos[0], via_pos[1], 'gx')
+            self.fig.show()
+        self.fig.gca().set_title(f"Iteration: {self._steps}, distance: {self.end_effector - self._goal}")
         if mode == "human":
-            plt.cla()
-            plt.title(f"Iteration: {self._steps}")
+            # goal
+            if self._steps == 1:
+                self.goal_point_plot.set_data(goal_pos[0], goal_pos[1])
+                self.via_point_plot.set_data(via_pos[0], goal_pos[1])
-            # Arm
-            plt.plot(self._joints[:, 0], self._joints[:, 1], 'ro-', markerfacecolor='k')
-            lim = np.sum(self.link_lengths) + 0.5
-            plt.xlim([-lim, lim])
-            plt.ylim([-lim, lim])
-            # plt.draw()
-            plt.pause(1e-4)  # pushes window to foreground, which is annoying.
-            # self.fig.canvas.flush_events()
+            # arm
+            self.line.set_data(self._joints[:, 0], self._joints[:, 1])
+            self.fig.canvas.draw()
+            self.fig.canvas.flush_events()
         elif mode == "partial":
             if self._steps == 1:
@@ -196,12 +260,39 @@ class ViaPointReacher(gym.Env):
                 # Add the patch to the Axes
                 [plt.gca().add_patch(rect) for rect in self.patches]
-                plt.xlim(-self.num_links, self.num_links), plt.ylim(-1, self.num_links)
+                plt.xlim(-self.n_links, self.n_links), plt.ylim(-1, self.n_links)
                 # Arm
                 plt.plot(self._joints[:, 0], self._joints[:, 1], 'ro-', markerfacecolor='k')
                 plt.pause(0.01)
+    @property
+    def active_obs(self):
+        return np.hstack([
+            [self.random_start] * self.n_links,  # cos
+            [self.random_start] * self.n_links,  # sin
+            [self.random_start] * self.n_links,  # velocity
+            [self._via_target is None] * 2,  # x-y coordinates of via point distance
+            [True] * 2,  # x-y coordinates of target distance
+            [False]  # env steps
+        ])
+    @property
+    def start_pos(self) -> Union[float, int, np.ndarray]:
+        return self._start_pos
+    @property
+    def goal_pos(self) -> Union[float, int, np.ndarray]:
+        raise ValueError("Goal position is not available and has to be learnt based on the environment.")
+    def seed(self, seed=None):
+        self.np_random, seed = seeding.np_random(seed)
+        return [seed]
+    @property
+    def end_effector(self):
+        return self._joints[self.n_links].T
     def close(self):
         if self.fig is not None:
             plt.close(self.fig)
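_generate_goal above rejection-samples the via point inside the disk of radius 0.5 * total_length and the goal inside the annulus between 0.5 * total_length and total_length. The same sampling logic, pulled out of the class as a standalone check (link lengths chosen for illustration):

import numpy as np

rng = np.random.RandomState(0)
total_length = 5.0  # e.g. five unit-length links

via_target = np.array([total_length, total_length])
while np.linalg.norm(via_target) >= 0.5 * total_length:
    via_target = rng.uniform(low=-0.5 * total_length, high=0.5 * total_length, size=2)

goal = np.array([total_length, total_length])
while np.linalg.norm(goal) >= total_length or np.linalg.norm(goal) <= 0.5 * total_length:
    goal = rng.uniform(low=-total_length, high=total_length, size=2)

assert np.linalg.norm(via_target) < 0.5 * total_length < np.linalg.norm(goal) < total_length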

View File

@@ -1,5 +1,5 @@
-from alr_envs.utils.wrapper.detpmp_wrapper import DetPMPWrapper
-from alr_envs.utils.wrapper.dmp_wrapper import DmpWrapper
+from alr_envs.utils.mps.detpmp_wrapper import DetPMPWrapper
+from alr_envs.utils.mps.dmp_wrapper import DmpWrapper
 from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup import ALRBallInACupEnv
@@ -17,19 +17,8 @@ def make_contextual_env(rank, seed=0):
     def _init():
         env = ALRBallInACupEnv(reward_type="contextual_goal")
-        env = DetPMPWrapper(env,
-                            num_dof=7,
-                            num_basis=5,
-                            width=0.005,
-                            policy_type="motor",
-                            start_pos=env.start_pos,
-                            duration=3.5,
-                            post_traj_time=4.5,
-                            dt=env.dt,
-                            weights_scale=0.5,
-                            zero_start=True,
-                            zero_goal=True
-                            )
+        env = DetPMPWrapper(env, num_dof=7, num_basis=5, width=0.005, duration=3.5, dt=env.dt, post_traj_time=4.5,
+                            policy_type="motor", weights_scale=0.5, zero_start=True, zero_goal=True)
         env.seed(seed + rank)
         return env
@@ -51,19 +40,8 @@ def make_env(rank, seed=0):
     def _init():
         env = ALRBallInACupEnv(reward_type="simple")
-        env = DetPMPWrapper(env,
-                            num_dof=7,
-                            num_basis=5,
-                            width=0.005,
-                            policy_type="motor",
-                            start_pos=env.start_pos,
-                            duration=3.5,
-                            post_traj_time=4.5,
-                            dt=env.dt,
-                            weights_scale=0.2,
-                            zero_start=True,
-                            zero_goal=True
-                            )
+        env = DetPMPWrapper(env, num_dof=7, num_basis=5, width=0.005, duration=3.5, dt=env.dt, post_traj_time=4.5,
+                            policy_type="motor", weights_scale=0.2, zero_start=True, zero_goal=True)
         env.seed(seed + rank)
         return env
@@ -85,20 +63,8 @@ def make_simple_env(rank, seed=0):
     def _init():
         env = ALRBallInACupEnv(reward_type="simple")
-        env = DetPMPWrapper(env,
-                            num_dof=3,
-                            num_basis=5,
-                            width=0.005,
-                            off=-0.1,
-                            policy_type="motor",
-                            start_pos=env.start_pos[1::2],
-                            duration=3.5,
-                            post_traj_time=4.5,
-                            dt=env.dt,
-                            weights_scale=0.25,
-                            zero_start=True,
-                            zero_goal=True
-                            )
+        env = DetPMPWrapper(env, num_dof=3, num_basis=5, width=0.005, duration=3.5, dt=env.dt, post_traj_time=4.5,
+                            policy_type="motor", weights_scale=0.25, zero_start=True, zero_goal=True, off=-0.1)
         env.seed(seed + rank)
         return env

View File

@@ -1,4 +1,4 @@
-from alr_envs.utils.wrapper.detpmp_wrapper import DetPMPWrapper
+from alr_envs.utils.mps.detpmp_wrapper import DetPMPWrapper
 from alr_envs.mujoco.beerpong.beerpong import ALRBeerpongEnv
 from alr_envs.mujoco.beerpong.beerpong_simple import ALRBeerpongEnv as ALRBeerpongEnvSimple
@@ -17,19 +17,8 @@ def make_contextual_env(rank, seed=0):
     def _init():
         env = ALRBeerpongEnv()
-        env = DetPMPWrapper(env,
-                            num_dof=7,
-                            num_basis=5,
-                            width=0.005,
-                            policy_type="motor",
-                            start_pos=env.start_pos,
-                            duration=3.5,
-                            post_traj_time=4.5,
-                            dt=env.dt,
-                            weights_scale=0.5,
-                            zero_start=True,
-                            zero_goal=True
-                            )
+        env = DetPMPWrapper(env, num_dof=7, num_basis=5, width=0.005, duration=3.5, dt=env.dt, post_traj_time=4.5,
+                            policy_type="motor", weights_scale=0.5, zero_start=True, zero_goal=True)
         env.seed(seed + rank)
         return env
@@ -51,19 +40,8 @@ def make_env(rank, seed=0):
     def _init():
         env = ALRBeerpongEnvSimple()
-        env = DetPMPWrapper(env,
-                            num_dof=7,
-                            num_basis=5,
-                            width=0.005,
-                            policy_type="motor",
-                            start_pos=env.start_pos,
-                            duration=3.5,
-                            post_traj_time=4.5,
-                            dt=env.dt,
-                            weights_scale=0.25,
-                            zero_start=True,
-                            zero_goal=True
-                            )
+        env = DetPMPWrapper(env, num_dof=7, num_basis=5, width=0.005, duration=3.5, dt=env.dt, post_traj_time=4.5,
+                            policy_type="motor", weights_scale=0.25, zero_start=True, zero_goal=True)
         env.seed(seed + rank)
         return env
@@ -85,19 +63,8 @@ def make_simple_env(rank, seed=0):
     def _init():
         env = ALRBeerpongEnvSimple()
-        env = DetPMPWrapper(env,
-                            num_dof=3,
-                            num_basis=5,
-                            width=0.005,
-                            policy_type="motor",
-                            start_pos=env.start_pos[1::2],
-                            duration=3.5,
-                            post_traj_time=4.5,
-                            dt=env.dt,
-                            weights_scale=0.5,
-                            zero_start=True,
-                            zero_goal=True
-                            )
+        env = DetPMPWrapper(env, num_dof=3, num_basis=5, width=0.005, duration=3.5, dt=env.dt, post_traj_time=4.5,
+                            policy_type="motor", weights_scale=0.5, zero_start=True, zero_goal=True)
         env.seed(seed + rank)
         return env
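All make_*_env(rank, seed) helpers in this file and in the ball-in-a-cup utilities follow the same factory pattern: they return a zero-argument _init callable, and each created environment is seeded with seed + rank so parallel workers draw from distinct random streams. A self-contained sketch of that pattern with a stand-in environment (the names below are illustrative, not part of this PR):

import gym

def make_env(rank, seed=0):
    def _init():
        env = gym.make("Pendulum-v0")  # stand-in for the DetPMPWrapper-wrapped ALR environments above
        env.seed(seed + rank)
        return env
    return _init

env_fns = [make_env(rank=i) for i in range(4)]
envs = [fn() for fn in env_fns]  # one independently seeded environment per worker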

View File

@@ -2,12 +2,12 @@ import os
 import numpy as np
 from gym import utils
-from gym.envs.mujoco import mujoco_env
+from gym.envs.mujoco import MujocoEnv
 import alr_envs.utils.utils as alr_utils
-class ALRReacherEnv(mujoco_env.MujocoEnv, utils.EzPickle):
+class ALRReacherEnv(MujocoEnv, utils.EzPickle):
     def __init__(self, steps_before_reward=200, n_links=5, balance=False):
         utils.EzPickle.__init__(**locals())
@@ -31,7 +31,7 @@ class ALRReacherEnv(mujoco_env.MujocoEnv, utils.EzPickle):
         else:
             raise ValueError(f"Invalid number of links {n_links}, only 5 or 7 allowed.")
-        mujoco_env.MujocoEnv.__init__(self, os.path.join(os.path.dirname(__file__), "assets", file_name), 2)
+        MujocoEnv.__init__(self, os.path.join(os.path.dirname(__file__), "assets", file_name), 2)
     def step(self, a):
         self._steps += 1

View File

@@ -1,88 +0,0 @@
from alr_envs.utils.policies import get_policy_class
from mp_lib import det_promp
import numpy as np
import gym
class DetPMPEnvWrapper(gym.Wrapper):
def __init__(self,
env,
num_dof,
num_basis,
width,
off=0.01,
start_pos=None,
duration=1,
dt=0.01,
post_traj_time=0.,
policy_type=None,
weights_scale=1,
zero_start=False,
zero_goal=False,
):
super(DetPMPEnvWrapper, self).__init__(env)
self.num_dof = num_dof
self.num_basis = num_basis
self.dim = num_dof * num_basis
self.pmp = det_promp.DeterministicProMP(n_basis=num_basis, n_dof=num_dof, width=width, off=off,
zero_start=zero_start, zero_goal=zero_goal)
weights = np.zeros(shape=(num_basis, num_dof))
self.pmp.set_weights(duration, weights)
self.weights_scale = weights_scale
self.duration = duration
self.dt = dt
self.post_traj_steps = int(post_traj_time / dt)
self.start_pos = start_pos
self.zero_start = zero_start
policy_class = get_policy_class(policy_type)
self.policy = policy_class(env)
def __call__(self, params, contexts=None):
params = np.atleast_2d(params)
rewards = []
infos = []
for p, c in zip(params, contexts):
reward, info = self.rollout(p, c)
rewards.append(reward)
infos.append(info)
return np.array(rewards), infos
def rollout(self, params, context=None, render=False):
""" This function generates a trajectory based on a DMP and then does the usual loop over reset and step"""
params = np.reshape(params, newshape=(self.num_basis, self.num_dof)) * self.weights_scale
self.pmp.set_weights(self.duration, params)
t, des_pos, des_vel, des_acc = self.pmp.compute_trajectory(1 / self.dt, 1.)
if self.zero_start:
des_pos += self.start_pos[None, :]
if self.post_traj_steps > 0:
des_pos = np.vstack([des_pos, np.tile(des_pos[-1, :], [self.post_traj_steps, 1])])
des_vel = np.vstack([des_vel, np.zeros(shape=(self.post_traj_steps, self.num_dof))])
self._trajectory = des_pos
self._velocity = des_vel
rews = []
infos = []
self.env.configure(context)
self.env.reset()
for t, pos_vel in enumerate(zip(des_pos, des_vel)):
ac = self.policy.get_action(pos_vel[0], pos_vel[1])
obs, rew, done, info = self.env.step(ac)
rews.append(rew)
infos.append(info)
if render:
self.env.render(mode="human")
if done:
break
reward = np.sum(rews)
return reward, info

View File

@@ -1,173 +0,0 @@
import gym
from gym.error import (AlreadyPendingCallError, NoAsyncCallError)
from gym.vector.utils import concatenate, create_empty_array
from gym.vector.async_vector_env import AsyncState
import numpy as np
import multiprocessing as mp
import sys
def _worker(index, env_fn, pipe, parent_pipe, shared_memory, error_queue):
assert shared_memory is None
env = env_fn()
parent_pipe.close()
try:
while True:
command, data = pipe.recv()
if command == 'reset':
observation = env.reset()
pipe.send((observation, True))
elif command == 'step':
observation, reward, done, info = env.step(data)
if done:
observation = env.reset()
pipe.send(((observation, reward, done, info), True))
elif command == 'rollout':
rewards = []
infos = []
for p, c in zip(*data):
reward, info = env.rollout(p, c)
rewards.append(reward)
infos.append(info)
pipe.send(((rewards, infos), (True, ) * len(rewards)))
elif command == 'seed':
env.seed(data)
pipe.send((None, True))
elif command == 'close':
env.close()
pipe.send((None, True))
break
elif command == 'idle':
pipe.send((None, True))
elif command == '_check_observation_space':
pipe.send((data == env.observation_space, True))
else:
raise RuntimeError('Received unknown command `{0}`. Must '
'be one of {`reset`, `step`, `seed`, `close`, '
'`_check_observation_space`}.'.format(command))
except (KeyboardInterrupt, Exception):
error_queue.put((index,) + sys.exc_info()[:2])
pipe.send((None, False))
finally:
env.close()
class DmpAsyncVectorEnv(gym.vector.AsyncVectorEnv):
def __init__(self, env_fns, n_samples, observation_space=None, action_space=None,
shared_memory=False, copy=True, context="spawn", daemon=True, worker=_worker):
super(DmpAsyncVectorEnv, self).__init__(env_fns,
observation_space=observation_space,
action_space=action_space,
shared_memory=shared_memory,
copy=copy,
context=context,
daemon=daemon,
worker=worker)
# we need to overwrite the number of samples as we may sample more than num_envs
self.observations = create_empty_array(self.single_observation_space,
n=n_samples,
fn=np.zeros)
def __call__(self, params, contexts=None):
return self.rollout(params, contexts)
def rollout_async(self, params, contexts):
"""
Parameters
----------
params : iterable of samples from `action_space`
List of actions.
"""
self._assert_is_running()
if self._state != AsyncState.DEFAULT:
raise AlreadyPendingCallError('Calling `rollout_async` while waiting '
'for a pending call to `{0}` to complete.'.format(
self._state.value), self._state.value)
params = np.atleast_2d(params)
split_params = np.array_split(params, np.minimum(len(params), self.num_envs))
if contexts is None:
split_contexts = np.array_split([None, ] * len(params), np.minimum(len(params), self.num_envs))
else:
split_contexts = np.array_split(contexts, np.minimum(len(contexts), self.num_envs))
assert np.all([len(p) == len(c) for p, c in zip(split_params, split_contexts)])
for pipe, param, context in zip(self.parent_pipes, split_params, split_contexts):
pipe.send(('rollout', (param, context)))
for pipe in self.parent_pipes[len(split_params):]:
pipe.send(('idle', None))
self._state = AsyncState.WAITING_ROLLOUT
def rollout_wait(self, timeout=None):
"""
Parameters
----------
timeout : int or float, optional
Number of seconds before the call to `step_wait` times out. If
`None`, the call to `step_wait` never times out.
Returns
-------
observations : sample from `observation_space`
A batch of observations from the vectorized environment.
rewards : `np.ndarray` instance (dtype `np.float_`)
A vector of rewards from the vectorized environment.
dones : `np.ndarray` instance (dtype `np.bool_`)
A vector whose entries indicate whether the episode has ended.
infos : list of dict
A list of auxiliary diagnostic information.
"""
self._assert_is_running()
if self._state != AsyncState.WAITING_ROLLOUT:
raise NoAsyncCallError('Calling `rollout_wait` without any prior call '
'to `rollout_async`.', AsyncState.WAITING_ROLLOUT.value)
if not self._poll(timeout):
self._state = AsyncState.DEFAULT
raise mp.TimeoutError('The call to `rollout_wait` has timed out after '
'{0} second{1}.'.format(timeout, 's' if timeout > 1 else ''))
results, successes = zip(*[pipe.recv() for pipe in self.parent_pipes])
results = [r for r in results if r is not None]
self._raise_if_errors(successes)
self._state = AsyncState.DEFAULT
rewards, infos = [_flatten_list(r) for r in zip(*results)]
# for now, we ignore the observations and only return the rewards
# if not self.shared_memory:
# self.observations = concatenate(observations_list, self.observations,
# self.single_observation_space)
# return (deepcopy(self.observations) if self.copy else self.observations,
# np.array(rewards), np.array(dones, dtype=np.bool_), infos)
return np.array(rewards), infos
def rollout(self, actions, contexts):
self.rollout_async(actions, contexts)
return self.rollout_wait()
def _flatten_obs(obs):
assert isinstance(obs, (list, tuple))
assert len(obs) > 0
if isinstance(obs[0], dict):
keys = obs[0].keys()
return {k: np.stack([o[k] for o in obs]) for k in keys}
else:
return np.stack(obs)
def _flatten_list(l):
assert isinstance(l, (list, tuple))
assert len(l) > 0
assert all([len(l_) > 0 for l_ in l])
return [l__ for l_ in l for l__ in l_]
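# Usage sketch (illustrative only): how DmpAsyncVectorEnv is typically driven. The factory
# make_holereacher_env and the parameter dimension are assumptions taken from the legacy helpers
# further below (num_basis * num_dof weights plus num_dof learned goals = 30).
#
# import numpy as np
# from alr_envs.utils.legacy.utils import make_holereacher_env
#
# n_envs, n_samples, dim = 4, 16, 30
# vec_env = DmpAsyncVectorEnv([make_holereacher_env(rank) for rank in range(n_envs)],
#                             n_samples=n_samples)
# params = np.random.randn(n_samples, dim)
# # __call__ forwards to rollout(): parameters are split across the workers, surplus workers
# # receive an 'idle' command, and only per-sample rewards and info dicts are returned.
# rewards, infos = vec_env(params)
# assert rewards.shape == (n_samples,)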

View File

@ -1,125 +0,0 @@
from alr_envs.utils.policies import get_policy_class
from mp_lib.phase import ExpDecayPhaseGenerator
from mp_lib.basis import DMPBasisGenerator
from mp_lib import dmps
import numpy as np
import gym
class DmpEnvWrapper(gym.Wrapper):
def __init__(self,
env,
num_dof,
num_basis,
start_pos=None,
final_pos=None,
duration=1,
dt=0.01,
alpha_phase=2,
bandwidth_factor=3,
learn_goal=False,
post_traj_time=0.,
policy_type=None,
weights_scale=1.,
goal_scale=1.,
):
super(DmpEnvWrapper, self).__init__(env)
self.num_dof = num_dof
self.num_basis = num_basis
self.dim = num_dof * num_basis
if learn_goal:
self.dim += num_dof
self.learn_goal = learn_goal
self.duration = duration # seconds
time_steps = int(duration / dt)
self.t = np.linspace(0, duration, time_steps)
self.post_traj_steps = int(post_traj_time / dt)
phase_generator = ExpDecayPhaseGenerator(alpha_phase=alpha_phase, duration=duration)
basis_generator = DMPBasisGenerator(phase_generator,
duration=duration,
num_basis=self.num_basis,
basis_bandwidth_factor=bandwidth_factor)
self.dmp = dmps.DMP(num_dof=num_dof,
basis_generator=basis_generator,
phase_generator=phase_generator,
num_time_steps=time_steps,
dt=dt
)
self.dmp.dmp_start_pos = start_pos.reshape((1, num_dof))
dmp_weights = np.zeros((num_basis, num_dof))
if learn_goal:
dmp_goal_pos = np.zeros(num_dof)
else:
dmp_goal_pos = final_pos
self.dmp.set_weights(dmp_weights, dmp_goal_pos)
self.weights_scale = weights_scale
self.goal_scale = goal_scale
policy_class = get_policy_class(policy_type)
self.policy = policy_class(env)
def __call__(self, params, contexts=None):
params = np.atleast_2d(params)
rewards = []
infos = []
for p, c in zip(params, contexts):
reward, info = self.rollout(p, c)
rewards.append(reward)
infos.append(info)
return np.array(rewards), infos
def goal_and_weights(self, params):
if len(params.shape) > 1:
assert params.shape[1] == self.dim
else:
assert len(params) == self.dim
params = np.reshape(params, [1, self.dim])
if self.learn_goal:
goal_pos = params[0, -self.num_dof:]
weight_matrix = np.reshape(params[:, :-self.num_dof], [self.num_basis, self.num_dof])
else:
goal_pos = self.dmp.dmp_goal_pos.flatten()
assert goal_pos is not None
weight_matrix = np.reshape(params, [self.num_basis, self.num_dof])
return goal_pos * self.goal_scale, weight_matrix * self.weights_scale
def rollout(self, params, context=None, render=False):
""" This function generates a trajectory based on a DMP and then does the usual loop over reset and step"""
goal_pos, weight_matrix = self.goal_and_weights(params)
self.dmp.set_weights(weight_matrix, goal_pos)
trajectory, velocity = self.dmp.reference_trajectory(self.t)
if self.post_traj_steps > 0:
trajectory = np.vstack([trajectory, np.tile(trajectory[-1, :], [self.post_traj_steps, 1])])
velocity = np.vstack([velocity, np.zeros(shape=(self.post_traj_steps, self.num_dof))])
self._trajectory = trajectory
self._velocity = velocity
rews = []
infos = []
self.env.configure(context)
self.env.reset()
for t, pos_vel in enumerate(zip(trajectory, velocity)):
ac = self.policy.get_action(pos_vel[0], pos_vel[1])
obs, rew, done, info = self.env.step(ac)
rews.append(rew)
infos.append(info)
if render:
self.env.render(mode="human")
if done:
break
reward = np.sum(rews)
return reward, info
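# Worked example (illustrative only): the flat parameter vector passed to rollout() is laid out as
# num_basis * num_dof basis weights, followed by num_dof goal positions when learn_goal=True:
#
#   dim = num_basis * num_dof + (num_dof if learn_goal else 0)
#
# For the hole reacher configuration used below (num_basis=5, num_dof=5, learn_goal=True) this
# gives 5 * 5 + 5 = 30, which matches the 30-dimensional params array in the test script.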

View File

@ -1,28 +0,0 @@
from alr_envs.utils.legacy.utils import make_holereacher_env
import numpy as np
if __name__ == "__main__":
n_samples = 1
n_cpus = 4
dim = 30
# env = DmpAsyncVectorEnv([make_viapointreacher_env(i) for i in range(n_cpus)],
# n_samples=n_samples)
test_env = make_holereacher_env(0)()
# params = np.random.randn(n_samples, dim)
params = np.array([[1.386102, -3.29980525, 4.70402733, 1.3966668, 0.73774902,
3.14676681, -4.98644416, 6.20303193, 1.30502127, -0.09330522,
7.62656797, -5.76893033, 3.4706711, -0.6944142, -3.33442788,
12.31421548, -0.72760271, -6.9090723, 7.02903814, -8.7236836,
1.4805914, 0.53185824, -5.46626893, 0.69692163, 13.58472666,
0.77199316, 2.02906724, -3.0203244, -1.00533159, -0.57417351]])
# params = np.hstack([50 * np.random.randn(n_samples, 25), np.tile(np.array([np.pi/2, -np.pi/4, -np.pi/4, -np.pi/4, -np.pi/4]), [n_samples, 1])])
rew, info = test_env.rollout(params, render=True)
print(rew)
# out = env(params)
# print(out)

View File

@ -1,28 +0,0 @@
from alr_envs.mujoco.ball_in_a_cup.utils import make_simple_dmp_env
import numpy as np
if __name__ == "__main__":
dim = 15
n_cpus = 4
# n_samples = 10
#
# vec_env = DmpAsyncVectorEnv([make_simple_env(i) for i in range(n_cpus)],
# n_samples=n_samples)
#
# params = np.tile(1 * np.random.randn(n_samples, dim), (10, 1))
#
# rewards, infos = vec_env(params)
# print(rewards)
#
non_vec_env = make_simple_dmp_env(0, 0)()
# params = 0.5 * np.random.randn(dim)
params = np.array([-2.63357598, -1.04950296, -0.44330737, 0.52950017, 4.29247739,
4.52473661, -0.05685977, -0.76796851, 3.71540499, 1.22631059,
2.20412438, 3.91588129, -0.12652723, -3.0788211 , 0.56204464])
out2 = non_vec_env.rollout(params, render=True )
print(out2)

View File

@ -1,156 +0,0 @@
import alr_envs.classic_control.hole_reacher as hr
import alr_envs.classic_control.viapoint_reacher as vpr
from alr_envs.utils.wrapper.dmp_wrapper import DmpWrapper
from alr_envs.utils.wrapper.detpmp_wrapper import DetPMPWrapper
import numpy as np
def make_viapointreacher_env(rank, seed=0):
"""
Utility function for multiprocessed env.
:param env_id: (str) the environment ID
:param num_env: (int) the number of environments you wish to have in subprocesses
:param seed: (int) the initial seed for RNG
:param rank: (int) index of the subprocess
:returns a function that generates an environment
"""
def _init():
_env = vpr.ViaPointReacher(n_links=5,
allow_self_collision=False,
collision_penalty=1000)
_env = DmpWrapper(_env,
num_dof=5,
num_basis=5,
duration=2,
alpha_phase=2.5,
dt=_env.dt,
start_pos=_env.start_pos,
learn_goal=False,
policy_type="velocity",
weights_scale=50)
_env.seed(seed + rank)
return _env
return _init
def make_holereacher_env(rank, seed=0):
"""
Utility function for multiprocessed env.
:param env_id: (str) the environment ID
:param num_env: (int) the number of environments you wish to have in subprocesses
:param seed: (int) the initial seed for RNG
:param rank: (int) index of the subprocess
:returns a function that generates an environment
"""
def _init():
_env = hr.HoleReacher(n_links=5,
allow_self_collision=False,
allow_wall_collision=False,
hole_width=0.25,
hole_depth=1,
hole_x=2,
collision_penalty=100)
_env = DmpWrapper(_env,
num_dof=5,
num_basis=5,
duration=2,
bandwidth_factor=2,
dt=_env.dt,
learn_goal=True,
alpha_phase=2,
start_pos=_env.start_pos,
policy_type="velocity",
weights_scale=50,
goal_scale=0.1
)
_env.seed(seed + rank)
return _env
return _init
def make_holereacher_fix_goal_env(rank, seed=0):
"""
Utility function for multiprocessed env.
:param env_id: (str) the environment ID
:param num_env: (int) the number of environments you wish to have in subprocesses
:param seed: (int) the initial seed for RNG
:param rank: (int) index of the subprocess
:returns a function that generates an environment
"""
def _init():
_env = hr.HoleReacher(n_links=5,
allow_self_collision=False,
allow_wall_collision=False,
hole_width=0.15,
hole_depth=1,
hole_x=1,
collision_penalty=100)
_env = DmpWrapper(_env,
num_dof=5,
num_basis=5,
duration=2,
dt=_env.dt,
learn_goal=False,
final_pos=np.array([2.02669572, -1.25966385, -1.51618198, -0.80946476, 0.02012344]),
alpha_phase=2,
start_pos=_env.start_pos,
policy_type="velocity",
weights_scale=50,
goal_scale=1
)
_env.seed(seed + rank)
return _env
return _init
def make_holereacher_env_pmp(rank, seed=0):
"""
Utility function for multiprocessed env.
:param env_id: (str) the environment ID
:param num_env: (int) the number of environments you wish to have in subprocesses
:param seed: (int) the initial seed for RNG
:param rank: (int) index of the subprocess
:returns a function that generates an environment
"""
def _init():
_env = hr.HoleReacher(n_links=5,
allow_self_collision=False,
allow_wall_collision=False,
hole_width=0.15,
hole_depth=1,
hole_x=1,
collision_penalty=1000)
_env = DetPMPWrapper(_env,
num_dof=5,
num_basis=5,
width=0.02,
policy_type="velocity",
start_pos=_env.start_pos,
duration=2,
post_traj_time=0,
dt=_env.dt,
weights_scale=0.2,
zero_start=True,
zero_goal=False
)
_env.seed(seed + rank)
return _env
return _init
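# Usage sketch (illustrative only): each helper returns a thunk, so a single environment is created
# by calling the returned factory, while a vectorized env gets one factory per rank. The import
# path of DmpAsyncVectorEnv is an assumption based on the legacy layout.
#
# from alr_envs.utils.legacy.dmp_async_vector_env import DmpAsyncVectorEnv
#
# env = make_holereacher_env(rank=0, seed=0)()  # note the trailing () to build the env
# vec_env = DmpAsyncVectorEnv([make_holereacher_env(i) for i in range(4)], n_samples=16)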

View File

@ -1,5 +1,5 @@
-from alr_envs.utils.wrapper.dmp_wrapper import DmpWrapper
-from alr_envs.utils.wrapper.detpmp_wrapper import DetPMPWrapper
+from alr_envs.utils.mps.dmp_wrapper import DmpWrapper
+from alr_envs.utils.mps.detpmp_wrapper import DetPMPWrapper
 import gym
 from gym.vector.utils import write_to_shared_memory
 import sys

View File

@ -4,8 +4,8 @@ import numpy as np
 from _collections import defaultdict
-def make_env(env_id, rank, seed=0):
-    env = gym.make(env_id)
+def make_env(env_id, rank, seed=0, **env_kwargs):
+    env = gym.make(env_id, **env_kwargs)
     env.seed(seed + rank)
     return lambda: env
@ -45,9 +45,9 @@ class AlrMpEnvSampler:
     An asynchronous sampler for non contextual MPWrapper environments. A sampler object can be called with a set of
     parameters and returns the corresponding final obs, rewards, dones and info dicts.
     """
-    def __init__(self, env_id, num_envs, seed=0):
+    def __init__(self, env_id, num_envs, seed=0, **env_kwargs):
         self.num_envs = num_envs
-        self.env = AsyncVectorEnv([make_env(env_id, seed, i) for i in range(num_envs)])
+        self.env = AsyncVectorEnv([make_env(env_id, seed, i, **env_kwargs) for i in range(num_envs)])
     def __call__(self, params):
         params = np.atleast_2d(params)
@ -56,6 +56,7 @@ class AlrMpEnvSampler:
         vals = defaultdict(list)
         for p in split_params:
+            self.env.reset()
             obs, reward, done, info = self.env.step(p)
             vals['obs'].append(obs)
             vals['reward'].append(reward)
@ -67,6 +68,37 @@ class AlrMpEnvSampler:
                _flatten_list(vals['done'])[:n_samples], _flatten_list(vals['info'])[:n_samples]
class AlrContextualMpEnvSampler:
"""
An asynchronous sampler for contextual MPWrapper environments. A sampler object can be called with a set of
parameters and returns the corresponding final obs, rewards, dones and info dicts.
"""
def __init__(self, env_id, num_envs, seed=0, **env_kwargs):
self.num_envs = num_envs
self.env = AsyncVectorEnv([make_env(env_id, seed, i, **env_kwargs) for i in range(num_envs)])
def __call__(self, dist, n_samples):
repeat = int(np.ceil(n_samples / self.env.num_envs))
vals = defaultdict(list)
for i in range(repeat):
new_contexts = self.env.reset()
vals['new_contexts'].append(new_contexts)
new_samples, new_contexts = dist.sample(new_contexts)
vals['new_samples'].append(new_samples)
obs, reward, done, info = self.env.step(new_samples)
vals['obs'].append(obs)
vals['reward'].append(reward)
vals['done'].append(done)
vals['info'].append(info)
# do not return values above threshold
return np.vstack(vals['new_samples'])[:n_samples], np.vstack(vals['new_contexts'])[:n_samples], \
np.vstack(vals['obs'])[:n_samples], np.hstack(vals['reward'])[:n_samples], \
_flatten_list(vals['done'])[:n_samples], _flatten_list(vals['info'])[:n_samples]
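# Usage sketch (illustrative only): the sampler only requires `dist` to expose
# sample(contexts) -> (parameters, contexts). A minimal stand-in distribution could look like the
# following; the environment id and parameter dimension are assumptions, not taken from this repo.
#
# class DummyDist:
#     def __init__(self, dim):
#         self.dim = dim
#
#     def sample(self, contexts):
#         # one parameter vector per context, ignoring the context values themselves
#         return np.random.randn(len(contexts), self.dim), contexts
#
# sampler = AlrContextualMpEnvSampler("alr_envs:HoleReacherDMP-v1", num_envs=4)
# new_samples, new_contexts, obs, rewards, dones, infos = sampler(DummyDist(dim=30), n_samples=16)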
 if __name__ == "__main__":
     env_name = "alr_envs:ALRBallInACupSimpleDMP-v0"
     n_cpu = 8

View File

@ -2,26 +2,27 @@ import gym
 import numpy as np
 from mp_lib import det_promp
-from alr_envs.utils.wrapper.mp_wrapper import MPWrapper
+from alr_envs.utils.mps.mp_environments import MPEnv
+from alr_envs.utils.mps.mp_wrapper import MPWrapper
 class DetPMPWrapper(MPWrapper):
-    def __init__(self, env, num_dof, num_basis, width, start_pos=None, duration=1, dt=0.01, post_traj_time=0.,
-                 policy_type=None, weights_scale=1, zero_start=False, zero_goal=False, **mp_kwargs):
-        # self.duration = duration  # seconds
+    def __init__(self, env: MPEnv, num_dof: int, num_basis: int, width: int, duration: int = 1, dt: float = 0.01,
+                 post_traj_time: float = 0., policy_type: str = None, weights_scale: float = 1.,
+                 zero_start: bool = False, zero_goal: bool = False, **mp_kwargs):
+        self.duration = duration  # seconds
-        super().__init__(env, num_dof, duration, dt, post_traj_time, policy_type, weights_scale,
-                         num_basis=num_basis, width=width, start_pos=start_pos, zero_start=zero_start,
-                         zero_goal=zero_goal)
-        self.dt = dt
+        super().__init__(env, num_dof, dt, duration, post_traj_time, policy_type, weights_scale, num_basis=num_basis,
+                         width=width, zero_start=zero_start, zero_goal=zero_goal, **mp_kwargs)
         action_bounds = np.inf * np.ones((self.mp.n_basis * self.mp.n_dof))
         self.action_space = gym.spaces.Box(low=-action_bounds, high=action_bounds, dtype=np.float32)
-        self.start_pos = start_pos
+        self.dt = dt
     def initialize_mp(self, num_dof: int, duration: int, dt: float, num_basis: int = 5, width: float = None,
-                      start_pos: np.ndarray = None, zero_start: bool = False, zero_goal: bool = False):
+                      zero_start: bool = False, zero_goal: bool = False):
         pmp = det_promp.DeterministicProMP(n_basis=num_basis, n_dof=num_dof, width=width, off=0.01,
                                            zero_start=zero_start, zero_goal=zero_goal)

View File

@ -1,17 +1,18 @@
-from mp_lib.phase import ExpDecayPhaseGenerator
-from mp_lib.basis import DMPBasisGenerator
-from mp_lib import dmps
-import numpy as np
 import gym
+import numpy as np
+from mp_lib import dmps
+from mp_lib.basis import DMPBasisGenerator
+from mp_lib.phase import ExpDecayPhaseGenerator
-from alr_envs.utils.wrapper.mp_wrapper import MPWrapper
+from alr_envs.utils.mps.mp_environments import MPEnv
+from alr_envs.utils.mps.mp_wrapper import MPWrapper
 class DmpWrapper(MPWrapper):
-    def __init__(self, env: gym.Env, num_dof: int, num_basis: int, start_pos: np.ndarray = None,
-                 final_pos: np.ndarray = None, duration: int = 1, alpha_phase: float = 2., dt: float = None,
-                 learn_goal: bool = False, return_to_start: bool = False, post_traj_time: float = 0.,
+    def __init__(self, env: MPEnv, num_dof: int, num_basis: int,
+                 duration: int = 1, alpha_phase: float = 2., dt: float = None,
+                 learn_goal: bool = False, post_traj_time: float = 0.,
                  weights_scale: float = 1., goal_scale: float = 1., bandwidth_factor: float = 3.,
                  policy_type: str = None, render_mode: str = None):
@ -21,8 +22,6 @@ class DmpWrapper(MPWrapper):
             env:
             num_dof:
             num_basis:
-            start_pos:
-            final_pos:
             duration:
             alpha_phase:
             dt:
@ -35,25 +34,17 @@ class DmpWrapper(MPWrapper):
         self.learn_goal = learn_goal
         dt = env.dt if hasattr(env, "dt") else dt
         assert dt is not None
-        start_pos = start_pos if start_pos is not None else env.start_pos if hasattr(env, "start_pos") else None
-        assert start_pos is not None
-        if learn_goal:
-            final_pos = np.zeros_like(start_pos)  # arbitrary, will be learned
-        else:
-            final_pos = final_pos if final_pos is not None else start_pos if return_to_start else None
-        assert final_pos is not None
         self.t = np.linspace(0, duration, int(duration / dt))
         self.goal_scale = goal_scale
-        super().__init__(env, num_dof, duration, dt, post_traj_time, policy_type, weights_scale, render_mode,
-                         num_basis=num_basis, start_pos=start_pos, final_pos=final_pos, alpha_phase=alpha_phase,
-                         bandwidth_factor=bandwidth_factor)
+        super().__init__(env, num_dof, dt, duration, post_traj_time, policy_type, weights_scale, render_mode,
+                         num_basis=num_basis, alpha_phase=alpha_phase, bandwidth_factor=bandwidth_factor)
         action_bounds = np.inf * np.ones((np.prod(self.mp.dmp_weights.shape) + (num_dof if learn_goal else 0)))
         self.action_space = gym.spaces.Box(low=-action_bounds, high=action_bounds, dtype=np.float32)
-    def initialize_mp(self, num_dof: int, duration: int, dt: float, num_basis: int = 5, start_pos: np.ndarray = None,
-                      final_pos: np.ndarray = None, alpha_phase: float = 2., bandwidth_factor: float = 3.):
+    def initialize_mp(self, num_dof: int, duration: int, dt: float, num_basis: int = 5, alpha_phase: float = 2.,
+                      bandwidth_factor: int = 3):
         phase_generator = ExpDecayPhaseGenerator(alpha_phase=alpha_phase, duration=duration)
         basis_generator = DMPBasisGenerator(phase_generator, duration=duration, num_basis=num_basis,
@ -62,12 +53,6 @@ class DmpWrapper(MPWrapper):
         dmp = dmps.DMP(num_dof=num_dof, basis_generator=basis_generator, phase_generator=phase_generator,
                        num_time_steps=int(duration / dt), dt=dt)
-        dmp.dmp_start_pos = start_pos.reshape((1, num_dof))
-        weights = np.zeros((num_basis, num_dof))
-        goal_pos = np.zeros(num_dof) if self.learn_goal else final_pos
-        dmp.set_weights(weights, goal_pos)
         return dmp
     def goal_and_weights(self, params):
@ -77,16 +62,15 @@ class DmpWrapper(MPWrapper):
         if self.learn_goal:
             goal_pos = params[0, -self.mp.num_dimensions:]  # [num_dof]
             params = params[:, :-self.mp.num_dimensions]  # [1,num_dof]
-            # weight_matrix = np.reshape(params[:, :-self.num_dof], [self.num_basis, self.num_dof])
         else:
-            goal_pos = self.mp.dmp_goal_pos.flatten()
+            goal_pos = self.env.goal_pos
             assert goal_pos is not None
-        # weight_matrix = np.reshape(params, [self.num_basis, self.num_dof])
-        weight_matrix = np.reshape(params, self.mp.dmp_weights.shape)
+        weight_matrix = np.reshape(params, self.mp.dmp_weights.shape)  # [num_basis, num_dof]
         return goal_pos * self.goal_scale, weight_matrix * self.weights_scale
     def mp_rollout(self, action):
+        self.mp.dmp_start_pos = self.env.start_pos
         goal_pos, weight_matrix = self.goal_and_weights(action)
         self.mp.set_weights(weight_matrix, goal_pos)
         return self.mp.reference_trajectory(self.t)

View File

@ -0,0 +1,33 @@
from abc import abstractmethod
from typing import Union
import gym
import numpy as np
class MPEnv(gym.Env):
@property
@abstractmethod
def active_obs(self):
"""Returns boolean mask for each observation entry
whether the observation is returned for the contextual case or not.
This effectively allows to filter unwanted or unnecessary observations from the full step-based case.
"""
return np.ones(self.observation_space.shape, dtype=bool)
@property
@abstractmethod
def start_pos(self) -> Union[float, int, np.ndarray]:
"""
Returns the starting position of the joints
"""
raise NotImplementedError()
@property
def goal_pos(self) -> Union[float, int, np.ndarray]:
"""
Returns the current final position of the joints for the MP.
By default this returns the starting position.
"""
return self.start_pos
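# Usage sketch (illustrative only): a concrete environment opts into the MP wrappers by
# implementing this interface. The attribute names below are assumptions for illustration.
#
# class MyReacherEnv(MPEnv):
#     @property
#     def active_obs(self):
#         # e.g. hide the joint angles and expose only the 2D target position to the contextual case
#         return np.hstack([np.zeros(self.n_links, dtype=bool), np.ones(2, dtype=bool)])
#
#     @property
#     def start_pos(self):
#         # initial joint configuration, used by the MP wrappers as the trajectory start
#         return self._start_pos
#
#     # goal_pos is inherited and falls back to start_pos unless overridden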

View File

@ -1,32 +1,24 @@
 from abc import ABC, abstractmethod
-from collections import defaultdict
 import gym
 import numpy as np
+from alr_envs.utils.mps.mp_environments import MPEnv
 from alr_envs.utils.policies import get_policy_class
 class MPWrapper(gym.Wrapper, ABC):
-    def __init__(self,
-                 env: gym.Env,
-                 num_dof: int,
-                 duration: int = 1,
-                 dt: float = None,
-                 post_traj_time: float = 0.,
-                 policy_type: str = None,
-                 weights_scale: float = 1.,
-                 render_mode: str = None,
-                 **mp_kwargs
-                 ):
+    def __init__(self, env: MPEnv, num_dof: int, dt: float, duration: int = 1, post_traj_time: float = 0.,
+                 policy_type: str = None, weights_scale: float = 1., render_mode: str = None, **mp_kwargs):
         super().__init__(env)
-        # self.num_dof = num_dof
-        # self.num_basis = num_basis
-        # self.duration = duration  # seconds
-        # dt = env.dt if hasattr(env, "dt") else dt
+        # adjust observation space to reduce version
+        obs_sp = self.env.observation_space
+        self.observation_space = gym.spaces.Box(low=obs_sp.low[self.env.active_obs],
+                                                high=obs_sp.high[self.env.active_obs],
+                                                dtype=obs_sp.dtype)
         assert dt is not None  # this should never happen as MPWrapper is a base class
         self.post_traj_steps = int(post_traj_time / dt)
@ -40,8 +32,11 @@ class MPWrapper(gym.Wrapper, ABC):
         self.render_mode = render_mode
         self.render_kwargs = {}
-    # TODO: not yet final
+    # TODO: @Max I think this should not be in this class, this functionality should be part of your sampler.
     def __call__(self, params, contexts=None):
+        """
+        Can be used to provide a batch of parameter sets
+        """
         params = np.atleast_2d(params)
         obs = []
         rewards = []
@ -61,6 +56,9 @@ class MPWrapper(gym.Wrapper, ABC):
     def configure(self, context):
         self.env.configure(context)
+    def reset(self):
+        return self.env.reset()[self.env.active_obs]
     def step(self, action: np.ndarray):
         """ This function generates a trajectory based on a DMP and then does the usual loop over reset and step"""
         trajectory, velocity = self.mp_rollout(action)
@ -73,14 +71,9 @@ class MPWrapper(gym.Wrapper, ABC):
         # self._velocity = velocity
         rewards = 0
-        # infos = defaultdict(list)
-        # TODO: @Max Why do we need this configure, states should be part of the model
-        # TODO: Ask Onur if the context distribution needs to be outside the environment
-        # TODO: For now create a new env with each context
-        # self.env.configure(context)
-        obs = self.env.reset()
         info = {}
+        # create random obs as the reset function is called externally
+        obs = self.env.observation_space.sample()
         for t, pos_vel in enumerate(zip(trajectory, velocity)):
             ac = self.policy.get_action(pos_vel[0], pos_vel[1])
@ -94,7 +87,7 @@ class MPWrapper(gym.Wrapper, ABC):
                 break
         done = True
-        return obs, rewards, done, info
+        return obs[self.env.active_obs], rewards, done, info
     def render(self, mode='human', **kwargs):
         """Only set render options here, such that they can be used during the rollout.
@ -102,18 +95,6 @@ class MPWrapper(gym.Wrapper, ABC):
         self.render_mode = mode
         self.render_kwargs = kwargs
-    # def __call__(self, actions):
-    #     return self.step(actions)
-    #     params = np.atleast_2d(params)
-    #     rewards = []
-    #     infos = []
-    #     for p, c in zip(params, contexts):
-    #         reward, info = self.rollout(p, c)
-    #         rewards.append(reward)
-    #         infos.append(info)
-    #
-    #     return np.array(rewards), infos
     @abstractmethod
     def mp_rollout(self, action):
         """

View File

@ -46,7 +46,7 @@ def example_dmp():
     obs = env.reset()
-def example_async(n_cpu=4, seed=int('533D', 16)):
+def example_async(env_id="alr_envs:HoleReacherDMP-v0", n_cpu=4, seed=int('533D', 16)):
     def make_env(env_id, seed, rank):
         env = gym.make(env_id)
         env.seed(seed + rank)
@ -73,7 +73,7 @@ def example_async(n_cpu=4, seed=int('533D', 16)):
         # do not return values above threshold
         return (*map(lambda v: np.stack(v)[:n_samples], vals.values()),)
-    envs = gym.vector.AsyncVectorEnv([make_env("alr_envs:HoleReacherDMP-v0", seed, i) for i in range(n_cpu)])
+    envs = gym.vector.AsyncVectorEnv([make_env(env_id, seed, i) for i in range(n_cpu)])
     obs = envs.reset()
     print(sample(envs, 16))
@ -82,6 +82,6 @@ def example_async(n_cpu=4, seed=int('533D', 16)):
 if __name__ == '__main__':
     # example_mujoco()
     # example_dmp()
-    # example_async()
-    env = gym.make("alr_envs:HoleReacherDMP-v0", context=0.1)
-    print()
+    example_async("alr_envs:LongSimpleReacherDMP-v0", 4)
+    # env = gym.make("alr_envs:HoleReacherDMP-v0", context=0.1)
+    # env = gym.make("alr_envs:HoleReacherDMP-v1")