diff --git a/README.md b/README.md index e9f88a1..37b2a93 100644 --- a/README.md +++ b/README.md @@ -1,34 +1,50 @@ -## ALR Custom Environments +## ALR Environments -This repository collects custom RL envs not included in Suits like OpenAI gym, rllab, etc. -Creating a custom (Mujoco) gym environement can be done according to [this guide](https://github.com/openai/gym/blob/master/docs/creating-environments.md). -For stochastic search problems with gym interace use the Rosenbrock reference implementation. +This repository collects custom Robotics environments not included in benchmark suites like OpenAI gym, rllab, etc. +Creating a custom (Mujoco) gym environment can be done according to [this guide](https://github.com/openai/gym/blob/master/docs/creating-environments.md). +For stochastic search problems with gym interface use the `Rosenbrock-v0` reference implementation. +We also support to solve environments with DMPs. When adding new DMP tasks check the `ViaPointReacherDMP-v0` reference implementation. +When simply using the tasks, you can also leverage the wrapper class `DmpWrapper` to turn normal gym environments in to DMP tasks. ## Environments -Currently we have the following environements: +Currently we have the following environments: ### Mujoco -|Name| Description| -|---|---| -|`ALRReacher-v0`|Modified (5 links) Mujoco gym's `Reacher-v2` (2 links)| -|`ALRReacherSparse-v0`|Same as `ALRReacher-v0`, but the distance penalty is only provided in the last time step.| -|`ALRReacherSparseBalanced-v0`|Same as `ALRReacherSparse-v0`, but the the end effector has to stay upright.| -|`ALRReacherShort-v0`|Same as `ALRReacher-v0`, but the episode length is reduced to 50.| -|`ALRReacherShortSparse-v0`|Combination of `ALRReacherSparse-v0` and `ALRReacherShort-v0`.| -|`ALRReacher7-v0`|Modified (7 links) Mujoco gym's `Reacher-v2` (2 links)| -|`ALRReacher7Sparse-v0`|Same as `ALRReacher7-v0`, but the distance penalty is only provided in the last time step.| +|Name| Description|Horizon|Action Dimension|Observation Dimension +|---|---|---|---|---| +|`ALRReacher-v0`|Modified (5 links) Mujoco gym's `Reacher-v2` (2 links)| 200 | 5 | 21 +|`ALRReacherSparse-v0`|Same as `ALRReacher-v0`, but the distance penalty is only provided in the last time step.| 200 | 5 | 21 +|`ALRReacherSparseBalanced-v0`|Same as `ALRReacherSparse-v0`, but the end-effector has to remain upright.| 200 | 5 | 21 +|`ALRLongReacher-v0`|Modified (7 links) Mujoco gym's `Reacher-v2` (2 links)| 200 | 7 | 27 +|`ALRLongReacherSparse-v0`|Same as `ALRLongReacher-v0`, but the distance penalty is only provided in the last time step.| 200 | 7 | 27 +|`ALRLongReacherSparseBalanced-v0`|Same as `ALRLongReacherSparse-v0`, but the end-effector has to remain upright.| 200 | 7 | 27 ### Classic Control -|Name| Description| -|---|---| -|`SimpleReacher-v0`| Simple Reaching Task without any physics simulation. Returns no reward until 150 time steps. This allows the agent to explore the space, but requires precise actions towards the end of the trajectory.| +|Name| Description|Horizon|Action Dimension|Observation Dimension +|---|---|---|---|---| +|`SimpleReacher-v0`| Simple reaching task (2 links) without any physics simulation. Provides no reward until 150 time steps. This allows the agent to explore the space, but requires precise actions towards the end of the trajectory.| 200 | 2 | 9 +|`LongSimpleReacher-v0`| Simple reaching task (5 links) without any physics simulation. Provides no reward until 150 time steps. This allows the agent to explore the space, but requires precise actions towards the end of the trajectory.| 200 | 5 | 18 +|`ViaPointReacher-v0`| Simple reaching task leveraging a via point, which supports self collision detection. Provides a reward only at 100 and 199 for reaching the viapoint and goal point, respectively.| 200 | +|`HoleReacher-v0`| + +### DMP Environments +These environments are closer to stochastic search. They always execute a full trajectory, which is computed by a DMP and executed by a controller, e.g. a PD controller. +The goal is to learn the parameters of this DMP to generate a suitable trajectory. +All environments provide the full episode reward and additional information about early terminations, e.g. due to collisions. + +|Name| Description|Horizon|Action Dimension|Observation Dimension +|---|---|---|---|---| +|`ViaPointReacherDMP-v0`| Simple reaching task leveraging a via point, which supports self collision detection. Provides a reward only at 100 and 199 for reaching the viapoint and goal point, respectively.| 200 | +|`HoleReacherDMP-v0`| +|`HoleReacherFixedGoalDMP-v0`| +|`HoleReacherDetPMP-v0`| ### Stochastic Search -|Name| Description| -|---|---| -|`Rosenbrock{dim}-v0`| Gym interface for Rosenbrock function. `{dim}` is one of 5, 10, 25, 50 or 100. | +|Name| Description|Horizon|Action Dimension|Observation Dimension +|---|---|---|---|---| +|`Rosenbrock{dim}-v0`| Gym interface for Rosenbrock function. `{dim}` is one of 5, 10, 25, 50 or 100. | 1 | `{dim}` | 0 ## Install diff --git a/alr_envs/__init__.py b/alr_envs/__init__.py index 9406da2..7bff610 100644 --- a/alr_envs/__init__.py +++ b/alr_envs/__init__.py @@ -1,6 +1,9 @@ from gym.envs.registration import register from alr_envs.stochastic_search.functions.f_rosenbrock import Rosenbrock +from alr_envs.utils.wrapper.dmp_wrapper import DmpWrapper + +# Mujoco register( id='ALRReacher-v0', @@ -9,6 +12,7 @@ register( kwargs={ "steps_before_reward": 0, "n_links": 5, + "balance": False, } ) @@ -19,6 +23,7 @@ register( kwargs={ "steps_before_reward": 200, "n_links": 5, + "balance": False, } ) @@ -34,62 +39,46 @@ register( ) register( - id='ALRReacherShort-v0', - entry_point='alr_envs.mujoco:ALRReacherEnv', - max_episode_steps=50, - kwargs={ - "steps_before_reward": 0, - "n_links": 5, - } -) - -register( - id='ALRReacherShortSparse-v0', - entry_point='alr_envs.mujoco:ALRReacherEnv', - max_episode_steps=50, - kwargs={ - "steps_before_reward": 50, - "n_links": 5, - } -) - -register( - id='ALRReacher7-v0', + id='ALRLongReacher-v0', entry_point='alr_envs.mujoco:ALRReacherEnv', max_episode_steps=200, kwargs={ "steps_before_reward": 0, "n_links": 7, - } -) - -# register( -# id='ALRReacherSparse-v0', -# entry_point='alr_envs.mujoco:ALRReacherEnv', -# max_episode_steps=200, -# kwargs={ -# "steps_before_reward": 200, -# "n_links": 7, -# } -# ) - -register( - id='ALRReacher7Short-v0', - entry_point='alr_envs.mujoco:ALRReacherEnv', - max_episode_steps=50, - kwargs={ - "steps_before_reward": 0, - "n_links": 7, + "balance": False, } ) register( - id='ALRReacher7ShortSparse-v0', + id='ALRLongReacherSparse-v0', entry_point='alr_envs.mujoco:ALRReacherEnv', - max_episode_steps=50, + max_episode_steps=200, kwargs={ - "steps_before_reward": 50, + "steps_before_reward": 200, "n_links": 7, + "balance": False, + } +) + +register( + id='ALRLongReacherSparseBalanced-v0', + entry_point='alr_envs.mujoco:ALRReacherEnv', + max_episode_steps=200, + kwargs={ + "steps_before_reward": 200, + "n_links": 7, + "balance": True, + } +) + +# Classic control + +register( + id='Balancing-v0', + entry_point='alr_envs.mujoco:BalancingEnv', + max_episode_steps=200, + kwargs={ + "n_links": 5, } ) @@ -103,7 +92,7 @@ register( ) register( - id='SimpleReacher5-v0', + id='LongSimpleReacher-v0', entry_point='alr_envs.classic_control:SimpleReacherEnv', max_episode_steps=200, kwargs={ @@ -111,12 +100,66 @@ register( } ) +register( + id='ViaPointReacher-v0', + entry_point='alr_envs.classic_control.viapoint_reacher:ViaPointReacher', + max_episode_steps=200, + kwargs={ + "n_links": 5, + "allow_self_collision": False, + "collision_penalty": 1000 + } +) + +register( + id='HoleReacher-v0', + entry_point='alr_envs.classic_control.hole_reacher:HoleReacher', + max_episode_steps=200, + kwargs={ + "n_links": 5, + "allow_self_collision": False, + "allow_wall_collision": False, + "hole_width": 0.15, + "hole_depth": 1, + "hole_x": 1, + "collision_penalty": 100, + } +) + +# DMP environments + +register( + id='ViaPointReacherDMP-v0', + entry_point='alr_envs.classic_control.viapoint_reacher:viapoint_dmp', + # max_episode_steps=1, +) + +register( + id='HoleReacherDMP-v0', + entry_point='alr_envs.classic_control.hole_reacher:holereacher_dmp', + # max_episode_steps=1, +) + +register( + id='HoleReacherFixedGoalDMP-v0', + entry_point='alr_envs.classic_control.hole_reacher:holereacher_fix_goal_dmp', + # max_episode_steps=1, +) + +register( + id='HoleReacherDetPMP-v0', + entry_point='alr_envs.classic_control.hole_reacher:holereacher_detpmp', + # max_episode_steps=1, +) + +# BBO functions + for dim in [5, 10, 25, 50, 100]: register( id=f'Rosenbrock{dim}-v0', entry_point='alr_envs.stochastic_search:StochasticSearchEnv', max_episode_steps=1, kwargs={ - "cost_f": Rosenbrock, + "cost_f": Rosenbrock(dim), } ) diff --git a/alr_envs/classic_control/__init__.py b/alr_envs/classic_control/__init__.py index 53e2448..a831c43 100644 --- a/alr_envs/classic_control/__init__.py +++ b/alr_envs/classic_control/__init__.py @@ -1 +1,3 @@ from alr_envs.classic_control.simple_reacher import SimpleReacherEnv +from alr_envs.classic_control.viapoint_reacher import ViaPointReacher +from alr_envs.classic_control.hole_reacher import HoleReacher \ No newline at end of file diff --git a/alr_envs/classic_control/hole_reacher.py b/alr_envs/classic_control/hole_reacher.py index 2b39a2c..8f8feda 100644 --- a/alr_envs/classic_control/hole_reacher.py +++ b/alr_envs/classic_control/hole_reacher.py @@ -3,9 +3,12 @@ import numpy as np import matplotlib.pyplot as plt from matplotlib import patches +from alr_envs import DmpWrapper +from alr_envs.utils.wrapper.detpmp_wrapper import DetPMPWrapper + def ccw(A, B, C): - return (C[1]-A[1]) * (B[0]-A[0]) - (B[1]-A[1]) * (C[0]-A[0]) > 1e-12 + return (C[1] - A[1]) * (B[0] - A[0]) - (B[1] - A[1]) * (C[0] - A[0]) > 1e-12 # Return true if line segments AB and CD intersect @@ -13,37 +16,66 @@ def intersect(A, B, C, D): return ccw(A, C, D) != ccw(B, C, D) and ccw(A, B, C) != ccw(A, B, D) +def holereacher_dmp(**kwargs): + _env = gym.make("alr_envs:HoleReacher-v0") + # _env = HoleReacher(**kwargs) + return DmpWrapper(_env, num_dof=5, num_basis=5, duration=2, dt=_env.dt, learn_goal=True, alpha_phase=3.5, + start_pos=_env.start_pos, policy_type="velocity", weights_scale=100, goal_scale=0.1) + + +def holereacher_fix_goal_dmp(**kwargs): + _env = gym.make("alr_envs:HoleReacher-v0") + # _env = HoleReacher(**kwargs) + return DmpWrapper(_env, num_dof=5, num_basis=5, duration=2, dt=_env.dt, learn_goal=False, alpha_phase=3.5, + start_pos=_env.start_pos, policy_type="velocity", weights_scale=50, goal_scale=1, + final_pos=np.array([2.02669572, -1.25966385, -1.51618198, -0.80946476, 0.02012344])) + + +def holereacher_detpmp(**kwargs): + _env = gym.make("alr_envs:HoleReacher-v0") + # _env = HoleReacher(**kwargs) + return DetPMPWrapper(_env, num_dof=5, num_basis=5, width=0.005, policy_type="velocity", start_pos=_env.start_pos, + duration=2, post_traj_time=0, dt=_env.dt, weights_scale=0.25, zero_start=True, zero_goal=False) + + class HoleReacher(gym.Env): - def __init__(self, num_links, hole_x, hole_width, hole_depth, allow_self_collision=False, + def __init__(self, n_links, hole_x, hole_width, hole_depth, allow_self_collision=False, allow_wall_collision=False, collision_penalty=1000): + + self.n_links = n_links + self.link_lengths = np.ones((n_links, 1)) + + # task self.hole_x = hole_x # x-position of center of hole self.hole_width = hole_width # width of hole self.hole_depth = hole_depth # depth of hole - self.num_links = num_links - self.link_lengths = np.ones((num_links, 1)) + self.bottom_center_of_hole = np.hstack([hole_x, -hole_depth]) self.top_center_of_hole = np.hstack([hole_x, 0]) - self.left_wall_edge = np.hstack([hole_x - self.hole_width/2, 0]) + self.left_wall_edge = np.hstack([hole_x - self.hole_width / 2, 0]) self.right_wall_edge = np.hstack([hole_x + self.hole_width / 2, 0]) + + # collision self.allow_self_collision = allow_self_collision self.allow_wall_collision = allow_wall_collision self.collision_penalty = collision_penalty + # state self._joints = None self._joint_angles = None self._angle_velocity = None - self.start_pos = np.hstack([[np.pi/2], np.zeros(self.num_links - 1)]) - self.start_vel = np.zeros(self.num_links) + self.start_pos = np.hstack([[np.pi / 2], np.zeros(self.n_links - 1)]) + self.start_vel = np.zeros(self.n_links) self.dt = 0.01 - self.time_limit = 2 + # self.time_limit = 2 - action_bound = np.pi * np.ones((self.num_links,)) + action_bound = np.pi * np.ones((self.n_links,)) state_bound = np.hstack([ - [np.pi] * self.num_links, # cos - [np.pi] * self.num_links, # sin - [np.inf] * self.num_links, # velocity + [np.pi] * self.n_links, # cos + [np.pi] * self.n_links, # sin + [np.inf] * self.n_links, # velocity [np.inf] * 2, # x-y coordinates of target distance [np.inf] # env steps, because reward start after n steps TODO: Maybe ]) @@ -51,11 +83,11 @@ class HoleReacher(gym.Env): self.observation_space = gym.spaces.Box(low=-state_bound, high=state_bound, shape=state_bound.shape) self.fig = None - rect_1 = patches.Rectangle((-self.num_links, -1), - self.num_links + self.hole_x - self.hole_width / 2, 1, + rect_1 = patches.Rectangle((-self.n_links, -1), + self.n_links + self.hole_x - self.hole_width / 2, 1, fill=True, edgecolor='k', facecolor='k') rect_2 = patches.Rectangle((self.hole_x + self.hole_width / 2, -1), - self.num_links - self.hole_x + self.hole_width / 2, 1, + self.n_links - self.hole_x + self.hole_width / 2, 1, fill=True, edgecolor='k', facecolor='k') rect_3 = patches.Rectangle((self.hole_x - self.hole_width / 2, -1), self.hole_width, 1 - self.hole_depth, @@ -65,7 +97,7 @@ class HoleReacher(gym.Env): @property def end_effector(self): - return self._joints[self.num_links].T + return self._joints[self.n_links].T def configure(self, context): pass @@ -73,13 +105,13 @@ class HoleReacher(gym.Env): def reset(self): self._joint_angles = self.start_pos self._angle_velocity = self.start_vel - self._joints = np.zeros((self.num_links + 1, 2)) + self._joints = np.zeros((self.n_links + 1, 2)) self._update_joints() self._steps = 0 return self._get_obs().copy() - def step(self, action): + def step(self, action: np.ndarray): """ a single step with an action in joint velocity space """ @@ -114,7 +146,8 @@ class HoleReacher(gym.Env): self._steps += 1 - done = self._steps * self.dt > self.time_limit or self._is_collided + # done = self._steps * self.dt > self.time_limit or self._is_collided + done = self._is_collided return self._get_obs().copy(), reward, done, info @@ -152,18 +185,6 @@ class HoleReacher(gym.Env): self._steps ]) - # def _reward(self): - # dist_reward = 0 - # if not self._is_collided: - # if self._steps == 180: - # dist_reward = np.linalg.norm(self.end_effector - self.bottom_center_of_hole) - # else: - # dist_reward = np.linalg.norm(self.end_effector - self.bottom_center_of_hole) - # - # out = - dist_reward ** 2 - # - # return out - def get_forward_kinematics(self, num_points_per_link=1): theta = self._joint_angles[:, None] @@ -174,7 +195,7 @@ class HoleReacher(gym.Env): accumulated_theta = np.cumsum(theta, axis=0) - endeffector = np.zeros(shape=(self.num_links, num_points_per_link, 2)) + endeffector = np.zeros(shape=(self.n_links, num_points_per_link, 2)) x = np.cos(accumulated_theta) * self.link_lengths * intermediate_points y = np.sin(accumulated_theta) * self.link_lengths * intermediate_points @@ -182,7 +203,7 @@ class HoleReacher(gym.Env): endeffector[0, :, 0] = x[0, :] endeffector[0, :, 1] = y[0, :] - for i in range(1, self.num_links): + for i in range(1, self.n_links): endeffector[i, :, 0] = x[i, :] + endeffector[i - 1, -1, 0] endeffector[i, :, 1] = y[i, :] + endeffector[i - 1, -1, 1] @@ -190,7 +211,7 @@ class HoleReacher(gym.Env): def check_self_collision(self, line_points): for i, line1 in enumerate(line_points): - for line2 in line_points[i+2:, :, :]: + for line2 in line_points[i + 2:, :, :]: # if line1 != line2: if intersect(line1[0], line1[-1], line2[0], line2[-1]): return True @@ -218,7 +239,7 @@ class HoleReacher(gym.Env): # all points that are above the hole r, c = np.where((line_points[:, :, 0] > (self.hole_x - self.hole_width / 2)) & ( - line_points[:, :, 0] < (self.hole_x + self.hole_width / 2))) + line_points[:, :, 0] < (self.hole_x + self.hole_width / 2))) # check if any of those points are below surface nr_line_points_below_surface_in_hole = np.sum(line_points[r, c, 1] < -self.hole_depth) @@ -250,7 +271,7 @@ class HoleReacher(gym.Env): plt.xlim([-lim, lim]) plt.ylim([-1.1, lim]) # plt.draw() - plt.pause(1e-4) # pushes window to foreground, which is annoying. + plt.pause(1e-4) # pushes window to foreground, which is annoying. # self.fig.canvas.flush_events() elif mode == "partial": @@ -280,7 +301,7 @@ class HoleReacher(gym.Env): # Add the patch to the Axes [plt.gca().add_patch(rect) for rect in self.patches] - plt.xlim(-self.num_links, self.num_links), plt.ylim(-1, self.num_links) + plt.xlim(-self.n_links, self.n_links), plt.ylim(-1, self.n_links) # Arm plt.plot(self._joints[:, 0], self._joints[:, 1], 'ro-', markerfacecolor='k') @@ -294,7 +315,8 @@ class HoleReacher(gym.Env): if __name__ == '__main__': nl = 5 render_mode = "human" # "human" or "partial" or "final" - env = HoleReacher(num_links=nl, allow_self_collision=False, allow_wall_collision=False, hole_width=0.15, hole_depth=1, hole_x=1) + env = HoleReacher(n_links=nl, allow_self_collision=False, allow_wall_collision=False, hole_width=0.15, + hole_depth=1, hole_x=1) env.reset() # env.render(mode=render_mode) diff --git a/alr_envs/classic_control/simple_reacher.py b/alr_envs/classic_control/simple_reacher.py index f547ad6..296662c 100644 --- a/alr_envs/classic_control/simple_reacher.py +++ b/alr_envs/classic_control/simple_reacher.py @@ -1,11 +1,9 @@ -import os - import gym -import matplotlib as mpl import matplotlib.pyplot as plt import numpy as np from gym import spaces from gym.utils import seeding + from alr_envs.utils.utils import angle_normalize @@ -33,7 +31,7 @@ class SimpleReacherEnv(gym.Env): self._angle_velocity = None self.max_torque = 1 # 10 - self.steps_before_reward = 180 + self.steps_before_reward = 199 action_bound = np.ones((self.n_links,)) state_bound = np.hstack([ @@ -92,7 +90,7 @@ class SimpleReacherEnv(gym.Env): def _update_joints(self): """ - update _joints to get new end effector position. The other links are only required for rendering. + update joints to get new end-effector position. The other links are only required for rendering. Returns: """ @@ -106,7 +104,7 @@ class SimpleReacherEnv(gym.Env): # TODO: Is this the best option if self._steps >= self.steps_before_reward: - reward_dist = - np.linalg.norm(diff) + reward_dist -= np.linalg.norm(diff) # reward_dist = np.exp(-0.1 * diff ** 2).mean() # reward_dist = - (diff ** 2).mean() diff --git a/alr_envs/classic_control/utils.py b/alr_envs/classic_control/utils.py index 89061cb..5e362da 100644 --- a/alr_envs/classic_control/utils.py +++ b/alr_envs/classic_control/utils.py @@ -1,7 +1,7 @@ from alr_envs.classic_control.hole_reacher import HoleReacher from alr_envs.classic_control.viapoint_reacher import ViaPointReacher -from alr_envs.utils.dmp_env_wrapper import DmpEnvWrapper -from alr_envs.utils.detpmp_env_wrapper import DetPMPEnvWrapper +from alr_envs.utils.wrapper.dmp_wrapper import DmpWrapper +from alr_envs.utils.wrapper.detpmp_wrapper import DetPMPWrapper import numpy as np @@ -17,20 +17,20 @@ def make_viapointreacher_env(rank, seed=0): """ def _init(): - _env = ViaPointReacher(num_links=5, + _env = ViaPointReacher(n_links=5, allow_self_collision=False, collision_penalty=1000) - _env = DmpEnvWrapper(_env, - num_dof=5, - num_basis=5, - duration=2, - alpha_phase=2.5, - dt=_env.dt, - start_pos=_env.start_pos, - learn_goal=False, - policy_type="velocity", - weights_scale=50) + _env = DmpWrapper(_env, + num_dof=5, + num_basis=5, + duration=2, + alpha_phase=2.5, + dt=_env.dt, + start_pos=_env.start_pos, + learn_goal=False, + policy_type="velocity", + weights_scale=50) _env.seed(seed + rank) return _env @@ -49,7 +49,7 @@ def make_holereacher_env(rank, seed=0): """ def _init(): - _env = HoleReacher(num_links=5, + _env = HoleReacher(n_links=5, allow_self_collision=False, allow_wall_collision=False, hole_width=0.25, @@ -57,19 +57,18 @@ def make_holereacher_env(rank, seed=0): hole_x=2, collision_penalty=100) - _env = DmpEnvWrapper(_env, - num_dof=5, - num_basis=5, - duration=2, - bandwidth_factor=2, - dt=_env.dt, - learn_goal=True, - alpha_phase=2, - start_pos=_env.start_pos, - policy_type="velocity", - weights_scale=50, - goal_scale=0.1 - ) + _env = DmpWrapper(_env, + num_dof=5, + num_basis=5, + duration=2, + dt=_env.dt, + learn_goal=True, + alpha_phase=2, + start_pos=_env.start_pos, + policy_type="velocity", + weights_scale=50, + goal_scale=0.1 + ) _env.seed(seed + rank) return _env @@ -89,7 +88,7 @@ def make_holereacher_fix_goal_env(rank, seed=0): """ def _init(): - _env = HoleReacher(num_links=5, + _env = HoleReacher(n_links=5, allow_self_collision=False, allow_wall_collision=False, hole_width=0.15, @@ -97,19 +96,19 @@ def make_holereacher_fix_goal_env(rank, seed=0): hole_x=1, collision_penalty=100) - _env = DmpEnvWrapper(_env, - num_dof=5, - num_basis=5, - duration=2, - dt=_env.dt, - learn_goal=False, - final_pos=np.array([2.02669572, -1.25966385, -1.51618198, -0.80946476, 0.02012344]), - alpha_phase=3, - start_pos=_env.start_pos, - policy_type="velocity", - weights_scale=50, - goal_scale=1 - ) + _env = DmpWrapper(_env, + num_dof=5, + num_basis=5, + duration=2, + dt=_env.dt, + learn_goal=False, + final_pos=np.array([2.02669572, -1.25966385, -1.51618198, -0.80946476, 0.02012344]), + alpha_phase=2, + start_pos=_env.start_pos, + policy_type="velocity", + weights_scale=50, + goal_scale=1 + ) _env.seed(seed + rank) return _env @@ -129,28 +128,27 @@ def make_holereacher_env_pmp(rank, seed=0): """ def _init(): - _env = HoleReacher(num_links=5, + _env = HoleReacher(n_links=5, allow_self_collision=False, allow_wall_collision=False, hole_width=0.15, hole_depth=1, hole_x=1, - collision_penalty=100) + collision_penalty=1000) - _env = DetPMPEnvWrapper(_env, - num_dof=5, - num_basis=5, - width=0.02, - off=0., - policy_type="velocity", - start_pos=_env.start_pos, - duration=2, - post_traj_time=0, - dt=_env.dt, - weights_scale=0.2, - zero_start=False, - zero_goal=False - ) + _env = DetPMPWrapper(_env, + num_dof=5, + num_basis=5, + width=0.02, + policy_type="velocity", + start_pos=_env.start_pos, + duration=2, + post_traj_time=0, + dt=_env.dt, + weights_scale=0.2, + zero_start=True, + zero_goal=False + ) _env.seed(seed + rank) return _env diff --git a/alr_envs/classic_control/viapoint_reacher.py b/alr_envs/classic_control/viapoint_reacher.py index ed915b8..2278bbc 100644 --- a/alr_envs/classic_control/viapoint_reacher.py +++ b/alr_envs/classic_control/viapoint_reacher.py @@ -1,39 +1,43 @@ import gym -import numpy as np import matplotlib.pyplot as plt -from matplotlib import patches +import numpy as np + +from alr_envs import DmpWrapper +from alr_envs.utils.utils import check_self_collision -def ccw(A, B, C): - return (C[1]-A[1]) * (B[0]-A[0]) - (B[1]-A[1]) * (C[0]-A[0]) > 1e-12 - - -# Return true if line segments AB and CD intersect -def intersect(A, B, C, D): - return ccw(A, C, D) != ccw(B, C, D) and ccw(A, B, C) != ccw(A, B, D) +def viapoint_dmp(**kwargs): + _env = gym.make("alr_envs:ViaPointReacher-v0") + # _env = ViaPointReacher(**kwargs) + return DmpWrapper(_env, num_dof=5, num_basis=5, duration=2, alpha_phase=2.5, dt=_env.dt, + start_pos=_env.start_pos, learn_goal=False, policy_type="velocity", weights_scale=50) class ViaPointReacher(gym.Env): - def __init__(self, num_links, allow_self_collision=False, - collision_penalty=1000): - self.num_links = num_links - self.link_lengths = np.ones((num_links, 1)) + def __init__(self, n_links, allow_self_collision=False, collision_penalty=1000): + self.num_links = n_links + self.link_lengths = np.ones((n_links, 1)) + + # task + self.via_point = np.ones(2) + self.goal_point = np.array((n_links, 0)) + + # collision self.allow_self_collision = allow_self_collision self.collision_penalty = collision_penalty - self.via_point = np.ones(2) - self.goal_point = np.array((num_links, 0)) - + # state self._joints = None self._joint_angles = None self._angle_velocity = None - self.start_pos = np.hstack([[np.pi/2], np.zeros(self.num_links - 1)]) + self.start_pos = np.hstack([[np.pi / 2], np.zeros(self.num_links - 1)]) self.start_vel = np.zeros(self.num_links) self.weight_matrix_scale = 1 + self._steps = 0 self.dt = 0.01 - self.time_limit = 2 + # self.time_limit = 2 action_bound = np.pi * np.ones((self.num_links,)) state_bound = np.hstack([ @@ -64,7 +68,7 @@ class ViaPointReacher(gym.Env): return self._get_obs().copy() - def step(self, action): + def step(self, action: np.ndarray): """ a single step with an action in joint velocity space """ @@ -75,24 +79,18 @@ class ViaPointReacher(gym.Env): self._update_joints() - # rew = self._reward() - - # compute reward directly in step function - dist_reward = 0 if not self._is_collided: if self._steps == 100: dist_reward = np.linalg.norm(self.end_effector - self.via_point) - if self._steps == 199: + elif self._steps == 199: dist_reward = np.linalg.norm(self.end_effector - self.goal_point) + # TODO: Do we need that? reward = - dist_reward ** 2 reward -= 5e-8 * np.sum(acc**2) - if self._steps == 200: - reward -= 0.1 * np.sum(vel**2) ** 2 - if self._is_collided: reward -= self.collision_penalty @@ -100,7 +98,8 @@ class ViaPointReacher(gym.Env): self._steps += 1 - done = self._steps * self.dt > self.time_limit or self._is_collided + # done = self._steps * self.dt > self.time_limit or self._is_collided + done = self._is_collided return self._get_obs().copy(), reward, done, info @@ -118,8 +117,8 @@ class ViaPointReacher(gym.Env): self_collision = False if not self.allow_self_collision: - self_collision = self.check_self_collision(line_points_in_taskspace) - if np.any(np.abs(self._joint_angles) > np.pi) and not self.allow_self_collision: + self_collision = check_self_collision(line_points_in_taskspace) + if np.any(np.abs(self._joint_angles) > np.pi): self_collision = True self._is_collided = self_collision @@ -135,25 +134,10 @@ class ViaPointReacher(gym.Env): self._steps ]) - # def _reward(self): - # dist_reward = 0 - # if not self._is_collided: - # if self._steps == 180: - # dist_reward = np.linalg.norm(self.end_effector - self.bottom_center_of_hole) - # else: - # dist_reward = np.linalg.norm(self.end_effector - self.bottom_center_of_hole) - # - # out = - dist_reward ** 2 - # - # return out - def get_forward_kinematics(self, num_points_per_link=1): theta = self._joint_angles[:, None] - if num_points_per_link > 1: - intermediate_points = np.linspace(0, 1, num_points_per_link) - else: - intermediate_points = 1 + intermediate_points = np.linspace(0, 1, num_points_per_link) if num_points_per_link > 1 else 1 accumulated_theta = np.cumsum(theta, axis=0) @@ -171,46 +155,6 @@ class ViaPointReacher(gym.Env): return np.squeeze(endeffector + self._joints[0, :]) - def check_self_collision(self, line_points): - for i, line1 in enumerate(line_points): - for line2 in line_points[i+2:, :, :]: - # if line1 != line2: - if intersect(line1[0], line1[-1], line2[0], line2[-1]): - return True - return False - - def check_wall_collision(self, line_points): - - # all points that are before the hole in x - r, c = np.where(line_points[:, :, 0] < (self.hole_x - self.hole_width / 2)) - - # check if any of those points are below surface - nr_line_points_below_surface_before_hole = np.sum(line_points[r, c, 1] < 0) - - if nr_line_points_below_surface_before_hole > 0: - return True - - # all points that are after the hole in x - r, c = np.where(line_points[:, :, 0] > (self.hole_x + self.hole_width / 2)) - - # check if any of those points are below surface - nr_line_points_below_surface_after_hole = np.sum(line_points[r, c, 1] < 0) - - if nr_line_points_below_surface_after_hole > 0: - return True - - # all points that are above the hole - r, c = np.where((line_points[:, :, 0] > (self.hole_x - self.hole_width / 2)) & ( - line_points[:, :, 0] < (self.hole_x + self.hole_width / 2))) - - # check if any of those points are below surface - nr_line_points_below_surface_in_hole = np.sum(line_points[r, c, 1] < -self.hole_depth) - - if nr_line_points_below_surface_in_hole > 0: - return True - - return False - def render(self, mode='human'): if self.fig is None: self.fig = plt.figure() @@ -230,7 +174,7 @@ class ViaPointReacher(gym.Env): plt.xlim([-lim, lim]) plt.ylim([-lim, lim]) # plt.draw() - plt.pause(1e-4) # pushes window to foreground, which is annoying. + plt.pause(1e-4) # pushes window to foreground, which is annoying. # self.fig.canvas.flush_events() elif mode == "partial": @@ -274,7 +218,7 @@ class ViaPointReacher(gym.Env): if __name__ == '__main__': nl = 5 render_mode = "human" # "human" or "partial" or "final" - env = ViaPointReacher(num_links=nl, allow_self_collision=False) + env = ViaPointReacher(n_links=nl, allow_self_collision=False) env.reset() env.render(mode=render_mode) diff --git a/alr_envs/mujoco/__init__.py b/alr_envs/mujoco/__init__.py index e0ecd85..185f0ed 100644 --- a/alr_envs/mujoco/__init__.py +++ b/alr_envs/mujoco/__init__.py @@ -1 +1,2 @@ -from alr_envs.mujoco.reacher.alr_reacher import ALRReacherEnv \ No newline at end of file +from alr_envs.mujoco.reacher.alr_reacher import ALRReacherEnv +from alr_envs.mujoco.balancing import BalancingEnv diff --git a/alr_envs/mujoco/balancing.py b/alr_envs/mujoco/balancing.py new file mode 100644 index 0000000..5976bc2 --- /dev/null +++ b/alr_envs/mujoco/balancing.py @@ -0,0 +1,53 @@ +import os + +import numpy as np +from gym import utils +from gym.envs.mujoco import mujoco_env + +from alr_envs.utils.utils import angle_normalize + + +class BalancingEnv(mujoco_env.MujocoEnv, utils.EzPickle): + def __init__(self, n_links=5): + utils.EzPickle.__init__(**locals()) + + self.n_links = n_links + + if n_links == 5: + file_name = 'reacher_5links.xml' + elif n_links == 7: + file_name = 'reacher_7links.xml' + else: + raise ValueError(f"Invalid number of links {n_links}, only 5 or 7 allowed.") + + mujoco_env.MujocoEnv.__init__(self, os.path.join(os.path.dirname(__file__), "assets", file_name), 2) + + def step(self, a): + angle = angle_normalize(np.sum(self.sim.data.qpos.flat[:self.n_links]), type="rad") + reward = - np.abs(angle) + + self.do_simulation(a, self.frame_skip) + ob = self._get_obs() + done = False + return ob, reward, done, dict(angle=angle, end_effector=self.get_body_com("fingertip").copy()) + + def viewer_setup(self): + self.viewer.cam.trackbodyid = 1 + + def reset_model(self): + # This also generates a goal, we however do not need/use it + qpos = self.np_random.uniform(low=-0.1, high=0.1, size=self.model.nq) + self.init_qpos + qpos[-2:] = 0 + qvel = self.init_qvel + self.np_random.uniform(low=-.005, high=.005, size=self.model.nv) + qvel[-2:] = 0 + self.set_state(qpos, qvel) + + return self._get_obs() + + def _get_obs(self): + theta = self.sim.data.qpos.flat[:self.n_links] + return np.concatenate([ + np.cos(theta), + np.sin(theta), + self.sim.data.qvel.flat[:self.n_links], # this is angular velocity + ]) diff --git a/alr_envs/mujoco/ball_in_a_cup/utils.py b/alr_envs/mujoco/ball_in_a_cup/utils.py index 6508ca2..1f421f5 100644 --- a/alr_envs/mujoco/ball_in_a_cup/utils.py +++ b/alr_envs/mujoco/ball_in_a_cup/utils.py @@ -1,5 +1,5 @@ -from alr_envs.utils.detpmp_env_wrapper import DetPMPEnvWrapper -from alr_envs.utils.dmp_env_wrapper import DmpEnvWrapper +from alr_envs.utils.wrapper.detpmp_wrapper import DetPMPWrapper +from alr_envs.utils.wrapper.dmp_wrapper import DmpWrapper from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup import ALRBallInACupEnv from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_simple import ALRBallInACupEnv as ALRBallInACupEnvSimple @@ -18,19 +18,19 @@ def make_contextual_env(rank, seed=0): def _init(): env = ALRBallInACupEnv() - env = DetPMPEnvWrapper(env, - num_dof=7, - num_basis=5, - width=0.005, - policy_type="motor", - start_pos=env.start_pos, - duration=3.5, - post_traj_time=4.5, - dt=env.dt, - weights_scale=0.5, - zero_start=True, - zero_goal=True - ) + env = DetPMPWrapper(env, + num_dof=7, + num_basis=5, + width=0.005, + policy_type="motor", + start_pos=env.start_pos, + duration=3.5, + post_traj_time=4.5, + dt=env.dt, + weights_scale=0.5, + zero_start=True, + zero_goal=True + ) env.seed(seed + rank) return env @@ -52,19 +52,19 @@ def make_env(rank, seed=0): def _init(): env = ALRBallInACupEnvSimple() - env = DetPMPEnvWrapper(env, - num_dof=7, - num_basis=5, - width=0.005, - policy_type="motor", - start_pos=env.start_pos, - duration=3.5, - post_traj_time=4.5, - dt=env.dt, - weights_scale=0.2, - zero_start=True, - zero_goal=True - ) + env = DetPMPWrapper(env, + num_dof=7, + num_basis=5, + width=0.005, + policy_type="motor", + start_pos=env.start_pos, + duration=3.5, + post_traj_time=4.5, + dt=env.dt, + weights_scale=0.2, + zero_start=True, + zero_goal=True + ) env.seed(seed + rank) return env @@ -86,20 +86,20 @@ def make_simple_env(rank, seed=0): def _init(): env = ALRBallInACupEnvSimple() - env = DetPMPEnvWrapper(env, - num_dof=3, - num_basis=5, - width=0.005, - off=-0.1, - policy_type="motor", - start_pos=env.start_pos[1::2], - duration=3.5, - post_traj_time=4.5, - dt=env.dt, - weights_scale=0.25, - zero_start=True, - zero_goal=True - ) + env = DetPMPWrapper(env, + num_dof=3, + num_basis=5, + width=0.005, + off=-0.1, + policy_type="motor", + start_pos=env.start_pos[1::2], + duration=3.5, + post_traj_time=4.5, + dt=env.dt, + weights_scale=0.25, + zero_start=True, + zero_goal=True + ) env.seed(seed + rank) return env @@ -121,20 +121,20 @@ def make_simple_dmp_env(rank, seed=0): def _init(): _env = ALRBallInACupEnvSimple() - _env = DmpEnvWrapper(_env, - num_dof=3, - num_basis=5, - duration=3.5, - post_traj_time=4.5, - bandwidth_factor=2.5, - dt=_env.dt, - learn_goal=False, - alpha_phase=3, - start_pos=_env.start_pos[1::2], - final_pos=_env.start_pos[1::2], - policy_type="motor", - weights_scale=100, - ) + _env = DmpWrapper(_env, + num_dof=3, + num_basis=5, + duration=3.5, + post_traj_time=4.5, + bandwidth_factor=2.5, + dt=_env.dt, + learn_goal=False, + alpha_phase=3, + start_pos=_env.start_pos[1::2], + final_pos=_env.start_pos[1::2], + policy_type="motor", + weights_scale=100, + ) _env.seed(seed + rank) return _env diff --git a/alr_envs/mujoco/beerpong/utils.py b/alr_envs/mujoco/beerpong/utils.py index 446dba1..bfbc2a1 100644 --- a/alr_envs/mujoco/beerpong/utils.py +++ b/alr_envs/mujoco/beerpong/utils.py @@ -1,4 +1,4 @@ -from alr_envs.utils.detpmp_env_wrapper import DetPMPEnvWrapper +from alr_envs.utils.wrapper.detpmp_wrapper import DetPMPWrapper from alr_envs.mujoco.beerpong.beerpong import ALRBeerpongEnv from alr_envs.mujoco.beerpong.beerpong_simple import ALRBeerpongEnv as ALRBeerpongEnvSimple @@ -17,19 +17,19 @@ def make_contextual_env(rank, seed=0): def _init(): env = ALRBeerpongEnv() - env = DetPMPEnvWrapper(env, - num_dof=7, - num_basis=5, - width=0.005, - policy_type="motor", - start_pos=env.start_pos, - duration=3.5, - post_traj_time=4.5, - dt=env.dt, - weights_scale=0.5, - zero_start=True, - zero_goal=True - ) + env = DetPMPWrapper(env, + num_dof=7, + num_basis=5, + width=0.005, + policy_type="motor", + start_pos=env.start_pos, + duration=3.5, + post_traj_time=4.5, + dt=env.dt, + weights_scale=0.5, + zero_start=True, + zero_goal=True + ) env.seed(seed + rank) return env @@ -51,19 +51,19 @@ def make_env(rank, seed=0): def _init(): env = ALRBeerpongEnvSimple() - env = DetPMPEnvWrapper(env, - num_dof=7, - num_basis=5, - width=0.005, - policy_type="motor", - start_pos=env.start_pos, - duration=3.5, - post_traj_time=4.5, - dt=env.dt, - weights_scale=0.25, - zero_start=True, - zero_goal=True - ) + env = DetPMPWrapper(env, + num_dof=7, + num_basis=5, + width=0.005, + policy_type="motor", + start_pos=env.start_pos, + duration=3.5, + post_traj_time=4.5, + dt=env.dt, + weights_scale=0.25, + zero_start=True, + zero_goal=True + ) env.seed(seed + rank) return env @@ -85,19 +85,19 @@ def make_simple_env(rank, seed=0): def _init(): env = ALRBeerpongEnvSimple() - env = DetPMPEnvWrapper(env, - num_dof=3, - num_basis=5, - width=0.005, - policy_type="motor", - start_pos=env.start_pos[1::2], - duration=3.5, - post_traj_time=4.5, - dt=env.dt, - weights_scale=0.5, - zero_start=True, - zero_goal=True - ) + env = DetPMPWrapper(env, + num_dof=3, + num_basis=5, + width=0.005, + policy_type="motor", + start_pos=env.start_pos[1::2], + duration=3.5, + post_traj_time=4.5, + dt=env.dt, + weights_scale=0.5, + zero_start=True, + zero_goal=True + ) env.seed(seed + rank) return env diff --git a/alr_envs/mujoco/reacher/alr_reacher.py b/alr_envs/mujoco/reacher/alr_reacher.py index 85adf58..0fa8fb0 100644 --- a/alr_envs/mujoco/reacher/alr_reacher.py +++ b/alr_envs/mujoco/reacher/alr_reacher.py @@ -9,6 +9,8 @@ from alr_envs.utils.utils import angle_normalize class ALRReacherEnv(mujoco_env.MujocoEnv, utils.EzPickle): def __init__(self, steps_before_reward=200, n_links=5, balance=False): + utils.EzPickle.__init__(**locals()) + self._steps = 0 self.steps_before_reward = steps_before_reward self.n_links = n_links @@ -29,65 +31,48 @@ class ALRReacherEnv(mujoco_env.MujocoEnv, utils.EzPickle): else: raise ValueError(f"Invalid number of links {n_links}, only 5 or 7 allowed.") - self._q_pos = [] - self._q_vel = [] - - utils.EzPickle.__init__(self) mujoco_env.MujocoEnv.__init__(self, os.path.join(os.path.dirname(__file__), "assets", file_name), 2) - @property - def current_pos(self): - return self.sim.data.qpos[0:5].copy() - - @property - def current_vel(self): - return self.sim.data.qvel[0:5].copy() - def step(self, a): self._steps += 1 reward_dist = 0.0 angular_vel = 0.0 + reward_balance = 0.0 if self._steps >= self.steps_before_reward: vec = self.get_body_com("fingertip") - self.get_body_com("target") reward_dist -= self.reward_weight * np.linalg.norm(vec) angular_vel -= np.linalg.norm(self.sim.data.qvel.flat[:self.n_links]) reward_ctrl = - np.square(a).sum() - reward_balance = - self.balance_weight * np.abs( - angle_normalize(np.sum(self.sim.data.qpos.flat[:self.n_links]), type="rad")) + + if self.balance: + reward_balance -= self.balance_weight * np.abs( + angle_normalize(np.sum(self.sim.data.qpos.flat[:self.n_links]), type="rad")) reward = reward_dist + reward_ctrl + angular_vel + reward_balance self.do_simulation(a, self.frame_skip) - - self._q_pos.append(self.sim.data.qpos[0:5].ravel().copy()) - self._q_vel.append(self.sim.data.qvel[0:5].ravel().copy()) - ob = self._get_obs() done = False return ob, reward, done, dict(reward_dist=reward_dist, reward_ctrl=reward_ctrl, velocity=angular_vel, reward_balance=reward_balance, end_effector=self.get_body_com("fingertip").copy(), - goal=self.goal if hasattr(self, "goal") else None, - traj=self._q_pos, vel=self._q_vel) + goal=self.goal if hasattr(self, "goal") else None) def viewer_setup(self): self.viewer.cam.trackbodyid = 0 def reset_model(self): - qpos = self.init_qpos # self.np_random.uniform(low=-0.1, high=0.1, size=self.model.nq) + self.init_qpos + qpos = self.np_random.uniform(low=-0.1, high=0.1, size=self.model.nq) + self.init_qpos while True: self.goal = self.np_random.uniform(low=-self.n_links / 10, high=self.n_links / 10, size=2) if np.linalg.norm(self.goal) < self.n_links / 10: break qpos[-2:] = self.goal - qvel = self.init_qvel # + self.np_random.uniform(low=-.005, high=.005, size=self.model.nv) + qvel = self.init_qvel + self.np_random.uniform(low=-.005, high=.005, size=self.model.nv) qvel[-2:] = 0 self.set_state(qpos, qvel) self._steps = 0 - self._q_pos = [] - self._q_vel = [] - return self._get_obs() def _get_obs(self): diff --git a/alr_envs/utils/detpmp_env_wrapper.py b/alr_envs/utils/detpmp_env_wrapper.py deleted file mode 100644 index c667abf..0000000 --- a/alr_envs/utils/detpmp_env_wrapper.py +++ /dev/null @@ -1,88 +0,0 @@ -from alr_envs.utils.policies import get_policy_class -from mp_lib import det_promp -import numpy as np -import gym - - -class DetPMPEnvWrapper(gym.Wrapper): - def __init__(self, - env, - num_dof, - num_basis, - width, - off=0.01, - start_pos=None, - duration=1, - dt=0.01, - post_traj_time=0., - policy_type=None, - weights_scale=1, - zero_start=False, - zero_goal=False, - ): - super(DetPMPEnvWrapper, self).__init__(env) - self.num_dof = num_dof - self.num_basis = num_basis - self.dim = num_dof * num_basis - self.pmp = det_promp.DeterministicProMP(n_basis=num_basis, n_dof=num_dof, width=width, off=off, - zero_start=zero_start, zero_goal=zero_goal) - weights = np.zeros(shape=(num_basis, num_dof)) - self.pmp.set_weights(duration, weights) - self.weights_scale = weights_scale - - self.duration = duration - self.dt = dt - self.post_traj_steps = int(post_traj_time / dt) - - self.start_pos = start_pos - self.zero_start = zero_start - - policy_class = get_policy_class(policy_type) - self.policy = policy_class(env) - - def __call__(self, params, contexts=None): - params = np.atleast_2d(params) - rewards = [] - infos = [] - for p, c in zip(params, contexts): - reward, info = self.rollout(p, c) - rewards.append(reward) - infos.append(info) - - return np.array(rewards), infos - - def rollout(self, params, context=None, render=False): - """ This function generates a trajectory based on a DMP and then does the usual loop over reset and step""" - params = np.reshape(params, newshape=(self.num_basis, self.num_dof)) * self.weights_scale - self.pmp.set_weights(self.duration, params) - t, des_pos, des_vel, des_acc = self.pmp.compute_trajectory(1 / self.dt, 1.) - if self.zero_start: - des_pos += self.start_pos[None, :] - - if self.post_traj_steps > 0: - des_pos = np.vstack([des_pos, np.tile(des_pos[-1, :], [self.post_traj_steps, 1])]) - des_vel = np.vstack([des_vel, np.zeros(shape=(self.post_traj_steps, self.num_dof))]) - - self._trajectory = des_pos - self._velocity = des_vel - - rews = [] - infos = [] - - self.env.configure(context) - self.env.reset() - - for t, pos_vel in enumerate(zip(des_pos, des_vel)): - ac = self.policy.get_action(pos_vel[0], pos_vel[1]) - obs, rew, done, info = self.env.step(ac) - rews.append(rew) - infos.append(info) - if render: - self.env.render(mode="human") - if done: - break - - reward = np.sum(rews) - - return reward, info - diff --git a/alr_envs/utils/dmp_env_wrapper.py b/alr_envs/utils/dmp_env_wrapper.py deleted file mode 100644 index 6835d80..0000000 --- a/alr_envs/utils/dmp_env_wrapper.py +++ /dev/null @@ -1,125 +0,0 @@ -from alr_envs.utils.policies import get_policy_class -from mp_lib.phase import ExpDecayPhaseGenerator -from mp_lib.basis import DMPBasisGenerator -from mp_lib import dmps -import numpy as np -import gym - - -class DmpEnvWrapper(gym.Wrapper): - def __init__(self, - env, - num_dof, - num_basis, - start_pos=None, - final_pos=None, - duration=1, - dt=0.01, - alpha_phase=2, - bandwidth_factor=3, - learn_goal=False, - post_traj_time=0., - policy_type=None, - weights_scale=1., - goal_scale=1., - ): - super(DmpEnvWrapper, self).__init__(env) - self.num_dof = num_dof - self.num_basis = num_basis - self.dim = num_dof * num_basis - if learn_goal: - self.dim += num_dof - self.learn_goal = learn_goal - self.duration = duration # seconds - time_steps = int(duration / dt) - self.t = np.linspace(0, duration, time_steps) - self.post_traj_steps = int(post_traj_time / dt) - - phase_generator = ExpDecayPhaseGenerator(alpha_phase=alpha_phase, duration=duration) - basis_generator = DMPBasisGenerator(phase_generator, - duration=duration, - num_basis=self.num_basis, - basis_bandwidth_factor=bandwidth_factor) - - self.dmp = dmps.DMP(num_dof=num_dof, - basis_generator=basis_generator, - phase_generator=phase_generator, - num_time_steps=time_steps, - dt=dt - ) - - self.dmp.dmp_start_pos = start_pos.reshape((1, num_dof)) - - dmp_weights = np.zeros((num_basis, num_dof)) - if learn_goal: - dmp_goal_pos = np.zeros(num_dof) - else: - dmp_goal_pos = final_pos - - self.dmp.set_weights(dmp_weights, dmp_goal_pos) - self.weights_scale = weights_scale - self.goal_scale = goal_scale - - policy_class = get_policy_class(policy_type) - self.policy = policy_class(env) - - def __call__(self, params, contexts=None): - params = np.atleast_2d(params) - rewards = [] - infos = [] - for p, c in zip(params, contexts): - reward, info = self.rollout(p, c) - rewards.append(reward) - infos.append(info) - - return np.array(rewards), infos - - def goal_and_weights(self, params): - if len(params.shape) > 1: - assert params.shape[1] == self.dim - else: - assert len(params) == self.dim - params = np.reshape(params, [1, self.dim]) - - if self.learn_goal: - goal_pos = params[0, -self.num_dof:] - weight_matrix = np.reshape(params[:, :-self.num_dof], [self.num_basis, self.num_dof]) - else: - goal_pos = self.dmp.dmp_goal_pos.flatten() - assert goal_pos is not None - weight_matrix = np.reshape(params, [self.num_basis, self.num_dof]) - - return goal_pos * self.goal_scale, weight_matrix * self.weights_scale - - def rollout(self, params, context=None, render=False): - """ This function generates a trajectory based on a DMP and then does the usual loop over reset and step""" - goal_pos, weight_matrix = self.goal_and_weights(params) - self.dmp.set_weights(weight_matrix, goal_pos) - trajectory, velocity = self.dmp.reference_trajectory(self.t) - - if self.post_traj_steps > 0: - trajectory = np.vstack([trajectory, np.tile(trajectory[-1, :], [self.post_traj_steps, 1])]) - velocity = np.vstack([velocity, np.zeros(shape=(self.post_traj_steps, self.num_dof))]) - - self._trajectory = trajectory - self._velocity = velocity - - rews = [] - infos = [] - - self.env.configure(context) - self.env.reset() - - for t, pos_vel in enumerate(zip(trajectory, velocity)): - ac = self.policy.get_action(pos_vel[0], pos_vel[1]) - obs, rew, done, info = self.env.step(ac) - rews.append(rew) - infos.append(info) - if render: - self.env.render(mode="human") - if done: - break - - reward = np.sum(rews) - - return reward, info diff --git a/alr_envs/utils/policies.py b/alr_envs/utils/policies.py index e75beac..e8874d6 100644 --- a/alr_envs/utils/policies.py +++ b/alr_envs/utils/policies.py @@ -1,8 +1,10 @@ +from gym import Env + from alr_envs.mujoco.alr_mujoco_env import AlrMujocoEnv class BaseController: - def __init__(self, env: AlrMujocoEnv): + def __init__(self, env: Env): self.env = env def get_action(self, des_pos, des_vel): @@ -20,7 +22,7 @@ class VelController(BaseController): class PDController(BaseController): - def __init__(self, env): + def __init__(self, env: AlrMujocoEnv): self.p_gains = env.p_gains self.d_gains = env.d_gains super(PDController, self).__init__(env) diff --git a/alr_envs/utils/utils.py b/alr_envs/utils/utils.py index 0bca03e..457c31e 100644 --- a/alr_envs/utils/utils.py +++ b/alr_envs/utils/utils.py @@ -11,10 +11,39 @@ def angle_normalize(x, type="deg"): Returns: """ + + if type not in ["deg", "rad"]: raise ValueError(f"Invalid type {type}. Choose one of 'deg' or 'rad'.") + if type == "deg": - return ((x + np.pi) % (2 * np.pi)) - np.pi - elif type == "rad": - two_pi = 2 * np.pi - return x - two_pi * np.floor((x + np.pi) / two_pi) - else: - raise ValueError(f"Invalid type {type}. Choose on of 'deg' or 'rad'.") + x = np.deg2rad(x) # x * pi / 180 + + two_pi = 2 * np.pi + return x - two_pi * np.floor((x + np.pi) / two_pi) + + +def ccw(A, B, C): + return (C[1] - A[1]) * (B[0] - A[0]) - (B[1] - A[1]) * (C[0] - A[0]) > 1e-12 + + +def intersect(A, B, C, D): + """ + Return true if line segments AB and CD intersects + Args: + A: start point line one + B: end point line one + C: start point line two + D: end point line two + + Returns: + + """ + return ccw(A, C, D) != ccw(B, C, D) and ccw(A, B, C) != ccw(A, B, D) + + +def check_self_collision(line_points): + for i, line1 in enumerate(line_points): + for line2 in line_points[i + 2:, :, :]: + # if line1 != line2: + if intersect(line1[0], line1[-1], line2[0], line2[-1]): + return True + return False diff --git a/alr_envs/utils/wrapper/__init__.py b/alr_envs/utils/wrapper/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/alr_envs/utils/wrapper/detpmp_wrapper.py b/alr_envs/utils/wrapper/detpmp_wrapper.py new file mode 100644 index 0000000..62b93d5 --- /dev/null +++ b/alr_envs/utils/wrapper/detpmp_wrapper.py @@ -0,0 +1,40 @@ +import gym +import numpy as np +from mp_lib import det_promp + +from alr_envs.utils.wrapper.mp_wrapper import MPWrapper + + +class DetPMPWrapper(MPWrapper): + def __init__(self, env, num_dof, num_basis, width, start_pos=None, duration=1, dt=0.01, post_traj_time=0., + policy_type=None, weights_scale=1, zero_start=False, zero_goal=False, **mp_kwargs): + # self.duration = duration # seconds + + super().__init__(env, num_dof, duration, dt, post_traj_time, policy_type, weights_scale, + num_basis=num_basis, width=width, start_pos=start_pos, zero_start=zero_start, + zero_goal=zero_goal) + + action_bounds = np.inf * np.ones((self.mp.n_basis * self.mp.n_dof)) + self.action_space = gym.spaces.Box(low=-action_bounds, high=action_bounds, dtype=np.float32) + + self.start_pos = start_pos + self.dt = dt + + def initialize_mp(self, num_dof: int, duration: int, dt: float, num_basis: int = 5, width: float = None, + start_pos: np.ndarray = None, zero_start: bool = False, zero_goal: bool = False): + pmp = det_promp.DeterministicProMP(n_basis=num_basis, n_dof=num_dof, width=width, off=0.01, + zero_start=zero_start, zero_goal=zero_goal) + + weights = np.zeros(shape=(num_basis, num_dof)) + pmp.set_weights(duration, weights) + + return pmp + + def mp_rollout(self, action): + params = np.reshape(action, (self.mp.n_basis, self.mp.n_dof)) * self.weights_scale + self.mp.set_weights(self.duration, params) + _, des_pos, des_vel, _ = self.mp.compute_trajectory(1 / self.dt, 1.) + if self.mp.zero_start: + des_pos += self.start_pos[None, :] + + return des_pos, des_vel diff --git a/alr_envs/utils/wrapper/dmp_wrapper.py b/alr_envs/utils/wrapper/dmp_wrapper.py new file mode 100644 index 0000000..c5c8c8e --- /dev/null +++ b/alr_envs/utils/wrapper/dmp_wrapper.py @@ -0,0 +1,81 @@ +from alr_envs.utils.policies import get_policy_class +from mp_lib.phase import ExpDecayPhaseGenerator +from mp_lib.basis import DMPBasisGenerator +from mp_lib import dmps +import numpy as np +import gym + +from alr_envs.utils.wrapper.mp_wrapper import MPWrapper + + +class DmpWrapper(MPWrapper): + + def __init__(self, env: gym.Env, num_dof: int, num_basis: int, start_pos: np.ndarray = None, + final_pos: np.ndarray = None, duration: int = 1, alpha_phase: float = 2., dt: float = 0.01, + learn_goal: bool = False, post_traj_time: float = 0., policy_type: str = None, + weights_scale: float = 1., goal_scale: float = 1.): + + """ + This Wrapper generates a trajectory based on a DMP and will only return episodic performances. + Args: + env: + num_dof: + num_basis: + start_pos: + final_pos: + duration: + alpha_phase: + dt: + learn_goal: + post_traj_time: + policy_type: + weights_scale: + goal_scale: + """ + self.learn_goal = learn_goal + self.t = np.linspace(0, duration, int(duration / dt)) + self.goal_scale = goal_scale + + super().__init__(env, num_dof, duration, dt, post_traj_time, policy_type, weights_scale, + num_basis=num_basis, start_pos=start_pos, final_pos=final_pos, alpha_phase=alpha_phase) + + action_bounds = np.inf * np.ones((np.prod(self.mp.dmp_weights.shape) + (num_dof if learn_goal else 0))) + self.action_space = gym.spaces.Box(low=-action_bounds, high=action_bounds, dtype=np.float32) + + def initialize_mp(self, num_dof: int, duration: int, dt: float, num_basis: int = 5, start_pos: np.ndarray = None, + final_pos: np.ndarray = None, alpha_phase: float = 2.): + + phase_generator = ExpDecayPhaseGenerator(alpha_phase=alpha_phase, duration=duration) + basis_generator = DMPBasisGenerator(phase_generator, duration=duration, num_basis=num_basis) + + dmp = dmps.DMP(num_dof=num_dof, basis_generator=basis_generator, phase_generator=phase_generator, + num_time_steps=int(duration / dt), dt=dt) + + dmp.dmp_start_pos = start_pos.reshape((1, num_dof)) + + weights = np.zeros((num_basis, num_dof)) + goal_pos = np.zeros(num_dof) if self.learn_goal else final_pos + + dmp.set_weights(weights, goal_pos) + return dmp + + def goal_and_weights(self, params): + assert params.shape[-1] == self.action_space.shape[0] + params = np.atleast_2d(params) + + if self.learn_goal: + goal_pos = params[0, -self.mp.num_dimensions:] # [num_dof] + params = params[:, :-self.mp.num_dimensions] # [1,num_dof] + # weight_matrix = np.reshape(params[:, :-self.num_dof], [self.num_basis, self.num_dof]) + else: + goal_pos = self.mp.dmp_goal_pos.flatten() + assert goal_pos is not None + # weight_matrix = np.reshape(params, [self.num_basis, self.num_dof]) + + weight_matrix = np.reshape(params, self.mp.dmp_weights.shape) + return goal_pos * self.goal_scale, weight_matrix * self.weights_scale + + def mp_rollout(self, action): + goal_pos, weight_matrix = self.goal_and_weights(action) + self.mp.set_weights(weight_matrix, goal_pos) + return self.mp.reference_trajectory(self.t) diff --git a/alr_envs/utils/wrapper/mp_wrapper.py b/alr_envs/utils/wrapper/mp_wrapper.py new file mode 100644 index 0000000..f705643 --- /dev/null +++ b/alr_envs/utils/wrapper/mp_wrapper.py @@ -0,0 +1,109 @@ +from abc import ABC, abstractmethod +from collections import defaultdict + +import gym +import numpy as np + +from alr_envs.utils.policies import get_policy_class + + +class MPWrapper(gym.Wrapper, ABC): + + def __init__(self, + env: gym.Env, + num_dof: int, + duration: int = 1, + dt: float = 0.01, + # learn_goal: bool = False, + post_traj_time: float = 0., + policy_type: str = None, + weights_scale: float = 1., + **mp_kwargs + + ): + super().__init__(env) + + # self.num_dof = num_dof + # self.num_basis = num_basis + # self.duration = duration # seconds + self.post_traj_steps = int(post_traj_time / dt) + + self.mp = self.initialize_mp(num_dof, duration, dt, **mp_kwargs) + self.weights_scale = weights_scale + + policy_class = get_policy_class(policy_type) + self.policy = policy_class(env) + + # rendering + self.render_mode = None + self.render_kwargs = None + + def step(self, action: np.ndarray): + """ This function generates a trajectory based on a DMP and then does the usual loop over reset and step""" + trajectory, velocity = self.mp_rollout(action) + + if self.post_traj_steps > 0: + trajectory = np.vstack([trajectory, np.tile(trajectory[-1, :], [self.post_traj_steps, 1])]) + velocity = np.vstack([velocity, np.zeros(shape=(self.post_traj_steps, self.dmp.num_dimensions))]) + + # self._trajectory = trajectory + # self._velocity = velocity + + rewards = 0 + # infos = defaultdict(list) + + # TODO: @Max Why do we need this configure, states should be part of the model + # self.env.configure(context) + obs = self.env.reset() + info = {} + + for t, pos_vel in enumerate(zip(trajectory, velocity)): + ac = self.policy.get_action(pos_vel[0], pos_vel[1]) + obs, rew, done, info = self.env.step(ac) + rewards += rew + # TODO return all dicts? + # [infos[k].append(v) for k, v in info.items()] + if self.render_mode: + self.env.render(mode=self.render_mode, **self.render_kwargs) + if done: + break + + done = True + return obs, rewards, done, info + + def render(self, mode='human', **kwargs): + """Only set render options here, such that they can be used during the rollout. + This only needs to be called once""" + self.render_mode = mode + self.render_kwargs = kwargs + + def __call__(self, actions): + return self.step(actions) + # params = np.atleast_2d(params) + # rewards = [] + # infos = [] + # for p, c in zip(params, contexts): + # reward, info = self.rollout(p, c) + # rewards.append(reward) + # infos.append(info) + # + # return np.array(rewards), infos + + @abstractmethod + def mp_rollout(self, action): + """ + Generate trajectory and velocity based on the MP + Returns: + trajectory/positions, velocity + """ + raise NotImplementedError() + + @abstractmethod + def initialize_mp(self, num_dof: int, duration: int, dt: float, **kwargs): + """ + Create respective instance of MP + Returns: + MP instance + """ + + raise NotImplementedError diff --git a/dmp_env_wrapper_example.py b/dmp_env_wrapper_example.py index bb886c6..38d367f 100644 --- a/dmp_env_wrapper_example.py +++ b/dmp_env_wrapper_example.py @@ -1,11 +1,9 @@ from alr_envs.classic_control.utils import make_viapointreacher_env -from alr_envs.classic_control.utils import make_holereacher_env, make_holereacher_fix_goal_env, make_holereacher_env_pmp +from alr_envs.classic_control.utils import make_holereacher_env, make_holereacher_fix_goal_env from alr_envs.utils.dmp_async_vec_env import DmpAsyncVectorEnv import numpy as np - if __name__ == "__main__": - n_samples = 1 n_cpus = 4 dim = 30 @@ -15,16 +13,13 @@ if __name__ == "__main__": test_env = make_holereacher_env(0)() - # params = 1 * np.random.randn(dim) - params = np.array([ -1.09434772, 7.09294269, 0.98756352, 1.61950682, - 2.66567135, 1.71267901, 8.20010847, 2.50496653, - -0.34886972, 2.07807773, 8.68615904, 3.66578556, - 5.24572097, -3.21506848, -0.28593896, 17.03756855, - -5.88445032, 6.02197609, -3.73457261, -4.24772663, - 8.69382861, -10.99939646, 5.31356886, 8.57420996, - 1.05616879, 19.79831628, -23.53288774, -3.32974082, - -5.86463784, -9.68133089]) - + # params = np.random.randn(n_samples, dim) + params = np.array([[1.386102, -3.29980525, 4.70402733, 1.3966668, 0.73774902, + 3.14676681, -4.98644416, 6.20303193, 1.30502127, -0.09330522, + 7.62656797, -5.76893033, 3.4706711, -0.6944142, -3.33442788, + 12.31421548, -0.72760271, -6.9090723, 7.02903814, -8.7236836, + 1.4805914, 0.53185824, -5.46626893, 0.69692163, 13.58472666, + 0.77199316, 2.02906724, -3.0203244, -1.00533159, -0.57417351]]) # params = np.hstack([50 * np.random.randn(n_samples, 25), np.tile(np.array([np.pi/2, -np.pi/4, -np.pi/4, -np.pi/4, -np.pi/4]), [n_samples, 1])]) diff --git a/example.py b/example.py index fd11c73..0ce713f 100644 --- a/example.py +++ b/example.py @@ -1,20 +1,86 @@ -import time +from collections import defaultdict import gym +import numpy as np -if __name__ == '__main__': +def example_mujoco(): env = gym.make('alr_envs:ALRReacher-v0') - # env = gym.make('alr_envs:SimpleReacher-v0') - # env = gym.make('alr_envs:ALRReacher7-v0') - state = env.reset() + rewards = 0 + obs = env.reset() + # number of environment steps for i in range(10000): - state, reward, done, info = env.step(env.action_space.sample()) + obs, reward, done, info = env.step(env.action_space.sample()) + rewards += reward + if i % 1 == 0: env.render() - # if done: - state = env.reset() + if done: + print(rewards) + rewards = 0 + obs = env.reset() - time.sleep(0.5) + +def example_dmp(): + # env = gym.make("alr_envs:ViaPointReacherDMP-v0") + env = gym.make("alr_envs:HoleReacherDMP-v0") + rewards = 0 + # env.render(mode=None) + obs = env.reset() + + # number of samples/full trajectories (multiple environment steps) + for i in range(10): + obs, reward, done, info = env.step(env.action_space.sample()) + rewards += reward + + if i % 1 == 0: + # render full DMP trajectory + # render can only be called once in the beginning as well. That would render every trajectory + # Calling it after every trajectory allows to modify the mode. mode=None, disables rendering. + env.render(mode="human") + + if done: + print(rewards) + rewards = 0 + obs = env.reset() + + +def example_async(n_cpu=4, seed=int('533D', 16)): + def make_env(env_id, seed, rank): + env = gym.make(env_id) + env.seed(seed + rank) + return lambda: env + + def sample(env: gym.vector.VectorEnv, n_samples=100): + # for plotting + rewards = np.zeros(n_cpu) + + # this would generate more samples than requested if n_samples % num_envs != 0 + repeat = int(np.ceil(n_samples / env.num_envs)) + vals = defaultdict(list) + for i in range(repeat): + obs, reward, done, info = envs.step(envs.action_space.sample()) + vals['obs'].append(obs) + vals['reward'].append(reward) + vals['done'].append(done) + vals['info'].append(info) + rewards += reward + if np.any(done): + print(rewards[done]) + rewards[done] = 0 + + # do not return values above threshold + return (*map(lambda v: np.stack(v)[:n_samples], vals.values()),) + + envs = gym.vector.AsyncVectorEnv([make_env("alr_envs:HoleReacherDMP-v0", seed, i) for i in range(n_cpu)]) + + obs = envs.reset() + print(sample(envs, 16)) + + +if __name__ == '__main__': + # example_mujoco() + # example_dmp() + example_async() diff --git a/setup.py b/setup.py index 88324de..22e1d1c 100644 --- a/setup.py +++ b/setup.py @@ -2,6 +2,9 @@ from setuptools import setup setup(name='alr_envs', version='0.0.1', - install_requires=['gym', 'PyQt5', 'matplotlib', - 'mp_lib @ git+https://git@github.com/maxhuettenrauch/mp_lib@master#egg=mp_lib',], # And any other dependencies foo needs + install_requires=['gym', + 'PyQt5', + 'matplotlib', + 'mp_lib @ git+https://git@github.com/maxhuettenrauch/mp_lib@master#egg=mp_lib', + 'mujoco_py'], # And any other dependencies foo needs )