changed from step to rollout method

This commit is contained in:
Maximilian Huettenrauch 2021-01-12 10:52:08 +01:00
parent a8fcbd6fb0
commit 104281fe16
4 changed files with 117 additions and 73 deletions
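In short: the DMP wrappers no longer overload step to consume a whole parameter vector; an episode is now executed through a dedicated rollout method that decodes the DMP parameters, resets the wrapped environment internally, and plays the generated trajectory to completion, while __call__ evaluates a batch of parameter vectors. A minimal usage sketch of the new calling convention (assuming an already constructed DmpEnvWrapperVel instance named wrapper; the constructor arguments are not part of this commit):

    import numpy as np
    # hypothetical driver code; only wrapper.dim, rollout() and __call__ come from this commit
    params = np.random.randn(wrapper.dim)               # flat DMP weights (+ goal position, if learned)
    obs, reward, done, info = wrapper.rollout(params)   # one complete episode, reset handled inside
    rewards = wrapper(np.stack([params, params]))       # batch evaluation, returns only the episode rewards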

View File

@@ -1,5 +1,7 @@
 import gym
 import numpy as np
+import matplotlib
+matplotlib.use('TkAgg')
 import matplotlib.pyplot as plt
 from matplotlib import patches
@@ -35,6 +37,7 @@ class HoleReacher(gym.Env):
         self._angle_velocity = None
         self.start_pos = np.hstack([[np.pi/2], np.zeros(self.num_links - 1)])
         self.start_vel = np.zeros(self.num_links)
+        self.weight_matrix_scale = 50  # for the holereacher, the dmp weights become quite large compared to the values of the goal attractor. this scaling is to ensure they are on similar scale for the optimizer
         self._dt = 0.01
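The new weight_matrix_scale attribute is consumed by the DMP wrappers further down in this commit: if the wrapped environment defines it, the decoded weight matrix is multiplied by it before being handed to the DMP, so that the weights and the goal attractor end up on a similar scale for the optimizer. The pattern, mirroring the wrapper code later in this diff:

    goal_pos, weight_matrix = self.goal_and_weights(action)
    if hasattr(self.env, "weight_matrix_scale"):
        weight_matrix = weight_matrix * self.env.weight_matrix_scale  # 50 for the HoleReacher
    self.dmp.set_weights(weight_matrix, goal_pos)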
@@ -66,37 +69,6 @@ class HoleReacher(gym.Env):
     def end_effector(self):
         return self._joints[self.num_links].T

-    def step(self, action):
-        # vel = (action - self._joint_angles) / self._dt
-        # acc = (vel - self._angle_velocity) / self._dt
-        # self._angle_velocity = vel
-        # self._joint_angles = action
-
-        vel = action
-        acc = (vel - self._angle_velocity) / self._dt
-        self._angle_velocity = vel
-        self._joint_angles = self._joint_angles + self._dt * self._angle_velocity
-
-        self._update_joints()
-
-        rew = self._reward()
-
-        rew -= 1e-6 * np.sum(acc**2)
-
-        if self._steps == 180:
-            rew -= (0.1 * np.sum(vel**2) ** 2
-                    + 1e-3 * np.sum(action**2)
-                    )
-
-        if self._is_collided:
-            rew -= self.collision_penalty
-
-        info = {}
-
-        self._steps += 1
-
-        return self._get_obs().copy(), rew, self._is_collided, info
-
     def reset(self):
         self._joint_angles = self.start_pos
         self._angle_velocity = self.start_vel
@@ -106,6 +78,46 @@ class HoleReacher(gym.Env):

         return self._get_obs().copy()

+    def step(self, action):
+        """
+        a single step with an action in joint velocity space
+        """
+        vel = action
+        acc = (vel - self._angle_velocity) / self._dt
+        self._angle_velocity = vel
+        self._joint_angles = self._joint_angles + self._dt * self._angle_velocity
+
+        self._update_joints()
+
+        # rew = self._reward()
+
+        # compute reward directly in step function
+        dist_reward = 0
+        if not self._is_collided:
+            if self._steps == 180:
+                dist_reward = np.linalg.norm(self.end_effector - self.bottom_center_of_hole)
+        else:
+            dist_reward = np.linalg.norm(self.end_effector - self.bottom_center_of_hole)
+
+        reward = - dist_reward ** 2
+
+        reward -= 1e-6 * np.sum(acc**2)
+
+        if self._steps == 180:
+            reward -= (0.1 * np.sum(vel**2) ** 2
+                       + 1e-3 * np.sum(action**2)
+                       )
+
+        if self._is_collided:
+            reward -= self.collision_penalty
+
+        info = {}
+
+        self._steps += 1
+
+        return self._get_obs().copy(), reward, self._is_collided, info
+
     def _update_joints(self):
         """
         update _joints to get new end effector position. The other links are only required for rendering.
@@ -140,18 +152,17 @@ class HoleReacher(gym.Env):
             self._steps
         ])

-    def _reward(self):
-        dist_reward = 0
-        if not self._is_collided:
-            if self._steps == 180:
-                dist_reward = np.linalg.norm(self.end_effector - self.bottom_center_of_hole)
-        else:
-            dist_reward = np.linalg.norm(self.end_effector - self.bottom_center_of_hole)
-
-        # TODO: make negative
-        out = - dist_reward ** 2
-
-        return out
+    # def _reward(self):
+    #     dist_reward = 0
+    #     if not self._is_collided:
+    #         if self._steps == 180:
+    #             dist_reward = np.linalg.norm(self.end_effector - self.bottom_center_of_hole)
+    #     else:
+    #         dist_reward = np.linalg.norm(self.end_effector - self.bottom_center_of_hole)
+    #
+    #     out = - dist_reward ** 2
+    #
+    #     return out

     def get_forward_kinematics(self, num_points_per_link=1):
         theta = self._joint_angles[:, None]
@@ -220,8 +231,8 @@ class HoleReacher(gym.Env):
     def render(self, mode='human'):
         if self.fig is None:
             self.fig = plt.figure()
-            plt.ion()
-            plt.pause(0.01)
+            # plt.ion()
+            # plt.pause(0.01)
         else:
             plt.figure(self.fig.number)
@ -242,25 +253,29 @@ class HoleReacher(gym.Env):
plt.pause(1e-4) # pushes window to foreground, which is annoying. plt.pause(1e-4) # pushes window to foreground, which is annoying.
# self.fig.canvas.flush_events() # self.fig.canvas.flush_events()
elif render_partial: elif mode == "partial":
if t == 0: if self._steps == 1:
fig, ax = plt.subplots() # fig, ax = plt.subplots()
# Add the patch to the Axes # Add the patch to the Axes
[plt.gca().add_patch(rect) for rect in self.patches] [plt.gca().add_patch(rect) for rect in self.patches]
# plt.pause(0.01)
plt.xlim(-self.num_links, self.num_links), plt.ylim(-1, self.num_links) if self._steps % 20 == 0 or self._steps in [1, 199] or self._is_collided:
# Arm
if t % 20 == 0 or t == 199 or is_collided: plt.plot(self._joints[:, 0], self._joints[:, 1], 'ro-', markerfacecolor='k', alpha=self._steps / 200)
ax.plot(line_points_in_taskspace[:, 0, 0], # ax.plot(line_points_in_taskspace[:, 0, 0],
line_points_in_taskspace[:, 0, 1], # line_points_in_taskspace[:, 0, 1],
line_points_in_taskspace[:, -1, 0], # line_points_in_taskspace[:, -1, 0],
line_points_in_taskspace[:, -1, 1], marker='o', color='k', alpha=t / 200) # line_points_in_taskspace[:, -1, 1], marker='o', color='k', alpha=t / 200)
lim = np.sum(self.link_lengths) + 0.5
plt.xlim([-lim, lim])
plt.ylim([-1.1, lim])
plt.pause(0.01) plt.pause(0.01)
elif render_final: elif mode == "final":
if self._steps == 0 or self._is_collided: if self._steps == 199 or self._is_collided:
fig, ax = plt.subplots() # fig, ax = plt.subplots()
# Add the patch to the Axes # Add the patch to the Axes
[plt.gca().add_patch(rect) for rect in self.patches] [plt.gca().add_patch(rect) for rect in self.patches]
@@ -274,16 +289,18 @@ class HoleReacher(gym.Env):

 if __name__ == '__main__':
     nl = 5
+    render_mode = "human"  # "human" or "partial" or "final"
     env = HoleReacher(num_links=nl, allow_self_collision=False, allow_wall_collision=False, hole_width=0.15, hole_depth=1, hole_x=1)
     env.reset()
+    # env.render(mode=render_mode)

     for i in range(200):
         # objective.load_result("/tmp/cma")
-        # explore = np.random.multivariate_normal(mean=np.zeros(30), cov=1 * np.eye(30))
-        ac = 11 * env.action_space.sample()
+        # test with random actions
+        ac = 2 * env.action_space.sample()
         # ac[0] += np.pi/2
         obs, rew, done, info = env.step(ac)
-        env.render(mode="render_full")
+        env.render(mode=render_mode)

         print(rew)

View File

@@ -4,7 +4,6 @@ from gym.vector.utils import concatenate, create_empty_array
 from gym.vector.async_vector_env import AsyncState
 import numpy as np
 import multiprocessing as mp
-from copy import deepcopy
 import sys
@@ -41,7 +40,6 @@ class DmpAsyncVectorEnv(gym.vector.AsyncVectorEnv):
                 'for a pending call to `{0}` to complete.'.format(
                     self._state.value), self._state.value)

-        # split_actions = np.array_split(actions, self.num_envs)
         actions = np.atleast_2d(actions)
         split_actions = np.array_split(actions, np.minimum(len(actions), self.num_envs))
         for pipe, action in zip(self.parent_pipes, split_actions):
@@ -89,6 +87,8 @@ class DmpAsyncVectorEnv(gym.vector.AsyncVectorEnv):

         observations_list, rewards, dones, infos = [_flatten_list(r) for r in zip(*results)]

+        # for now, we ignore the observations and only return the rewards
+
         # if not self.shared_memory:
         #     self.observations = concatenate(observations_list, self.observations,
         #                                     self.single_observation_space)
@@ -124,8 +124,7 @@ def _worker(index, env_fn, pipe, parent_pipe, shared_memory, error_queue):
                 dones = []
                 infos = []
                 for d in data:
-                    env.reset()
-                    observation, reward, done, info = env.step(d)
+                    observation, reward, done, info = env.rollout(d)
                     observations.append(observation)
                     rewards.append(reward)
                     dones.append(done)
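With the worker delegating to env.rollout, every element of a dispatched parameter batch now yields one complete episode, and per the comment above only the rewards are of interest for now. A rough sketch of how the vectorized env might be driven (the factory function and the exact return layout are assumptions, not part of this diff):

    # hypothetical driver code
    env_fns = [make_wrapped_env for _ in range(4)]   # callables returning DMP-wrapped envs exposing rollout()
    vec_env = DmpAsyncVectorEnv(env_fns)
    param_batch = np.random.randn(8, dim)            # 8 flat DMP parameter vectors
    results = vec_env.step(param_batch)              # each worker calls env.rollout(p) per parameter vector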

View File

@@ -14,7 +14,7 @@ class DmpEnvWrapperBase(gym.Wrapper):
         if learn_goal:
             self.dim += num_dof
             self.learn_goal = True
-        duration = duration  # seconds
+        self.duration = duration  # seconds
         time_steps = int(duration / dt)
         self.t = np.linspace(0, duration, time_steps)
@@ -35,6 +35,21 @@ class DmpEnvWrapperBase(gym.Wrapper):

         self.dmp.set_weights(dmp_weights, dmp_goal_pos)

+    def __call__(self, params):
+        params = np.atleast_2d(params)
+        observations = []
+        rewards = []
+        dones = []
+        infos = []
+        for p in params:
+            observation, reward, done, info = self.rollout(p)
+            observations.append(observation)
+            rewards.append(reward)
+            dones.append(done)
+            infos.append(info)
+
+        return np.array(rewards)
+
     def goal_and_weights(self, params):
         if len(params.shape) > 1:
             assert params.shape[1] == self.dim
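Because __call__ evaluates a batch of parameter vectors and returns only the rewards, a wrapped env can be used directly as a black-box objective for an episodic optimizer. An illustrative random-search loop (only wrapper(...) and wrapper.dim come from this commit; the loop itself is a sketch):

    mean = np.zeros(wrapper.dim)
    for it in range(100):
        candidates = mean + 0.1 * np.random.randn(16, wrapper.dim)  # 16 perturbed parameter vectors
        rewards = wrapper(candidates)                               # one rollout per candidate
        mean = candidates[np.argmax(rewards)]                       # keep the best candidate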
@@ -51,19 +66,26 @@

         return goal_pos, weight_matrix

-    def step(self, action, render=False):
-        """ overwrite step function where action now is the weights and possible goal position"""
+    def rollout(self, params, render=False):
+        """ This function generates a trajectory based on a DMP and then does the usual loop over reset and step"""
         raise NotImplementedError


 class DmpEnvWrapperAngle(DmpEnvWrapperBase):
-    def step(self, action, render=False):
+    """
+    Wrapper for gym environments which creates a trajectory in joint angle space
+    """
+    def rollout(self, action, render=False):
         goal_pos, weight_matrix = self.goal_and_weights(action)
+        if hasattr(self.env, "weight_matrix_scale"):
+            weight_matrix = weight_matrix * self.env.weight_matrix_scale
         self.dmp.set_weights(weight_matrix, goal_pos)
         trajectory, velocities = self.dmp.reference_trajectory(self.t)

         rews = []

+        self.env.reset()
+
         for t, traj in enumerate(trajectory):
             obs, rew, done, info = self.env.step(traj)
             rews.append(rew)
@ -73,21 +95,27 @@ class DmpEnvWrapperAngle(DmpEnvWrapperBase):
break break
reward = np.sum(rews) reward = np.sum(rews)
done = True # done = True
info = {} info = {}
return obs, reward, done, info return obs, reward, done, info
class DmpEnvWrapperVel(DmpEnvWrapperBase): class DmpEnvWrapperVel(DmpEnvWrapperBase):
def step(self, action, render=False): """
Wrapper for gym environments which creates a trajectory in joint velocity space
"""
def rollout(self, action, render=False):
goal_pos, weight_matrix = self.goal_and_weights(action) goal_pos, weight_matrix = self.goal_and_weights(action)
weight_matrix *= 50 if hasattr(self.env, "weight_matrix_scale"):
weight_matrix = weight_matrix * self.env.weight_matrix_scale
self.dmp.set_weights(weight_matrix, goal_pos) self.dmp.set_weights(weight_matrix, goal_pos)
trajectory, velocities = self.dmp.reference_trajectory(self.t) trajectory, velocities = self.dmp.reference_trajectory(self.t)
rews = [] rews = []
self.env.reset()
for t, vel in enumerate(velocities): for t, vel in enumerate(velocities):
obs, rew, done, info = self.env.step(vel) obs, rew, done, info = self.env.step(vel)
rews.append(rew) rews.append(rew)

View File

@@ -2,5 +2,5 @@ from setuptools import setup

 setup(name='alr_envs',
       version='0.0.1',
-      install_requires=['gym', 'PyQt5']  # And any other dependencies foo needs
+      install_requires=['gym', 'PyQt5', 'matplotlib']  # And any other dependencies foo needs
       )