changed from step to rollout method

parent a8fcbd6fb0
commit 104281fe16
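Summary of the change, as read from the hunks below: the DMP wrappers no longer overload `step`; they gain a `rollout(params)` method that builds a reference trajectory from the DMP weights (and goal, if learned), resets the wrapped env, and loops over its real `step`. The `HoleReacher` env gets its `step` re-added with the reward computed inline, the async vector worker calls `rollout` instead of `reset` plus `step`, and matplotlib becomes a declared dependency.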
@@ -1,5 +1,7 @@
 import gym
 import numpy as np
+import matplotlib
+matplotlib.use('TkAgg')
 import matplotlib.pyplot as plt
 from matplotlib import patches
 
@@ -35,6 +37,7 @@ class HoleReacher(gym.Env):
         self._angle_velocity = None
         self.start_pos = np.hstack([[np.pi/2], np.zeros(self.num_links - 1)])
         self.start_vel = np.zeros(self.num_links)
+        self.weight_matrix_scale = 50  # for the holereacher, the dmp weights become quite large compared to the values of the goal attractor. this scaling is to ensure they are on similar scale for the optimizer
 
         self._dt = 0.01
 
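The new `weight_matrix_scale` attribute is picked up later in this commit by the DMP wrappers through a `hasattr` guard, so environments that do not define it keep unscaled weights. A minimal sketch of that consumption pattern (the guard itself appears in a later hunk):

```python
import numpy as np

def scale_weights(env, weight_matrix):
    # mirrors the guard added in this commit: scale only if the env defines the attribute
    if hasattr(env, "weight_matrix_scale"):
        weight_matrix = weight_matrix * env.weight_matrix_scale
    return weight_matrix
```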
@@ -66,37 +69,6 @@ class HoleReacher(gym.Env):
     def end_effector(self):
         return self._joints[self.num_links].T
 
-    def step(self, action):
-        # vel = (action - self._joint_angles) / self._dt
-        # acc = (vel - self._angle_velocity) / self._dt
-        # self._angle_velocity = vel
-        # self._joint_angles = action
-
-        vel = action
-        acc = (vel - self._angle_velocity) / self._dt
-        self._angle_velocity = vel
-        self._joint_angles = self._joint_angles + self._dt * self._angle_velocity
-
-        self._update_joints()
-
-        rew = self._reward()
-
-        rew -= 1e-6 * np.sum(acc**2)
-
-        if self._steps == 180:
-            rew -= (0.1 * np.sum(vel**2) ** 2
-                    + 1e-3 * np.sum(action**2)
-                    )
-
-        if self._is_collided:
-            rew -= self.collision_penalty
-
-        info = {}
-
-        self._steps += 1
-
-        return self._get_obs().copy(), rew, self._is_collided, info
-
     def reset(self):
         self._joint_angles = self.start_pos
         self._angle_velocity = self.start_vel
@@ -106,6 +78,46 @@ class HoleReacher(gym.Env):
 
         return self._get_obs().copy()
 
+    def step(self, action):
+        """
+        a single step with an action in joint velocity space
+        """
+        vel = action
+        acc = (vel - self._angle_velocity) / self._dt
+        self._angle_velocity = vel
+        self._joint_angles = self._joint_angles + self._dt * self._angle_velocity
+
+        self._update_joints()
+
+        # rew = self._reward()
+
+        # compute reward directly in step function
+
+        dist_reward = 0
+        if not self._is_collided:
+            if self._steps == 180:
+                dist_reward = np.linalg.norm(self.end_effector - self.bottom_center_of_hole)
+            else:
+                dist_reward = np.linalg.norm(self.end_effector - self.bottom_center_of_hole)
+
+        reward = - dist_reward ** 2
+
+        reward -= 1e-6 * np.sum(acc**2)
+
+        if self._steps == 180:
+            reward -= (0.1 * np.sum(vel**2) ** 2
+                       + 1e-3 * np.sum(action**2)
+                       )
+
+        if self._is_collided:
+            reward -= self.collision_penalty
+
+        info = {}
+
+        self._steps += 1
+
+        return self._get_obs().copy(), reward, self._is_collided, info
+
     def _update_joints(self):
         """
         update _joints to get new end effector position. The other links are only required for rendering.
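The `step` removed above reappears here almost unchanged, except the reward is now computed inline instead of via `_reward`. The action is interpreted as a joint velocity and integrated with explicit Euler at `dt = 0.01`; the acceleration is only reconstructed for the smoothness penalty. A standalone sketch of the integration (toy values, not from the commit):

```python
import numpy as np

dt = 0.01
angles = np.hstack([[np.pi / 2], np.zeros(4)])  # start_pos for num_links = 5
velocity = np.zeros(5)

action = 0.5 * np.ones(5)          # desired joint velocities
acc = (action - velocity) / dt     # finite-difference acceleration, used only in the penalty
velocity = action
angles = angles + dt * velocity    # explicit Euler update of the joint angles
```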
@@ -140,18 +152,17 @@ class HoleReacher(gym.Env):
             self._steps
         ])
 
-    def _reward(self):
-        dist_reward = 0
-        if not self._is_collided:
-            if self._steps == 180:
-                dist_reward = np.linalg.norm(self.end_effector - self.bottom_center_of_hole)
-            else:
-                dist_reward = np.linalg.norm(self.end_effector - self.bottom_center_of_hole)
-
-        # TODO: make negative
-        out = - dist_reward ** 2
-
-        return out
+    # def _reward(self):
+    #     dist_reward = 0
+    #     if not self._is_collided:
+    #         if self._steps == 180:
+    #             dist_reward = np.linalg.norm(self.end_effector - self.bottom_center_of_hole)
+    #         else:
+    #             dist_reward = np.linalg.norm(self.end_effector - self.bottom_center_of_hole)
+    #
+    #     out = - dist_reward ** 2
+    #
+    #     return out
 
     def get_forward_kinematics(self, num_points_per_link=1):
         theta = self._joint_angles[:, None]
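With `_reward` commented out, the reward lives entirely inside `step`. Note that both branches of the new `if self._steps == 180` / `else` compute the same distance, so that branch is redundant as committed. Collected in one place (a sketch; `hole` stands for `bottom_center_of_hole`):

```python
import numpy as np

def reward_sketch(ee, hole, acc, vel, action, steps, collided, collision_penalty):
    dist = 0.0 if collided else np.linalg.norm(ee - hole)
    r = -dist ** 2
    r -= 1e-6 * np.sum(acc ** 2)  # smoothness penalty on reconstructed accelerations
    if steps == 180:              # extra penalty at the final step of the episode
        r -= 0.1 * np.sum(vel ** 2) ** 2 + 1e-3 * np.sum(action ** 2)
    if collided:
        r -= collision_penalty
    return r
```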
@@ -220,8 +231,8 @@ class HoleReacher(gym.Env):
     def render(self, mode='human'):
         if self.fig is None:
             self.fig = plt.figure()
-            plt.ion()
-            plt.pause(0.01)
+            # plt.ion()
+            # plt.pause(0.01)
         else:
             plt.figure(self.fig.number)
 
@@ -242,25 +253,29 @@ class HoleReacher(gym.Env):
             plt.pause(1e-4)  # pushes window to foreground, which is annoying.
             # self.fig.canvas.flush_events()
 
-        elif render_partial:
-            if t == 0:
-                fig, ax = plt.subplots()
+        elif mode == "partial":
+            if self._steps == 1:
+                # fig, ax = plt.subplots()
                 # Add the patch to the Axes
                 [plt.gca().add_patch(rect) for rect in self.patches]
+                # plt.pause(0.01)
 
-            plt.xlim(-self.num_links, self.num_links), plt.ylim(-1, self.num_links)
-            if t % 20 == 0 or t == 199 or is_collided:
-                ax.plot(line_points_in_taskspace[:, 0, 0],
-                        line_points_in_taskspace[:, 0, 1],
-                        line_points_in_taskspace[:, -1, 0],
-                        line_points_in_taskspace[:, -1, 1], marker='o', color='k', alpha=t / 200)
+            if self._steps % 20 == 0 or self._steps in [1, 199] or self._is_collided:
+                # Arm
+                plt.plot(self._joints[:, 0], self._joints[:, 1], 'ro-', markerfacecolor='k', alpha=self._steps / 200)
+                # ax.plot(line_points_in_taskspace[:, 0, 0],
+                #         line_points_in_taskspace[:, 0, 1],
+                #         line_points_in_taskspace[:, -1, 0],
+                #         line_points_in_taskspace[:, -1, 1], marker='o', color='k', alpha=t / 200)
 
+                lim = np.sum(self.link_lengths) + 0.5
+                plt.xlim([-lim, lim])
+                plt.ylim([-1.1, lim])
                 plt.pause(0.01)
 
-        elif render_final:
-            if self._steps == 0 or self._is_collided:
-                fig, ax = plt.subplots()
+        elif mode == "final":
+            if self._steps == 199 or self._is_collided:
+                # fig, ax = plt.subplots()
 
                 # Add the patch to the Axes
                 [plt.gca().add_patch(rect) for rect in self.patches]
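`render` now dispatches on the `mode` string instead of the removed `render_partial` / `render_final` flags, and the "partial" branch tracks `self._steps` rather than a loop variable `t`. Its draw-throttling condition, pulled out for clarity (a sketch, not part of the commit):

```python
def should_draw(steps, is_collided):
    # redraw every 20th step, on the first and the (zero-indexed) last step, or on collision
    return steps % 20 == 0 or steps in [1, 199] or is_collided
```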
@@ -274,16 +289,18 @@ class HoleReacher(gym.Env):
 
 if __name__ == '__main__':
     nl = 5
+    render_mode = "human"  # "human" or "partial" or "final"
     env = HoleReacher(num_links=nl, allow_self_collision=False, allow_wall_collision=False, hole_width=0.15, hole_depth=1, hole_x=1)
     env.reset()
+    # env.render(mode=render_mode)
 
     for i in range(200):
         # objective.load_result("/tmp/cma")
-        # explore = np.random.multivariate_normal(mean=np.zeros(30), cov=1 * np.eye(30))
-        ac = 11 * env.action_space.sample()
+        # test with random actions
+        ac = 2 * env.action_space.sample()
         # ac[0] += np.pi/2
         obs, rew, done, info = env.step(ac)
-        env.render(mode="render_full")
+        env.render(mode=render_mode)
 
         print(rew)
 
@@ -4,7 +4,6 @@ from gym.vector.utils import concatenate, create_empty_array
 from gym.vector.async_vector_env import AsyncState
 import numpy as np
 import multiprocessing as mp
-from copy import deepcopy
 import sys
 
 
@@ -41,7 +40,6 @@ class DmpAsyncVectorEnv(gym.vector.AsyncVectorEnv):
                 'for a pending call to `{0}` to complete.'.format(
                     self._state.value), self._state.value)
 
-        # split_actions = np.array_split(actions, self.num_envs)
         actions = np.atleast_2d(actions)
         split_actions = np.array_split(actions, np.minimum(len(actions), self.num_envs))
         for pipe, action in zip(self.parent_pipes, split_actions):
@@ -89,6 +87,8 @@ class DmpAsyncVectorEnv(gym.vector.AsyncVectorEnv):
 
         observations_list, rewards, dones, infos = [_flatten_list(r) for r in zip(*results)]
 
+        # for now, we ignore the observations and only return the rewards
+
         # if not self.shared_memory:
         #     self.observations = concatenate(observations_list, self.observations,
         #                                     self.single_observation_space)
@@ -124,8 +124,7 @@ def _worker(index, env_fn, pipe, parent_pipe, shared_memory, error_queue):
             dones = []
             infos = []
             for d in data:
-                env.reset()
-                observation, reward, done, info = env.step(d)
+                observation, reward, done, info = env.rollout(d)
                 observations.append(observation)
                 rewards.append(reward)
                 dones.append(done)
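The worker no longer resets the env and feeds a single action per call; it hands the whole parameter vector to `env.rollout(d)`, which is expected to reset internally and return the summed episode reward (see the wrapper hunks below). A hypothetical minimal stand-in honoring that contract:

```python
import numpy as np

class RolloutEnvStub:
    """toy env exposing the rollout interface _worker relies on after this commit"""

    def rollout(self, params):
        # reset + full episode happen inside; one call per parameter vector
        params = np.asarray(params, dtype=float)
        obs = np.zeros(3)
        reward = -float(np.sum(params ** 2))  # stand-in for the summed episode return
        return obs, reward, True, {}
```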
@@ -14,7 +14,7 @@ class DmpEnvWrapperBase(gym.Wrapper):
         if learn_goal:
             self.dim += num_dof
             self.learn_goal = True
-        duration = duration  # seconds
+        self.duration = duration  # seconds
         time_steps = int(duration / dt)
         self.t = np.linspace(0, duration, time_steps)
 
@@ -35,6 +35,21 @@ class DmpEnvWrapperBase(gym.Wrapper):
 
         self.dmp.set_weights(dmp_weights, dmp_goal_pos)
 
+    def __call__(self, params):
+        params = np.atleast_2d(params)
+        observations = []
+        rewards = []
+        dones = []
+        infos = []
+        for p in params:
+            observation, reward, done, info = self.rollout(p)
+            observations.append(observation)
+            rewards.append(reward)
+            dones.append(done)
+            infos.append(info)
+
+        return np.array(rewards)
+
     def goal_and_weights(self, params):
         if len(params.shape) > 1:
             assert params.shape[1] == self.dim
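`__call__` turns a wrapped env into a black-box objective: it broadcasts a batch of parameter vectors through `rollout` and returns only the rewards (observations, dones and infos are collected but discarded, matching the "ignore the observations" note in the vector-env hunk above). A hedged usage sketch, where `objective` is an already-constructed `DmpEnvWrapper*` instance:

```python
import numpy as np

def evaluate_batch(objective, dim, n=8, seed=0):
    # e.g. candidate solutions from an optimizer such as CMA-ES
    # (the "/tmp/cma" comment elsewhere in this commit hints at that use)
    rng = np.random.default_rng(seed)
    params_batch = rng.standard_normal((n, dim))
    return objective(params_batch)  # one summed episode reward per row
```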
@@ -51,19 +66,26 @@ class DmpEnvWrapperBase(gym.Wrapper):
 
         return goal_pos, weight_matrix
 
-    def step(self, action, render=False):
-        """ overwrite step function where action now is the weights and possible goal position"""
+    def rollout(self, params, render=False):
+        """ This function generates a trajectory based on a DMP and then does the usual loop over reset and step"""
         raise NotImplementedError
 
 
 class DmpEnvWrapperAngle(DmpEnvWrapperBase):
-    def step(self, action, render=False):
+    """
+    Wrapper for gym environments which creates a trajectory in joint angle space
+    """
+    def rollout(self, action, render=False):
         goal_pos, weight_matrix = self.goal_and_weights(action)
+        if hasattr(self.env, "weight_matrix_scale"):
+            weight_matrix = weight_matrix * self.env.weight_matrix_scale
         self.dmp.set_weights(weight_matrix, goal_pos)
         trajectory, velocities = self.dmp.reference_trajectory(self.t)
 
         rews = []
 
+        self.env.reset()
+
         for t, traj in enumerate(trajectory):
             obs, rew, done, info = self.env.step(traj)
             rews.append(rew)
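Both concrete wrappers now share the same shape: convert params to goal and weights, optionally scale, synthesize a reference trajectory from the DMP, reset, then replay the trajectory through the env's real `step`. The shared loop, sketched as a free function under those assumptions:

```python
import numpy as np

def rollout_sketch(env, trajectory):
    """schematic of the reset-then-replay loop used by both wrappers"""
    env.reset()
    rews = []
    obs, done, info = None, False, {}
    for point in trajectory:  # joint angles or velocities, one per time step
        obs, rew, done, info = env.step(point)
        rews.append(rew)
        if done:
            break
    return obs, np.sum(rews), done, info
```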
@@ -73,21 +95,27 @@ class DmpEnvWrapperAngle(DmpEnvWrapperBase):
                 break
 
         reward = np.sum(rews)
-        done = True
+        # done = True
        info = {}
 
         return obs, reward, done, info
 
 
 class DmpEnvWrapperVel(DmpEnvWrapperBase):
-    def step(self, action, render=False):
+    """
+    Wrapper for gym environments which creates a trajectory in joint velocity space
+    """
+    def rollout(self, action, render=False):
         goal_pos, weight_matrix = self.goal_and_weights(action)
-        weight_matrix *= 50
+        if hasattr(self.env, "weight_matrix_scale"):
+            weight_matrix = weight_matrix * self.env.weight_matrix_scale
         self.dmp.set_weights(weight_matrix, goal_pos)
         trajectory, velocities = self.dmp.reference_trajectory(self.t)
 
         rews = []
 
+        self.env.reset()
+
         for t, vel in enumerate(velocities):
             obs, rew, done, info = self.env.step(vel)
             rews.append(rew)
setup.py
@@ -2,5 +2,5 @@ from setuptools import setup
 
 setup(name='alr_envs',
       version='0.0.1',
-      install_requires=['gym', 'PyQt5']  # And any other dependencies foo needs
+      install_requires=['gym', 'PyQt5', 'matplotlib']  # And any other dependencies foo needs
       )
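matplotlib joins `install_requires` because the environment module now imports it unconditionally (and selects the TkAgg backend) at import time, per the first hunk of this commit.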