changed from step to rollout method
parent a8fcbd6fb0
commit 104281fe16
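Summary of the change: the DMP wrappers and the async vector env now expose a whole episode through a single rollout(params) call instead of the gym-style step(action) interface. rollout resets the wrapped environment itself, plays back the trajectory generated from the DMP parameters step by step, and returns the summed episode reward; the worker and the new __call__ method simply loop over parameter vectors. A minimal sketch of the new calling pattern (wrapped_env and param_batch are placeholder names, not identifiers from this commit):

import numpy as np

def evaluate(wrapped_env, param_batch):
    # one rollout per DMP parameter vector; reset and the per-step loop
    # happen inside rollout, so no explicit reset/step calls are needed here
    returns = []
    for params in np.atleast_2d(param_batch):
        obs, reward, done, info = wrapped_env.rollout(params)
        returns.append(reward)
    return np.array(returns)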
@@ -1,5 +1,7 @@
import gym
import numpy as np
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
from matplotlib import patches

@@ -35,6 +37,7 @@ class HoleReacher(gym.Env):
        self._angle_velocity = None
        self.start_pos = np.hstack([[np.pi/2], np.zeros(self.num_links - 1)])
        self.start_vel = np.zeros(self.num_links)
        self.weight_matrix_scale = 50  # for the holereacher, the dmp weights become quite large compared to the values of the goal attractor. this scaling is to ensure they are on similar scale for the optimizer

        self._dt = 0.01

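Note: weight_matrix_scale is consumed by the DMP wrappers changed further down in this commit; if the wrapped environment defines the attribute, the sampled DMP weights are multiplied by it before being passed to the DMP, so that weights and goal parameters end up on a similar scale for the optimizer. A small standalone restatement of the hasattr check used in the rollout methods below:

def scale_weights(env, weight_matrix):
    # mirrors the hasattr check in DmpEnvWrapperAngle/DmpEnvWrapperVel.rollout
    if hasattr(env, "weight_matrix_scale"):
        return weight_matrix * env.weight_matrix_scale
    return weight_matrix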
@@ -66,37 +69,6 @@ class HoleReacher(gym.Env):
    def end_effector(self):
        return self._joints[self.num_links].T

    def step(self, action):
        # vel = (action - self._joint_angles) / self._dt
        # acc = (vel - self._angle_velocity) / self._dt
        # self._angle_velocity = vel
        # self._joint_angles = action

        vel = action
        acc = (vel - self._angle_velocity) / self._dt
        self._angle_velocity = vel
        self._joint_angles = self._joint_angles + self._dt * self._angle_velocity

        self._update_joints()

        rew = self._reward()

        rew -= 1e-6 * np.sum(acc**2)

        if self._steps == 180:
            rew -= (0.1 * np.sum(vel**2) ** 2
                    + 1e-3 * np.sum(action**2)
                    )

        if self._is_collided:
            rew -= self.collision_penalty

        info = {}

        self._steps += 1

        return self._get_obs().copy(), rew, self._is_collided, info

    def reset(self):
        self._joint_angles = self.start_pos
        self._angle_velocity = self.start_vel
@@ -106,6 +78,46 @@ class HoleReacher(gym.Env):

        return self._get_obs().copy()

    def step(self, action):
        """
        a single step with an action in joint velocity space
        """
        vel = action
        acc = (vel - self._angle_velocity) / self._dt
        self._angle_velocity = vel
        self._joint_angles = self._joint_angles + self._dt * self._angle_velocity

        self._update_joints()

        # rew = self._reward()

        # compute reward directly in step function

        dist_reward = 0
        if not self._is_collided:
            if self._steps == 180:
                dist_reward = np.linalg.norm(self.end_effector - self.bottom_center_of_hole)
            else:
                dist_reward = np.linalg.norm(self.end_effector - self.bottom_center_of_hole)

        reward = - dist_reward ** 2

        reward -= 1e-6 * np.sum(acc**2)

        if self._steps == 180:
            reward -= (0.1 * np.sum(vel**2) ** 2
                       + 1e-3 * np.sum(action**2)
                       )

        if self._is_collided:
            reward -= self.collision_penalty

        info = {}

        self._steps += 1

        return self._get_obs().copy(), reward, self._is_collided, info

    def _update_joints(self):
        """
        update _joints to get new end effector position. The other links are only required for rendering.
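For reference, the reward that is now computed inline in the new step method above reduces to the following per-step rule (a condensed restatement of the code in this hunk, nothing new; note that the if self._steps == 180 / else branches compute the same distance, so they collapse to a single term here):

import numpy as np

def step_reward(env, vel, acc, action, final_step=180):
    # squared distance of the end effector to the bottom of the hole, unless collided
    reward = 0.0
    if not env._is_collided:
        dist = np.linalg.norm(env.end_effector - env.bottom_center_of_hole)
        reward = -dist ** 2
    # small penalty on joint accelerations every step
    reward -= 1e-6 * np.sum(acc ** 2)
    # extra velocity/action penalty at the final step
    if env._steps == final_step:
        reward -= 0.1 * np.sum(vel ** 2) ** 2 + 1e-3 * np.sum(action ** 2)
    # large penalty if the arm hit a wall or itself
    if env._is_collided:
        reward -= env.collision_penalty
    return reward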
@@ -140,18 +152,17 @@ class HoleReacher(gym.Env):
            self._steps
        ])

    def _reward(self):
        dist_reward = 0
        if not self._is_collided:
            if self._steps == 180:
                dist_reward = np.linalg.norm(self.end_effector - self.bottom_center_of_hole)
            else:
                dist_reward = np.linalg.norm(self.end_effector - self.bottom_center_of_hole)

        # TODO: make negative
        out = - dist_reward ** 2

        return out
    # def _reward(self):
    #     dist_reward = 0
    #     if not self._is_collided:
    #         if self._steps == 180:
    #             dist_reward = np.linalg.norm(self.end_effector - self.bottom_center_of_hole)
    #         else:
    #             dist_reward = np.linalg.norm(self.end_effector - self.bottom_center_of_hole)
    #
    #     out = - dist_reward ** 2
    #
    #     return out

    def get_forward_kinematics(self, num_points_per_link=1):
        theta = self._joint_angles[:, None]
@@ -220,8 +231,8 @@ class HoleReacher(gym.Env):
    def render(self, mode='human'):
        if self.fig is None:
            self.fig = plt.figure()
            plt.ion()
            plt.pause(0.01)
            # plt.ion()
            # plt.pause(0.01)
        else:
            plt.figure(self.fig.number)

@@ -242,25 +253,29 @@ class HoleReacher(gym.Env):
            plt.pause(1e-4)  # pushes window to foreground, which is annoying.
            # self.fig.canvas.flush_events()

        elif render_partial:
            if t == 0:
                fig, ax = plt.subplots()
        elif mode == "partial":
            if self._steps == 1:
                # fig, ax = plt.subplots()
                # Add the patch to the Axes
                [plt.gca().add_patch(rect) for rect in self.patches]
                # plt.pause(0.01)

                plt.xlim(-self.num_links, self.num_links), plt.ylim(-1, self.num_links)

            if t % 20 == 0 or t == 199 or is_collided:
                ax.plot(line_points_in_taskspace[:, 0, 0],
                        line_points_in_taskspace[:, 0, 1],
                        line_points_in_taskspace[:, -1, 0],
                        line_points_in_taskspace[:, -1, 1], marker='o', color='k', alpha=t / 200)
            if self._steps % 20 == 0 or self._steps in [1, 199] or self._is_collided:
                # Arm
                plt.plot(self._joints[:, 0], self._joints[:, 1], 'ro-', markerfacecolor='k', alpha=self._steps / 200)
                # ax.plot(line_points_in_taskspace[:, 0, 0],
                #         line_points_in_taskspace[:, 0, 1],
                #         line_points_in_taskspace[:, -1, 0],
                #         line_points_in_taskspace[:, -1, 1], marker='o', color='k', alpha=t / 200)

                lim = np.sum(self.link_lengths) + 0.5
                plt.xlim([-lim, lim])
                plt.ylim([-1.1, lim])
                plt.pause(0.01)

        elif render_final:
            if self._steps == 0 or self._is_collided:
                fig, ax = plt.subplots()
        elif mode == "final":
            if self._steps == 199 or self._is_collided:
                # fig, ax = plt.subplots()

                # Add the patch to the Axes
                [plt.gca().add_patch(rect) for rect in self.patches]
@@ -274,16 +289,18 @@ class HoleReacher(gym.Env):

if __name__ == '__main__':
    nl = 5
    render_mode = "human"  # "human" or "partial" or "final"
    env = HoleReacher(num_links=nl, allow_self_collision=False, allow_wall_collision=False, hole_width=0.15, hole_depth=1, hole_x=1)
    env.reset()
    # env.render(mode=render_mode)

    for i in range(200):
        # objective.load_result("/tmp/cma")
        # explore = np.random.multivariate_normal(mean=np.zeros(30), cov=1 * np.eye(30))
        ac = 11 * env.action_space.sample()
        # test with random actions
        ac = 2 * env.action_space.sample()
        # ac[0] += np.pi/2
        obs, rew, done, info = env.step(ac)
        env.render(mode="render_full")
        env.render(mode=render_mode)

        print(rew)

@@ -4,7 +4,6 @@ from gym.vector.utils import concatenate, create_empty_array
from gym.vector.async_vector_env import AsyncState
import numpy as np
import multiprocessing as mp
from copy import deepcopy
import sys


@@ -41,7 +40,6 @@ class DmpAsyncVectorEnv(gym.vector.AsyncVectorEnv):
                'for a pending call to `{0}` to complete.'.format(
                    self._state.value), self._state.value)

        # split_actions = np.array_split(actions, self.num_envs)
        actions = np.atleast_2d(actions)
        split_actions = np.array_split(actions, np.minimum(len(actions), self.num_envs))
        for pipe, action in zip(self.parent_pipes, split_actions):
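The np.minimum(len(actions), self.num_envs) argument keeps np.array_split from producing empty chunks when fewer parameter vectors than worker environments are submitted. A quick illustration with plain numpy (the batch size 3 and worker count 8 are made-up example values):

import numpy as np

actions = np.atleast_2d(np.zeros((3, 25)))                      # 3 parameter vectors
chunks = np.array_split(actions, np.minimum(len(actions), 8))   # 8 would be num_envs
print([c.shape for c in chunks])                                # [(1, 25), (1, 25), (1, 25)], no empty chunks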
@@ -89,6 +87,8 @@ class DmpAsyncVectorEnv(gym.vector.AsyncVectorEnv):

        observations_list, rewards, dones, infos = [_flatten_list(r) for r in zip(*results)]

        # for now, we ignore the observations and only return the rewards

        # if not self.shared_memory:
        #     self.observations = concatenate(observations_list, self.observations,
        #                                     self.single_observation_space)
@@ -124,8 +124,7 @@ def _worker(index, env_fn, pipe, parent_pipe, shared_memory, error_queue):
            dones = []
            infos = []
            for d in data:
                env.reset()
                observation, reward, done, info = env.step(d)
                observation, reward, done, info = env.rollout(d)
                observations.append(observation)
                rewards.append(reward)
                dones.append(done)

@@ -14,7 +14,7 @@ class DmpEnvWrapperBase(gym.Wrapper):
        if learn_goal:
            self.dim += num_dof
            self.learn_goal = True
        duration = duration  # seconds
        self.duration = duration  # seconds
        time_steps = int(duration / dt)
        self.t = np.linspace(0, duration, time_steps)

@@ -35,6 +35,21 @@ class DmpEnvWrapperBase(gym.Wrapper):

        self.dmp.set_weights(dmp_weights, dmp_goal_pos)

    def __call__(self, params):
        params = np.atleast_2d(params)
        observations = []
        rewards = []
        dones = []
        infos = []
        for p in params:
            observation, reward, done, info = self.rollout(p)
            observations.append(observation)
            rewards.append(reward)
            dones.append(done)
            infos.append(info)

        return np.array(rewards)

    def goal_and_weights(self, params):
        if len(params.shape) > 1:
            assert params.shape[1] == self.dim
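With __call__ in place, a wrapper instance can be used directly as a black-box objective over DMP parameters: a single vector is broadcast to 2-D, one rollout is run per row, and only the array of summed episode rewards is returned (observations, dones and infos are collected but not used). A hedged usage sketch, assuming wrapper is an already constructed DmpEnvWrapperVel (its constructor arguments are not shown in this diff):

import numpy as np

# wrapper = DmpEnvWrapperVel(...)                 # construction not shown in this diff
candidates = np.random.randn(10, wrapper.dim)     # e.g. a population of 10 parameter vectors
rewards = wrapper(candidates)                     # shape (10,): one episode return per row
best = candidates[np.argmax(rewards)]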
@@ -51,19 +66,26 @@ class DmpEnvWrapperBase(gym.Wrapper):

        return goal_pos, weight_matrix

    def step(self, action, render=False):
        """ overwrite step function where action now is the weights and possible goal position"""
    def rollout(self, params, render=False):
        """ This function generates a trajectory based on a DMP and then does the usual loop over reset and step"""
        raise NotImplementedError


class DmpEnvWrapperAngle(DmpEnvWrapperBase):
    def step(self, action, render=False):
    """
    Wrapper for gym environments which creates a trajectory in joint angle space
    """
    def rollout(self, action, render=False):
        goal_pos, weight_matrix = self.goal_and_weights(action)
        if hasattr(self.env, "weight_matrix_scale"):
            weight_matrix = weight_matrix * self.env.weight_matrix_scale
        self.dmp.set_weights(weight_matrix, goal_pos)
        trajectory, velocities = self.dmp.reference_trajectory(self.t)

        rews = []

        self.env.reset()

        for t, traj in enumerate(trajectory):
            obs, rew, done, info = self.env.step(traj)
            rews.append(rew)
@@ -73,21 +95,27 @@ class DmpEnvWrapperAngle(DmpEnvWrapperBase):
                break

        reward = np.sum(rews)
        done = True
        # done = True
        info = {}

        return obs, reward, done, info


class DmpEnvWrapperVel(DmpEnvWrapperBase):
    def step(self, action, render=False):
    """
    Wrapper for gym environments which creates a trajectory in joint velocity space
    """
    def rollout(self, action, render=False):
        goal_pos, weight_matrix = self.goal_and_weights(action)
        weight_matrix *= 50
        if hasattr(self.env, "weight_matrix_scale"):
            weight_matrix = weight_matrix * self.env.weight_matrix_scale
        self.dmp.set_weights(weight_matrix, goal_pos)
        trajectory, velocities = self.dmp.reference_trajectory(self.t)

        rews = []

        self.env.reset()

        for t, vel in enumerate(velocities):
            obs, rew, done, info = self.env.step(vel)
            rews.append(rew)