Maximilian Huettenrauch 2021-02-11 12:32:32 +01:00
parent c81378b9e7
commit 13a292f0e0
10 changed files with 116 additions and 155 deletions

View File

@@ -37,7 +37,7 @@ class HoleReacher(gym.Env):
         self.start_vel = np.zeros(self.num_links)
         self.weight_matrix_scale = 50  # for the holereacher, the dmp weights become quite large compared to the values of the goal attractor. this scaling is to ensure they are on similar scale for the optimizer
-        self._dt = 0.01
+        self.dt = 0.01
         self.time_limit = 2
         action_bound = np.pi * np.ones((self.num_links,))
@@ -82,9 +82,9 @@ class HoleReacher(gym.Env):
         a single step with an action in joint velocity space
         """
         vel = action
-        acc = (vel - self._angle_velocity) / self._dt
+        acc = (vel - self._angle_velocity) / self.dt
         self._angle_velocity = vel
-        self._joint_angles = self._joint_angles + self._dt * self._angle_velocity
+        self._joint_angles = self._joint_angles + self.dt * self._angle_velocity
         self._update_joints()
@@ -113,7 +113,7 @@ class HoleReacher(gym.Env):
         self._steps += 1
-        done = self._steps * self._dt > self.time_limit or self._is_collided
+        done = self._steps * self.dt > self.time_limit or self._is_collided
         return self._get_obs().copy(), reward, done, info
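
Note: exposing the step size as the public self.dt (instead of self._dt) lets downstream wrappers read it without touching a private attribute. A minimal sketch of the intended usage, with wrapper arguments taken from the example scripts in this commit:

    from alr_envs.classic_control.hole_reacher import HoleReacher
    from alr_envs.utils.dmp_env_wrapper import DmpEnvWrapper

    env = HoleReacher(num_links=5, allow_self_collision=False, allow_wall_collision=False,
                      hole_width=0.15, hole_depth=1, hole_x=1)
    # the wrapper now reads the step size via the public attribute
    wrapped = DmpEnvWrapper(env, num_dof=5, num_basis=5, duration=2, dt=env.dt, learn_goal=True)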

View File

@@ -26,7 +26,7 @@ def make_env(rank, seed=0):
                             num_dof=5,
                             num_basis=5,
                             duration=2,
-                            dt=env._dt,
+                            dt=env.dt,
                             learn_goal=True)
         env.seed(seed + rank)
         return env

View File

@@ -73,7 +73,7 @@ class AlrMujocoEnv(gym.Env):
         # observation, _reward, done, _info = self.step(action)
         # assert not done
-        observation = self.reset()
+        observation = self._get_obs()  # TODO: is calling get_obs enough? should we call reset, or even step?
         self._set_observation_space(observation)
@@ -82,14 +82,14 @@ class AlrMujocoEnv(gym.Env):
     @property
     def current_pos(self):
         """
-        By default returns the joint positions of all simulated objects. May be overriden in subclass.
+        By default returns the joint positions of all simulated objects. May be overridden in subclass.
         """
         return self.sim.data.qpos

     @property
     def current_vel(self):
         """
-        By default returns the joint velocities of all simulated objects. May be overriden in subclass.
+        By default returns the joint velocities of all simulated objects. May be overridden in subclass.
         """
         return self.sim.data.qvel
@@ -125,10 +125,15 @@ class AlrMujocoEnv(gym.Env):
     # methods to override:
     # ----------------------------
+    def _get_obs(self):
+        """Returns the observation.
+        """
+        raise NotImplementedError()
+
     def configure(self, *args, **kwargs):
         """
         Helper method to set certain environment properties such as contexts in contextual environments since reset()
-        doesn't take arguments. Should be called before/after reset(). TODO: before or after?
+        doesn't take arguments. Should be called before reset().
         """
         pass
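
Note: together with the new _get_obs() stub, the configure() docstring now fixes the subclass contract: implement _get_obs(), and call configure(context) before reset(). A minimal sketch of a conforming subclass (the class name and observation layout are illustrative, not part of this commit):

    import numpy as np
    from alr_envs.mujoco import alr_mujoco_env

    class MyContextualEnv(alr_mujoco_env.AlrMujocoEnv):  # illustrative subclass
        def _get_obs(self):
            # required: the base class now raises NotImplementedError
            return np.concatenate([self.sim.data.qpos.flat, self.sim.data.qvel.flat])

        def configure(self, context):
            # store the context; meant to be called before reset()
            self.context = context

    # env = MyContextualEnv(xml_path, n_substeps=4)  # constructor arguments as in the ball-in-a-cup envs
    # env.configure(context)
    # env.reset()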

View File

@@ -1,12 +1,12 @@
-from gym.envs.mujoco import mujoco_env
 from gym import utils
 import os
 import numpy as np
-from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward import BallInACupReward
+from alr_envs.mujoco import alr_mujoco_env
+from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward_simple import BallInACupReward
 import mujoco_py

-class ALRBallInACupEnv(mujoco_env.MujocoEnv, utils.EzPickle):
+class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
     def __init__(self, ):
         self._steps = 0
@@ -21,8 +21,12 @@ class ALRBallInACupEnv(mujoco_env.MujocoEnv, utils.EzPickle):
         self._q_pos = []

         utils.EzPickle.__init__(self)
-        mujoco_env.MujocoEnv.__init__(self, os.path.join(os.path.dirname(__file__), "assets", "ball-in-a-cup_base.xml"),
-                                      frame_skip=4)
+        alr_mujoco_env.AlrMujocoEnv.__init__(self, os.path.join(os.path.dirname(__file__), "assets", "ball-in-a-cup_base.xml"),
+                                             n_substeps=4)
+
+    def configure(self, context):
+        self.context = context
+        self.reward_function.reset(context)

     def reset_model(self):
         start_pos = self.init_qpos.copy()
@@ -30,24 +34,8 @@ class ALRBallInACupEnv(mujoco_env.MujocoEnv, utils.EzPickle):
         start_vel = np.zeros_like(start_pos)
         self.set_state(start_pos, start_vel)
         self._steps = 0
-        self.reward_function.reset()
         self._q_pos = []

-    def do_simulation(self, ctrl, n_frames):
-        self.sim.data.ctrl[:] = ctrl
-        for _ in range(n_frames):
-            try:
-                self.sim.step()
-            except mujoco_py.builder.MujocoException as e:
-                # print("Error in simulation: " + str(e))
-                # error = True
-                # Copy the current torque as if it would have been applied until the end of the trajectory
-                # for i in range(k + 1, sim_time):
-                #     torques.append(trq)
-                return True
-        return False
-
     def step(self, a):
         # Apply gravity compensation
         if not np.all(self.sim.data.qfrc_applied[:7] == self.sim.data.qfrc_bias[:7]):
@@ -98,6 +86,7 @@ class ALRBallInACupEnv(mujoco_env.MujocoEnv, utils.EzPickle):
 if __name__ == "__main__":
     env = ALRBallInACupEnv()
+    env.configure(None)
     env.reset()
     for i in range(2000):
         # objective.load_result("/tmp/cma")

View File

@@ -26,9 +26,9 @@ class BallInACupReward(alr_reward_fct.AlrReward):
         self.dists_final = None
         self.costs = None

-        self.reset()
+        self.reset(None)

-    def reset(self):
+    def reset(self, context):
         self.ball_traj = np.zeros(shape=(self.sim_time, 3))
         self.dists = []
         self.dists_final = []
@@ -51,11 +51,12 @@ class BallInACupReward(alr_reward_fct.AlrReward):
         self.dists_final.append(np.linalg.norm(goal_final_pos - ball_pos))
         self.ball_traj[step, :] = ball_pos

-        if self.check_collision(sim):
-            return -1000, False, True
-
         action_cost = np.sum(np.square(action))

+        if self.check_collision(sim):
+            reward = - 1e-5 * action_cost - 1000
+            return reward, False, True
+
         if step == self.sim_time - 1:
             min_dist = np.min(self.dists)
             dist_final = self.dists_final[-1]
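
Note: since env.configure(context) now forwards the context to reward_function.reset(context), any reward class plugged into the ball-in-a-cup environments has to accept that argument. A minimal sketch of the expected shape (illustrative stand-in class; only the constructor/reset interface shown here is implied by this commit):

    class MyBallInACupReward:  # illustrative custom reward
        def __init__(self, sim_steps):
            self.sim_time = sim_steps
            self.reset(None)            # mirrors BallInACupReward: reset takes an explicit context

        def reset(self, context):       # required so that env.configure(context) can forward the context
            self.context = context
            self.dists = []
            self.dists_final = []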

View File

@ -1,21 +1,16 @@
@@ -1,21 +1,16 @@
 from alr_envs.mujoco import alr_mujoco_env
-from gym import utils, spaces
+from gym import utils
 import os
 import numpy as np
-from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward import BallInACupReward

 class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
-    def __init__(self, reward_function=None):
+    def __init__(self, n_substeps=4, apply_gravity_comp=True, reward_function=None):
         self._steps = 0
         self.xml_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets",
                                      "biac_base" + ".xml")
-        self.sim_time = 8  # seconds
-        self.sim_steps = int(self.sim_time / (0.0005 * 4))  # circular dependency.. sim.dt <-> mujocoenv init <-> reward fct
-        self.reward_function = reward_function(self.sim_steps)

         self.start_pos = np.array([0.0, 0.58760536, 0.0, 1.36004913, 0.0, -0.32072943, -1.57])
         self.start_vel = np.zeros(7)
@@ -34,8 +29,15 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
         utils.EzPickle.__init__(self)
         alr_mujoco_env.AlrMujocoEnv.__init__(self,
                                              self.xml_path,
-                                             apply_gravity_comp=True,
-                                             n_substeps=4)
+                                             apply_gravity_comp=apply_gravity_comp,
+                                             n_substeps=n_substeps)
+
+        self.sim_time = 8  # seconds
+        self.sim_steps = int(self.sim_time / self.dt)
+        if reward_function is None:
+            from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward_simple import BallInACupReward
+            reward_function = BallInACupReward
+        self.reward_function = reward_function(self.sim_steps)

     @property
     def current_pos(self):
@@ -47,6 +49,7 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
     def configure(self, context):
         self.context = context
+        self.reward_function.reset(context)

     def reset_model(self):
         init_pos_all = self.init_qpos.copy()
@@ -56,7 +59,6 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
         goal_final_id = self.sim.model._site_name2id["cup_goal_final"]

         self._steps = 0
-        self.reward_function.reset()
         self._q_pos = []
         self._q_vel = []
@@ -65,38 +67,6 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
         self.set_state(start_pos, init_vel)

-        # Reset the system
-        # self.sim.data.qpos[:] = init_pos_all
-        # self.sim.data.qvel[:] = init_vel
-        # self.sim.data.qpos[0:7] = init_pos_robot
-        #
-        # self.sim.step()
-        #
-        # self.sim.data.qpos[:] = init_pos_all
-        # self.sim.data.qvel[:] = init_vel
-        # self.sim.data.qpos[0:7] = init_pos_robot
-        # self.sim.data.body_xpos[ball_id, :] = np.copy(self.sim.data.site_xpos[goal_final_id, :]) - np.array([0., 0., 0.329])
-        #
-        # # Stabilize the system around the initial position
-        # for i in range(0, 500):
-        #     self.sim.data.qpos[7:] = 0.
-        #     self.sim.data.qvel[7:] = 0.
-        #     # self.sim.data.qpos[7] = -0.2
-        #     cur_pos = self.sim.data.qpos[0:7].copy()
-        #     cur_vel = self.sim.data.qvel[0:7].copy()
-        #     trq = self.p_gains * (init_pos_robot - cur_pos) + self.d_gains * (np.zeros_like(init_pos_robot) - cur_vel)
-        #     self.sim.data.qfrc_applied[0:7] = trq + self.sim.data.qfrc_bias[:7].copy()
-        #     self.sim.step()
-        #     self.render()
-        #
-        # for i in range(0, 500):
-        #     cur_pos = self.sim.data.qpos[0:7].copy()
-        #     cur_vel = self.sim.data.qvel[0:7].copy()
-        #     trq = self.p_gains * (init_pos_robot - cur_pos) + self.d_gains * (np.zeros_like(init_pos_robot) - cur_vel)
-        #     self.sim.data.qfrc_applied[0:7] = trq + self.sim.data.qfrc_bias[:7].copy()
-        #     self.sim.step()
-        #     self.render()
-
         return self._get_obs()

     def step(self, a):
@@ -154,7 +124,10 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
 if __name__ == "__main__":
+    from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward_simple import BallInACupReward
     env = ALRBallInACupEnv(reward_function=BallInACupReward)
+    env.configure(None)
     env.reset()
     env.render()
     for i in range(4000):
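
Note: computing self.sim_steps after the AlrMujocoEnv super-init removes the hard-coded 0.0005 * 4 timestep and the old circular-dependency comment, since self.dt is only available once the MuJoCo model has been loaded. Abbreviated sketch of the resulting constructor order (not the full __init__):

    alr_mujoco_env.AlrMujocoEnv.__init__(self, self.xml_path,
                                         apply_gravity_comp=apply_gravity_comp,
                                         n_substeps=n_substeps)   # self.dt becomes available here
    self.sim_time = 8  # seconds
    self.sim_steps = int(self.sim_time / self.dt)                 # derived from the actual dt
    self.reward_function = reward_function(self.sim_steps)        # reward sized from the real step count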

View File

@@ -7,9 +7,54 @@ import multiprocessing as mp
 import sys

+def _worker(index, env_fn, pipe, parent_pipe, shared_memory, error_queue):
+    assert shared_memory is None
+    env = env_fn()
+    parent_pipe.close()
+    try:
+        while True:
+            command, data = pipe.recv()
+            if command == 'reset':
+                observation = env.reset()
+                pipe.send((observation, True))
+            elif command == 'step':
+                observation, reward, done, info = env.step(data)
+                if done:
+                    observation = env.reset()
+                pipe.send(((observation, reward, done, info), True))
+            elif command == 'rollout':
+                rewards = []
+                infos = []
+                for p, c in zip(*data):
+                    reward, info = env.rollout(p, c)
+                    rewards.append(reward)
+                    infos.append(info)
+                pipe.send(((rewards, infos), (True, ) * len(rewards)))
+            elif command == 'seed':
+                env.seed(data)
+                pipe.send((None, True))
+            elif command == 'close':
+                env.close()
+                pipe.send((None, True))
+                break
+            elif command == 'idle':
+                pipe.send((None, True))
+            elif command == '_check_observation_space':
+                pipe.send((data == env.observation_space, True))
+            else:
+                raise RuntimeError('Received unknown command `{0}`. Must '
+                                   'be one of {`reset`, `step`, `seed`, `close`, '
+                                   '`_check_observation_space`}.'.format(command))
+    except (KeyboardInterrupt, Exception):
+        error_queue.put((index,) + sys.exc_info()[:2])
+        pipe.send((None, False))
+    finally:
+        env.close()

 class DmpAsyncVectorEnv(gym.vector.AsyncVectorEnv):
     def __init__(self, env_fns, n_samples, observation_space=None, action_space=None,
-                 shared_memory=True, copy=True, context=None, daemon=True, worker=None):
+                 shared_memory=False, copy=True, context="spawn", daemon=True, worker=_worker):
         super(DmpAsyncVectorEnv, self).__init__(env_fns,
                                                 observation_space=observation_space,
                                                 action_space=action_space,
@@ -91,7 +136,7 @@ class DmpAsyncVectorEnv(gym.vector.AsyncVectorEnv):
         self._raise_if_errors(successes)
         self._state = AsyncState.DEFAULT

-        observations_list, rewards, dones, infos = [_flatten_list(r) for r in zip(*results)]
+        rewards, infos = [_flatten_list(r) for r in zip(*results)]

         # for now, we ignore the observations and only return the rewards
@@ -109,55 +154,6 @@ class DmpAsyncVectorEnv(gym.vector.AsyncVectorEnv):
         return self.rollout_wait()

-def _worker(index, env_fn, pipe, parent_pipe, shared_memory, error_queue):
-    assert shared_memory is None
-    env = env_fn()
-    parent_pipe.close()
-    try:
-        while True:
-            command, data = pipe.recv()
-            if command == 'reset':
-                observation = env.reset()
-                pipe.send((observation, True))
-            elif command == 'step':
-                observation, reward, done, info = env.step(data)
-                if done:
-                    observation = env.reset()
-                pipe.send(((observation, reward, done, info), True))
-            elif command == 'rollout':
-                observations = []
-                rewards = []
-                dones = []
-                infos = []
-                for p, c in zip(*data):
-                    observation, reward, done, info = env.rollout(p, c)
-                    observations.append(observation)
-                    rewards.append(reward)
-                    dones.append(done)
-                    infos.append(info)
-                pipe.send(((observations, rewards, dones, infos), (True, ) * len(rewards)))
-            elif command == 'seed':
-                env.seed(data)
-                pipe.send((None, True))
-            elif command == 'close':
-                env.close()
-                pipe.send((None, True))
-                break
-            elif command == 'idle':
-                pipe.send((None, True))
-            elif command == '_check_observation_space':
-                pipe.send((data == env.observation_space, True))
-            else:
-                raise RuntimeError('Received unknown command `{0}`. Must '
-                                   'be one of {`reset`, `step`, `seed`, `close`, '
-                                   '`_check_observation_space`}.'.format(command))
-    except (KeyboardInterrupt, Exception):
-        error_queue.put((index,) + sys.exc_info()[:2])
-        pipe.send((None, False))
-    finally:
-        env.close()

 def _flatten_obs(obs):
     assert isinstance(obs, (list, tuple))
     assert len(obs) > 0
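
Note: _worker now lives above DmpAsyncVectorEnv so it can serve as the default worker, and its 'rollout' branch only reports (rewards, infos) per sampled parameter vector; observations and done flags are no longer shipped back through the pipe. A sketch of how this is driven from the outside, mirroring the example scripts in this commit (make_env as defined there; the exact constructor call is not shown in this diff):

    from alr_envs.utils.dmp_async_vec_env import DmpAsyncVectorEnv
    import numpy as np

    n_samples = 4
    env = DmpAsyncVectorEnv([make_env(rank, seed=0) for rank in range(4)],
                            n_samples=n_samples)
    params = 50 * np.random.randn(n_samples, 25)   # one DMP parameter vector per rollout
    out = env(params)                              # workers answer 'rollout' with (rewards, infos) only
    print(out)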

View File

@@ -69,15 +69,11 @@ class DmpEnvWrapper(gym.Wrapper):
     def __call__(self, params, contexts=None):
         params = np.atleast_2d(params)
-        observations = []
         rewards = []
-        dones = []
         infos = []
         for p, c in zip(params, contexts):
-            observation, reward, done, info = self.rollout(p, c)
-            observations.append(observation)
+            reward, info = self.rollout(p, c)
             rewards.append(reward)
-            dones.append(done)
             infos.append(info)

         return np.array(rewards), infos
@@ -116,9 +112,8 @@ class DmpEnvWrapper(gym.Wrapper):
         rews = []
         infos = []

+        self.env.configure(context)
         self.env.reset()
-        if context is not None:
-            self.env.configure(context)

         for t, pos_vel in enumerate(zip(trajectory, velocity)):
             ac = self.policy.get_action(pos_vel[0], pos_vel[1])
@@ -132,4 +127,4 @@ class DmpEnvWrapper(gym.Wrapper):
         reward = np.sum(rews)

-        return obs, reward, done, info
+        return reward, info
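
Note: both __call__ and rollout now return only rewards and infos, and configure(context) is called unconditionally before reset(), matching the clarified base-class contract. Sketch of the new return contract for callers (wrapped_env is assumed to be a DmpEnvWrapper instance constructed as in the example scripts):

    params = np.atleast_2d(np.random.randn(4, 25))            # one parameter vector per rollout, shape assumed
    rewards, infos = wrapped_env(params, contexts=[None] * len(params))
    # rewards: one scalar return per parameter vector; observations and dones are no longer returned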

View File

@@ -1,4 +1,4 @@
-from alr_envs.utils.dmp_env_wrapper import DmpEnvWrapperVel
+from alr_envs.utils.dmp_env_wrapper import DmpEnvWrapper
 from alr_envs.utils.dmp_async_vec_env import DmpAsyncVectorEnv, _worker
 from alr_envs.classic_control.hole_reacher import HoleReacher
 import numpy as np
@@ -16,21 +16,25 @@ if __name__ == "__main__":
         :param rank: (int) index of the subprocess
         """
         def _init():
-            env = HoleReacher(num_links=5,
-                              allow_self_collision=False,
-                              allow_wall_collision=False,
-                              hole_width=0.15,
-                              hole_depth=1,
-                              hole_x=1)
+            _env = HoleReacher(num_links=5,
+                               allow_self_collision=False,
+                               allow_wall_collision=False,
+                               hole_width=0.15,
+                               hole_depth=1,
+                               hole_x=1)

-            env = DmpEnvWrapperVel(env,
-                                   num_dof=5,
-                                   num_basis=5,
-                                   duration=2,
-                                   dt=env._dt,
-                                   learn_goal=True)
-            env.seed(seed + rank)
-            return env
+            _env = DmpEnvWrapper(_env,
+                                 num_dof=5,
+                                 num_basis=5,
+                                 duration=2,
+                                 dt=_env.dt,
+                                 learn_goal=True,
+                                 alpha_phase=2,
+                                 start_pos=_env.start_pos,
+                                 policy_type="velocity"
+                                 )
+            _env.seed(seed + rank)
+            return _env
         return _init

     n_samples = 4
@@ -45,6 +49,6 @@ if __name__ == "__main__":
     params = np.hstack([50 * np.random.randn(n_samples, 25), np.tile(np.array([np.pi/2, -np.pi/4, -np.pi/4, -np.pi/4, -np.pi/4]), [n_samples, 1])])

     # env.reset()
-    out = env.rollout(params)
+    out = env(params)
     print(out)

View File

@@ -1,7 +1,6 @@
 from alr_envs.utils.dmp_env_wrapper import DmpEnvWrapper
 from alr_envs.utils.dmp_async_vec_env import DmpAsyncVectorEnv, _worker
 from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_simple import ALRBallInACupEnv
-from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward import BallInACupReward
 import numpy as np
@@ -17,7 +16,7 @@ if __name__ == "__main__":
         :param rank: (int) index of the subprocess
         """
         def _init():
-            _env = ALRBallInACupEnv(BallInACupReward)
+            _env = ALRBallInACupEnv()

             _env = DmpEnvWrapper(_env,
                                  num_dof=3,
@@ -29,7 +28,6 @@ if __name__ == "__main__":
                                  start_pos=_env.start_pos[1::2],
                                  final_pos=_env.start_pos[1::2],
                                  policy_type="motor"
-                                 # contextual=True
                                  )
             _env.seed(seed + rank)
             return _env