updates
This commit is contained in:
parent
c81378b9e7
commit
13a292f0e0
@ -37,7 +37,7 @@ class HoleReacher(gym.Env):
|
||||
self.start_vel = np.zeros(self.num_links)
|
||||
self.weight_matrix_scale = 50 # for the holereacher, the dmp weights become quite large compared to the values of the goal attractor. this scaling is to ensure they are on similar scale for the optimizer
|
||||
|
||||
self._dt = 0.01
|
||||
self.dt = 0.01
|
||||
self.time_limit = 2
|
||||
|
||||
action_bound = np.pi * np.ones((self.num_links,))
|
||||
@ -82,9 +82,9 @@ class HoleReacher(gym.Env):
|
||||
a single step with an action in joint velocity space
|
||||
"""
|
||||
vel = action
|
||||
acc = (vel - self._angle_velocity) / self._dt
|
||||
acc = (vel - self._angle_velocity) / self.dt
|
||||
self._angle_velocity = vel
|
||||
self._joint_angles = self._joint_angles + self._dt * self._angle_velocity
|
||||
self._joint_angles = self._joint_angles + self.dt * self._angle_velocity
|
||||
|
||||
self._update_joints()
|
||||
|
||||
@ -113,7 +113,7 @@ class HoleReacher(gym.Env):
|
||||
|
||||
self._steps += 1
|
||||
|
||||
done = self._steps * self._dt > self.time_limit or self._is_collided
|
||||
done = self._steps * self.dt > self.time_limit or self._is_collided
|
||||
|
||||
return self._get_obs().copy(), reward, done, info
|
||||
|
||||
|
@ -26,7 +26,7 @@ def make_env(rank, seed=0):
|
||||
num_dof=5,
|
||||
num_basis=5,
|
||||
duration=2,
|
||||
dt=env._dt,
|
||||
dt=env.dt,
|
||||
learn_goal=True)
|
||||
env.seed(seed + rank)
|
||||
return env
|
||||
|
@ -73,7 +73,7 @@ class AlrMujocoEnv(gym.Env):
|
||||
# observation, _reward, done, _info = self.step(action)
|
||||
# assert not done
|
||||
|
||||
observation = self.reset()
|
||||
observation = self._get_obs() # TODO: is calling get_obs enough? should we call reset, or even step?
|
||||
|
||||
self._set_observation_space(observation)
|
||||
|
||||
@ -82,14 +82,14 @@ class AlrMujocoEnv(gym.Env):
|
||||
@property
|
||||
def current_pos(self):
|
||||
"""
|
||||
By default returns the joint positions of all simulated objects. May be overriden in subclass.
|
||||
By default returns the joint positions of all simulated objects. May be overridden in subclass.
|
||||
"""
|
||||
return self.sim.data.qpos
|
||||
|
||||
@property
|
||||
def current_vel(self):
|
||||
"""
|
||||
By default returns the joint velocities of all simulated objects. May be overriden in subclass.
|
||||
By default returns the joint velocities of all simulated objects. May be overridden in subclass.
|
||||
"""
|
||||
return self.sim.data.qvel
|
||||
|
||||
@ -125,10 +125,15 @@ class AlrMujocoEnv(gym.Env):
|
||||
# methods to override:
|
||||
# ----------------------------
|
||||
|
||||
def _get_obs(self):
|
||||
"""Returns the observation.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def configure(self, *args, **kwargs):
|
||||
"""
|
||||
Helper method to set certain environment properties such as contexts in contextual environments since reset()
|
||||
doesn't take arguments. Should be called before/after reset(). TODO: before or after?
|
||||
doesn't take arguments. Should be called before reset().
|
||||
"""
|
||||
pass
|
||||
|
||||
|
@ -1,12 +1,12 @@
|
||||
from gym.envs.mujoco import mujoco_env
|
||||
from gym import utils
|
||||
import os
|
||||
import numpy as np
|
||||
from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward import BallInACupReward
|
||||
from alr_envs.mujoco import alr_mujoco_env
|
||||
from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward_simple import BallInACupReward
|
||||
import mujoco_py
|
||||
|
||||
|
||||
class ALRBallInACupEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||
class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
|
||||
def __init__(self, ):
|
||||
self._steps = 0
|
||||
|
||||
@ -21,8 +21,12 @@ class ALRBallInACupEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||
self._q_pos = []
|
||||
|
||||
utils.EzPickle.__init__(self)
|
||||
mujoco_env.MujocoEnv.__init__(self, os.path.join(os.path.dirname(__file__), "assets", "ball-in-a-cup_base.xml"),
|
||||
frame_skip=4)
|
||||
alr_mujoco_env.AlrMujocoEnv.__init__(self, os.path.join(os.path.dirname(__file__), "assets", "ball-in-a-cup_base.xml"),
|
||||
n_substeps=4)
|
||||
|
||||
def configure(self, context):
|
||||
self.context = context
|
||||
self.reward_function.reset(context)
|
||||
|
||||
def reset_model(self):
|
||||
start_pos = self.init_qpos.copy()
|
||||
@ -30,24 +34,8 @@ class ALRBallInACupEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||
start_vel = np.zeros_like(start_pos)
|
||||
self.set_state(start_pos, start_vel)
|
||||
self._steps = 0
|
||||
self.reward_function.reset()
|
||||
self._q_pos = []
|
||||
|
||||
def do_simulation(self, ctrl, n_frames):
|
||||
self.sim.data.ctrl[:] = ctrl
|
||||
for _ in range(n_frames):
|
||||
try:
|
||||
self.sim.step()
|
||||
except mujoco_py.builder.MujocoException as e:
|
||||
# print("Error in simulation: " + str(e))
|
||||
# error = True
|
||||
# Copy the current torque as if it would have been applied until the end of the trajectory
|
||||
# for i in range(k + 1, sim_time):
|
||||
# torques.append(trq)
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def step(self, a):
|
||||
# Apply gravity compensation
|
||||
if not np.all(self.sim.data.qfrc_applied[:7] == self.sim.data.qfrc_bias[:7]):
|
||||
@ -98,6 +86,7 @@ class ALRBallInACupEnv(mujoco_env.MujocoEnv, utils.EzPickle):
|
||||
|
||||
if __name__ == "__main__":
|
||||
env = ALRBallInACupEnv()
|
||||
env.configure(None)
|
||||
env.reset()
|
||||
for i in range(2000):
|
||||
# objective.load_result("/tmp/cma")
|
||||
|
@ -26,9 +26,9 @@ class BallInACupReward(alr_reward_fct.AlrReward):
|
||||
self.dists_final = None
|
||||
self.costs = None
|
||||
|
||||
self.reset()
|
||||
self.reset(None)
|
||||
|
||||
def reset(self):
|
||||
def reset(self, context):
|
||||
self.ball_traj = np.zeros(shape=(self.sim_time, 3))
|
||||
self.dists = []
|
||||
self.dists_final = []
|
||||
@ -51,11 +51,12 @@ class BallInACupReward(alr_reward_fct.AlrReward):
|
||||
self.dists_final.append(np.linalg.norm(goal_final_pos - ball_pos))
|
||||
self.ball_traj[step, :] = ball_pos
|
||||
|
||||
if self.check_collision(sim):
|
||||
return -1000, False, True
|
||||
|
||||
action_cost = np.sum(np.square(action))
|
||||
|
||||
if self.check_collision(sim):
|
||||
reward = - 1e-5 * action_cost - 1000
|
||||
return reward, False, True
|
||||
|
||||
if step == self.sim_time - 1:
|
||||
min_dist = np.min(self.dists)
|
||||
dist_final = self.dists_final[-1]
|
@ -1,21 +1,16 @@
|
||||
from alr_envs.mujoco import alr_mujoco_env
|
||||
from gym import utils, spaces
|
||||
from gym import utils
|
||||
import os
|
||||
import numpy as np
|
||||
from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward import BallInACupReward
|
||||
|
||||
|
||||
class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
|
||||
def __init__(self, reward_function=None):
|
||||
def __init__(self, n_substeps=4, apply_gravity_comp=True, reward_function=None):
|
||||
self._steps = 0
|
||||
|
||||
self.xml_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets",
|
||||
"biac_base" + ".xml")
|
||||
|
||||
self.sim_time = 8 # seconds
|
||||
self.sim_steps = int(self.sim_time / (0.0005 * 4)) # circular dependency.. sim.dt <-> mujocoenv init <-> reward fct
|
||||
self.reward_function = reward_function(self.sim_steps)
|
||||
|
||||
self.start_pos = np.array([0.0, 0.58760536, 0.0, 1.36004913, 0.0, -0.32072943, -1.57])
|
||||
self.start_vel = np.zeros(7)
|
||||
|
||||
@ -34,8 +29,15 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
|
||||
utils.EzPickle.__init__(self)
|
||||
alr_mujoco_env.AlrMujocoEnv.__init__(self,
|
||||
self.xml_path,
|
||||
apply_gravity_comp=True,
|
||||
n_substeps=4)
|
||||
apply_gravity_comp=apply_gravity_comp,
|
||||
n_substeps=n_substeps)
|
||||
|
||||
self.sim_time = 8 # seconds
|
||||
self.sim_steps = int(self.sim_time / self.dt)
|
||||
if reward_function is None:
|
||||
from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward_simple import BallInACupReward
|
||||
reward_function = BallInACupReward
|
||||
self.reward_function = reward_function(self.sim_steps)
|
||||
|
||||
@property
|
||||
def current_pos(self):
|
||||
@ -47,6 +49,7 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
|
||||
|
||||
def configure(self, context):
|
||||
self.context = context
|
||||
self.reward_function.reset(context)
|
||||
|
||||
def reset_model(self):
|
||||
init_pos_all = self.init_qpos.copy()
|
||||
@ -56,7 +59,6 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
|
||||
goal_final_id = self.sim.model._site_name2id["cup_goal_final"]
|
||||
|
||||
self._steps = 0
|
||||
self.reward_function.reset()
|
||||
self._q_pos = []
|
||||
self._q_vel = []
|
||||
|
||||
@ -65,38 +67,6 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
|
||||
|
||||
self.set_state(start_pos, init_vel)
|
||||
|
||||
# Reset the system
|
||||
# self.sim.data.qpos[:] = init_pos_all
|
||||
# self.sim.data.qvel[:] = init_vel
|
||||
# self.sim.data.qpos[0:7] = init_pos_robot
|
||||
#
|
||||
# self.sim.step()
|
||||
#
|
||||
# self.sim.data.qpos[:] = init_pos_all
|
||||
# self.sim.data.qvel[:] = init_vel
|
||||
# self.sim.data.qpos[0:7] = init_pos_robot
|
||||
# self.sim.data.body_xpos[ball_id, :] = np.copy(self.sim.data.site_xpos[goal_final_id, :]) - np.array([0., 0., 0.329])
|
||||
#
|
||||
# # Stabilize the system around the initial position
|
||||
# for i in range(0, 500):
|
||||
# self.sim.data.qpos[7:] = 0.
|
||||
# self.sim.data.qvel[7:] = 0.
|
||||
# # self.sim.data.qpos[7] = -0.2
|
||||
# cur_pos = self.sim.data.qpos[0:7].copy()
|
||||
# cur_vel = self.sim.data.qvel[0:7].copy()
|
||||
# trq = self.p_gains * (init_pos_robot - cur_pos) + self.d_gains * (np.zeros_like(init_pos_robot) - cur_vel)
|
||||
# self.sim.data.qfrc_applied[0:7] = trq + self.sim.data.qfrc_bias[:7].copy()
|
||||
# self.sim.step()
|
||||
# self.render()
|
||||
#
|
||||
# for i in range(0, 500):
|
||||
# cur_pos = self.sim.data.qpos[0:7].copy()
|
||||
# cur_vel = self.sim.data.qvel[0:7].copy()
|
||||
# trq = self.p_gains * (init_pos_robot - cur_pos) + self.d_gains * (np.zeros_like(init_pos_robot) - cur_vel)
|
||||
# self.sim.data.qfrc_applied[0:7] = trq + self.sim.data.qfrc_bias[:7].copy()
|
||||
# self.sim.step()
|
||||
# self.render()
|
||||
|
||||
return self._get_obs()
|
||||
|
||||
def step(self, a):
|
||||
@ -154,7 +124,10 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward_simple import BallInACupReward
|
||||
|
||||
env = ALRBallInACupEnv(reward_function=BallInACupReward)
|
||||
env.configure(None)
|
||||
env.reset()
|
||||
env.render()
|
||||
for i in range(4000):
|
||||
|
@ -7,9 +7,54 @@ import multiprocessing as mp
|
||||
import sys
|
||||
|
||||
|
||||
def _worker(index, env_fn, pipe, parent_pipe, shared_memory, error_queue):
|
||||
assert shared_memory is None
|
||||
env = env_fn()
|
||||
parent_pipe.close()
|
||||
try:
|
||||
while True:
|
||||
command, data = pipe.recv()
|
||||
if command == 'reset':
|
||||
observation = env.reset()
|
||||
pipe.send((observation, True))
|
||||
elif command == 'step':
|
||||
observation, reward, done, info = env.step(data)
|
||||
if done:
|
||||
observation = env.reset()
|
||||
pipe.send(((observation, reward, done, info), True))
|
||||
elif command == 'rollout':
|
||||
rewards = []
|
||||
infos = []
|
||||
for p, c in zip(*data):
|
||||
reward, info = env.rollout(p, c)
|
||||
rewards.append(reward)
|
||||
infos.append(info)
|
||||
pipe.send(((rewards, infos), (True, ) * len(rewards)))
|
||||
elif command == 'seed':
|
||||
env.seed(data)
|
||||
pipe.send((None, True))
|
||||
elif command == 'close':
|
||||
env.close()
|
||||
pipe.send((None, True))
|
||||
break
|
||||
elif command == 'idle':
|
||||
pipe.send((None, True))
|
||||
elif command == '_check_observation_space':
|
||||
pipe.send((data == env.observation_space, True))
|
||||
else:
|
||||
raise RuntimeError('Received unknown command `{0}`. Must '
|
||||
'be one of {`reset`, `step`, `seed`, `close`, '
|
||||
'`_check_observation_space`}.'.format(command))
|
||||
except (KeyboardInterrupt, Exception):
|
||||
error_queue.put((index,) + sys.exc_info()[:2])
|
||||
pipe.send((None, False))
|
||||
finally:
|
||||
env.close()
|
||||
|
||||
|
||||
class DmpAsyncVectorEnv(gym.vector.AsyncVectorEnv):
|
||||
def __init__(self, env_fns, n_samples, observation_space=None, action_space=None,
|
||||
shared_memory=True, copy=True, context=None, daemon=True, worker=None):
|
||||
shared_memory=False, copy=True, context="spawn", daemon=True, worker=_worker):
|
||||
super(DmpAsyncVectorEnv, self).__init__(env_fns,
|
||||
observation_space=observation_space,
|
||||
action_space=action_space,
|
||||
@ -91,7 +136,7 @@ class DmpAsyncVectorEnv(gym.vector.AsyncVectorEnv):
|
||||
self._raise_if_errors(successes)
|
||||
self._state = AsyncState.DEFAULT
|
||||
|
||||
observations_list, rewards, dones, infos = [_flatten_list(r) for r in zip(*results)]
|
||||
rewards, infos = [_flatten_list(r) for r in zip(*results)]
|
||||
|
||||
# for now, we ignore the observations and only return the rewards
|
||||
|
||||
@ -109,55 +154,6 @@ class DmpAsyncVectorEnv(gym.vector.AsyncVectorEnv):
|
||||
return self.rollout_wait()
|
||||
|
||||
|
||||
def _worker(index, env_fn, pipe, parent_pipe, shared_memory, error_queue):
|
||||
assert shared_memory is None
|
||||
env = env_fn()
|
||||
parent_pipe.close()
|
||||
try:
|
||||
while True:
|
||||
command, data = pipe.recv()
|
||||
if command == 'reset':
|
||||
observation = env.reset()
|
||||
pipe.send((observation, True))
|
||||
elif command == 'step':
|
||||
observation, reward, done, info = env.step(data)
|
||||
if done:
|
||||
observation = env.reset()
|
||||
pipe.send(((observation, reward, done, info), True))
|
||||
elif command == 'rollout':
|
||||
observations = []
|
||||
rewards = []
|
||||
dones = []
|
||||
infos = []
|
||||
for p, c in zip(*data):
|
||||
observation, reward, done, info = env.rollout(p, c)
|
||||
observations.append(observation)
|
||||
rewards.append(reward)
|
||||
dones.append(done)
|
||||
infos.append(info)
|
||||
pipe.send(((observations, rewards, dones, infos), (True, ) * len(rewards)))
|
||||
elif command == 'seed':
|
||||
env.seed(data)
|
||||
pipe.send((None, True))
|
||||
elif command == 'close':
|
||||
env.close()
|
||||
pipe.send((None, True))
|
||||
break
|
||||
elif command == 'idle':
|
||||
pipe.send((None, True))
|
||||
elif command == '_check_observation_space':
|
||||
pipe.send((data == env.observation_space, True))
|
||||
else:
|
||||
raise RuntimeError('Received unknown command `{0}`. Must '
|
||||
'be one of {`reset`, `step`, `seed`, `close`, '
|
||||
'`_check_observation_space`}.'.format(command))
|
||||
except (KeyboardInterrupt, Exception):
|
||||
error_queue.put((index,) + sys.exc_info()[:2])
|
||||
pipe.send((None, False))
|
||||
finally:
|
||||
env.close()
|
||||
|
||||
|
||||
def _flatten_obs(obs):
|
||||
assert isinstance(obs, (list, tuple))
|
||||
assert len(obs) > 0
|
||||
|
@ -69,15 +69,11 @@ class DmpEnvWrapper(gym.Wrapper):
|
||||
|
||||
def __call__(self, params, contexts=None):
|
||||
params = np.atleast_2d(params)
|
||||
observations = []
|
||||
rewards = []
|
||||
dones = []
|
||||
infos = []
|
||||
for p, c in zip(params, contexts):
|
||||
observation, reward, done, info = self.rollout(p, c)
|
||||
observations.append(observation)
|
||||
reward, info = self.rollout(p, c)
|
||||
rewards.append(reward)
|
||||
dones.append(done)
|
||||
infos.append(info)
|
||||
|
||||
return np.array(rewards), infos
|
||||
@ -116,9 +112,8 @@ class DmpEnvWrapper(gym.Wrapper):
|
||||
rews = []
|
||||
infos = []
|
||||
|
||||
self.env.reset()
|
||||
if context is not None:
|
||||
self.env.configure(context)
|
||||
self.env.reset()
|
||||
|
||||
for t, pos_vel in enumerate(zip(trajectory, velocity)):
|
||||
ac = self.policy.get_action(pos_vel[0], pos_vel[1])
|
||||
@ -132,4 +127,4 @@ class DmpEnvWrapper(gym.Wrapper):
|
||||
|
||||
reward = np.sum(rews)
|
||||
|
||||
return obs, reward, done, info
|
||||
return reward, info
|
||||
|
@ -1,4 +1,4 @@
|
||||
from alr_envs.utils.dmp_env_wrapper import DmpEnvWrapperVel
|
||||
from alr_envs.utils.dmp_env_wrapper import DmpEnvWrapper
|
||||
from alr_envs.utils.dmp_async_vec_env import DmpAsyncVectorEnv, _worker
|
||||
from alr_envs.classic_control.hole_reacher import HoleReacher
|
||||
import numpy as np
|
||||
@ -16,21 +16,25 @@ if __name__ == "__main__":
|
||||
:param rank: (int) index of the subprocess
|
||||
"""
|
||||
def _init():
|
||||
env = HoleReacher(num_links=5,
|
||||
_env = HoleReacher(num_links=5,
|
||||
allow_self_collision=False,
|
||||
allow_wall_collision=False,
|
||||
hole_width=0.15,
|
||||
hole_depth=1,
|
||||
hole_x=1)
|
||||
|
||||
env = DmpEnvWrapperVel(env,
|
||||
_env = DmpEnvWrapper(_env,
|
||||
num_dof=5,
|
||||
num_basis=5,
|
||||
duration=2,
|
||||
dt=env._dt,
|
||||
learn_goal=True)
|
||||
env.seed(seed + rank)
|
||||
return env
|
||||
dt=_env.dt,
|
||||
learn_goal=True,
|
||||
alpha_phase=2,
|
||||
start_pos=_env.start_pos,
|
||||
policy_type="velocity"
|
||||
)
|
||||
_env.seed(seed + rank)
|
||||
return _env
|
||||
return _init
|
||||
|
||||
n_samples = 4
|
||||
@ -45,6 +49,6 @@ if __name__ == "__main__":
|
||||
params = np.hstack([50 * np.random.randn(n_samples, 25), np.tile(np.array([np.pi/2, -np.pi/4, -np.pi/4, -np.pi/4, -np.pi/4]), [n_samples, 1])])
|
||||
|
||||
# env.reset()
|
||||
out = env.rollout(params)
|
||||
out = env(params)
|
||||
|
||||
print(out)
|
||||
|
@ -1,7 +1,6 @@
|
||||
from alr_envs.utils.dmp_env_wrapper import DmpEnvWrapper
|
||||
from alr_envs.utils.dmp_async_vec_env import DmpAsyncVectorEnv, _worker
|
||||
from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_simple import ALRBallInACupEnv
|
||||
from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward import BallInACupReward
|
||||
import numpy as np
|
||||
|
||||
|
||||
@ -17,7 +16,7 @@ if __name__ == "__main__":
|
||||
:param rank: (int) index of the subprocess
|
||||
"""
|
||||
def _init():
|
||||
_env = ALRBallInACupEnv(BallInACupReward)
|
||||
_env = ALRBallInACupEnv()
|
||||
|
||||
_env = DmpEnvWrapper(_env,
|
||||
num_dof=3,
|
||||
@ -29,7 +28,6 @@ if __name__ == "__main__":
|
||||
start_pos=_env.start_pos[1::2],
|
||||
final_pos=_env.start_pos[1::2],
|
||||
policy_type="motor"
|
||||
# contextual=True
|
||||
)
|
||||
_env.seed(seed + rank)
|
||||
return _env
|
||||
|
Loading…
Reference in New Issue
Block a user