From 13a292f0e078df39280d9ff2d1a466f2e43f72eb Mon Sep 17 00:00:00 2001
From: Maximilian Huettenrauch
Date: Thu, 11 Feb 2021 12:32:32 +0100
Subject: [PATCH] updates

---
 alr_envs/classic_control/hole_reacher.py      |  8 +-
 alr_envs/classic_control/utils.py             |  2 +-
 alr_envs/mujoco/alr_mujoco_env.py             | 13 ++-
 .../mujoco/ball_in_a_cup/ball_in_a_cup.py     | 31 ++----
 ...ward.py => ball_in_a_cup_reward_simple.py} | 11 ++-
 .../ball_in_a_cup/ball_in_a_cup_simple.py     | 57 +++--------
 alr_envs/utils/dmp_async_vec_env.py           | 98 +++++++++----------
 alr_envs/utils/dmp_env_wrapper.py             | 11 +--
 dmp_env_wrapper_example.py                    | 36 ++++---
 dmp_pd_control_example.py                     |  4 +-
 10 files changed, 116 insertions(+), 155 deletions(-)
 rename alr_envs/mujoco/ball_in_a_cup/{ball_in_a_cup_reward.py => ball_in_a_cup_reward_simple.py} (96%)

diff --git a/alr_envs/classic_control/hole_reacher.py b/alr_envs/classic_control/hole_reacher.py
index 0db772a..eeba84d 100644
--- a/alr_envs/classic_control/hole_reacher.py
+++ b/alr_envs/classic_control/hole_reacher.py
@@ -37,7 +37,7 @@ class HoleReacher(gym.Env):
         self.start_vel = np.zeros(self.num_links)
         self.weight_matrix_scale = 50  # for the holereacher, the dmp weights become quite large compared to the values of the goal attractor. this scaling is to ensure they are on similar scale for the optimizer
 
-        self._dt = 0.01
+        self.dt = 0.01
         self.time_limit = 2
 
         action_bound = np.pi * np.ones((self.num_links,))
@@ -82,9 +82,9 @@ class HoleReacher(gym.Env):
         a single step with an action in joint velocity space
         """
         vel = action
-        acc = (vel - self._angle_velocity) / self._dt
+        acc = (vel - self._angle_velocity) / self.dt
         self._angle_velocity = vel
-        self._joint_angles = self._joint_angles + self._dt * self._angle_velocity
+        self._joint_angles = self._joint_angles + self.dt * self._angle_velocity
 
         self._update_joints()
 
@@ -113,7 +113,7 @@ class HoleReacher(gym.Env):
 
         self._steps += 1
 
-        done = self._steps * self._dt > self.time_limit or self._is_collided
+        done = self._steps * self.dt > self.time_limit or self._is_collided
 
         return self._get_obs().copy(), reward, done, info
 
diff --git a/alr_envs/classic_control/utils.py b/alr_envs/classic_control/utils.py
index 61156f1..b534eb9 100644
--- a/alr_envs/classic_control/utils.py
+++ b/alr_envs/classic_control/utils.py
@@ -26,7 +26,7 @@ def make_env(rank, seed=0):
                               num_dof=5,
                               num_basis=5,
                               duration=2,
-                              dt=env._dt,
+                              dt=env.dt,
                               learn_goal=True)
         env.seed(seed + rank)
         return env
diff --git a/alr_envs/mujoco/alr_mujoco_env.py b/alr_envs/mujoco/alr_mujoco_env.py
index c94beeb..c58dfe7 100644
--- a/alr_envs/mujoco/alr_mujoco_env.py
+++ b/alr_envs/mujoco/alr_mujoco_env.py
@@ -73,7 +73,7 @@ class AlrMujocoEnv(gym.Env):
 
         # observation, _reward, done, _info = self.step(action)
         # assert not done
-        observation = self.reset()
+        observation = self._get_obs()  # TODO: is calling get_obs enough? should we call reset, or even step?
 
         self._set_observation_space(observation)
 
@@ -82,14 +82,14 @@ class AlrMujocoEnv(gym.Env):
     @property
     def current_pos(self):
         """
-        By default returns the joint positions of all simulated objects. May be overriden in subclass.
+        By default returns the joint positions of all simulated objects. May be overridden in subclass.
         """
         return self.sim.data.qpos
 
     @property
     def current_vel(self):
         """
-        By default returns the joint velocities of all simulated objects. May be overriden in subclass.
+        By default returns the joint velocities of all simulated objects. May be overridden in subclass.
""" return self.sim.data.qvel @@ -125,10 +125,15 @@ class AlrMujocoEnv(gym.Env): # methods to override: # ---------------------------- + def _get_obs(self): + """Returns the observation. + """ + raise NotImplementedError() + def configure(self, *args, **kwargs): """ Helper method to set certain environment properties such as contexts in contextual environments since reset() - doesn't take arguments. Should be called before/after reset(). TODO: before or after? + doesn't take arguments. Should be called before reset(). """ pass diff --git a/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup.py b/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup.py index 093e7f1..ad2e52a 100644 --- a/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup.py +++ b/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup.py @@ -1,12 +1,12 @@ -from gym.envs.mujoco import mujoco_env from gym import utils import os import numpy as np -from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward import BallInACupReward +from alr_envs.mujoco import alr_mujoco_env +from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward_simple import BallInACupReward import mujoco_py -class ALRBallInACupEnv(mujoco_env.MujocoEnv, utils.EzPickle): +class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle): def __init__(self, ): self._steps = 0 @@ -21,8 +21,12 @@ class ALRBallInACupEnv(mujoco_env.MujocoEnv, utils.EzPickle): self._q_pos = [] utils.EzPickle.__init__(self) - mujoco_env.MujocoEnv.__init__(self, os.path.join(os.path.dirname(__file__), "assets", "ball-in-a-cup_base.xml"), - frame_skip=4) + alr_mujoco_env.AlrMujocoEnv.__init__(self, os.path.join(os.path.dirname(__file__), "assets", "ball-in-a-cup_base.xml"), + n_substeps=4) + + def configure(self, context): + self.context = context + self.reward_function.reset(context) def reset_model(self): start_pos = self.init_qpos.copy() @@ -30,24 +34,8 @@ class ALRBallInACupEnv(mujoco_env.MujocoEnv, utils.EzPickle): start_vel = np.zeros_like(start_pos) self.set_state(start_pos, start_vel) self._steps = 0 - self.reward_function.reset() self._q_pos = [] - def do_simulation(self, ctrl, n_frames): - self.sim.data.ctrl[:] = ctrl - for _ in range(n_frames): - try: - self.sim.step() - except mujoco_py.builder.MujocoException as e: - # print("Error in simulation: " + str(e)) - # error = True - # Copy the current torque as if it would have been applied until the end of the trajectory - # for i in range(k + 1, sim_time): - # torques.append(trq) - return True - - return False - def step(self, a): # Apply gravity compensation if not np.all(self.sim.data.qfrc_applied[:7] == self.sim.data.qfrc_bias[:7]): @@ -98,6 +86,7 @@ class ALRBallInACupEnv(mujoco_env.MujocoEnv, utils.EzPickle): if __name__ == "__main__": env = ALRBallInACupEnv() + env.configure(None) env.reset() for i in range(2000): # objective.load_result("/tmp/cma") diff --git a/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup_reward.py b/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup_reward_simple.py similarity index 96% rename from alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup_reward.py rename to alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup_reward_simple.py index 32310b3..22aa363 100644 --- a/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup_reward.py +++ b/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup_reward_simple.py @@ -26,9 +26,9 @@ class BallInACupReward(alr_reward_fct.AlrReward): self.dists_final = None self.costs = None - self.reset() + self.reset(None) - def reset(self): + def reset(self, context): self.ball_traj = np.zeros(shape=(self.sim_time, 3)) self.dists = [] self.dists_final = 
[] @@ -51,11 +51,12 @@ class BallInACupReward(alr_reward_fct.AlrReward): self.dists_final.append(np.linalg.norm(goal_final_pos - ball_pos)) self.ball_traj[step, :] = ball_pos - if self.check_collision(sim): - return -1000, False, True - action_cost = np.sum(np.square(action)) + if self.check_collision(sim): + reward = - 1e-5 * action_cost - 1000 + return reward, False, True + if step == self.sim_time - 1: min_dist = np.min(self.dists) dist_final = self.dists_final[-1] diff --git a/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup_simple.py b/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup_simple.py index 1a25211..efc702c 100644 --- a/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup_simple.py +++ b/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup_simple.py @@ -1,21 +1,16 @@ from alr_envs.mujoco import alr_mujoco_env -from gym import utils, spaces +from gym import utils import os import numpy as np -from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward import BallInACupReward class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle): - def __init__(self, reward_function=None): + def __init__(self, n_substeps=4, apply_gravity_comp=True, reward_function=None): self._steps = 0 self.xml_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "biac_base" + ".xml") - self.sim_time = 8 # seconds - self.sim_steps = int(self.sim_time / (0.0005 * 4)) # circular dependency.. sim.dt <-> mujocoenv init <-> reward fct - self.reward_function = reward_function(self.sim_steps) - self.start_pos = np.array([0.0, 0.58760536, 0.0, 1.36004913, 0.0, -0.32072943, -1.57]) self.start_vel = np.zeros(7) @@ -34,8 +29,15 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle): utils.EzPickle.__init__(self) alr_mujoco_env.AlrMujocoEnv.__init__(self, self.xml_path, - apply_gravity_comp=True, - n_substeps=4) + apply_gravity_comp=apply_gravity_comp, + n_substeps=n_substeps) + + self.sim_time = 8 # seconds + self.sim_steps = int(self.sim_time / self.dt) + if reward_function is None: + from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward_simple import BallInACupReward + reward_function = BallInACupReward + self.reward_function = reward_function(self.sim_steps) @property def current_pos(self): @@ -47,6 +49,7 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle): def configure(self, context): self.context = context + self.reward_function.reset(context) def reset_model(self): init_pos_all = self.init_qpos.copy() @@ -56,7 +59,6 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle): goal_final_id = self.sim.model._site_name2id["cup_goal_final"] self._steps = 0 - self.reward_function.reset() self._q_pos = [] self._q_vel = [] @@ -65,38 +67,6 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle): self.set_state(start_pos, init_vel) - # Reset the system - # self.sim.data.qpos[:] = init_pos_all - # self.sim.data.qvel[:] = init_vel - # self.sim.data.qpos[0:7] = init_pos_robot - # - # self.sim.step() - # - # self.sim.data.qpos[:] = init_pos_all - # self.sim.data.qvel[:] = init_vel - # self.sim.data.qpos[0:7] = init_pos_robot - # self.sim.data.body_xpos[ball_id, :] = np.copy(self.sim.data.site_xpos[goal_final_id, :]) - np.array([0., 0., 0.329]) - # - # # Stabilize the system around the initial position - # for i in range(0, 500): - # self.sim.data.qpos[7:] = 0. - # self.sim.data.qvel[7:] = 0. 
-        #     # self.sim.data.qpos[7] = -0.2
-        #     cur_pos = self.sim.data.qpos[0:7].copy()
-        #     cur_vel = self.sim.data.qvel[0:7].copy()
-        #     trq = self.p_gains * (init_pos_robot - cur_pos) + self.d_gains * (np.zeros_like(init_pos_robot) - cur_vel)
-        #     self.sim.data.qfrc_applied[0:7] = trq + self.sim.data.qfrc_bias[:7].copy()
-        #     self.sim.step()
-        #     self.render()
-        #
-        # for i in range(0, 500):
-        #     cur_pos = self.sim.data.qpos[0:7].copy()
-        #     cur_vel = self.sim.data.qvel[0:7].copy()
-        #     trq = self.p_gains * (init_pos_robot - cur_pos) + self.d_gains * (np.zeros_like(init_pos_robot) - cur_vel)
-        #     self.sim.data.qfrc_applied[0:7] = trq + self.sim.data.qfrc_bias[:7].copy()
-        #     self.sim.step()
-        #     self.render()
-
         return self._get_obs()
 
     def step(self, a):
@@ -154,7 +124,10 @@
 
 
 if __name__ == "__main__":
+    from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward_simple import BallInACupReward
+
     env = ALRBallInACupEnv(reward_function=BallInACupReward)
+    env.configure(None)
     env.reset()
     env.render()
     for i in range(4000):
diff --git a/alr_envs/utils/dmp_async_vec_env.py b/alr_envs/utils/dmp_async_vec_env.py
index 771b0d8..641e770 100644
--- a/alr_envs/utils/dmp_async_vec_env.py
+++ b/alr_envs/utils/dmp_async_vec_env.py
@@ -7,9 +7,54 @@ import multiprocessing as mp
 import sys
 
 
+def _worker(index, env_fn, pipe, parent_pipe, shared_memory, error_queue):
+    assert shared_memory is None
+    env = env_fn()
+    parent_pipe.close()
+    try:
+        while True:
+            command, data = pipe.recv()
+            if command == 'reset':
+                observation = env.reset()
+                pipe.send((observation, True))
+            elif command == 'step':
+                observation, reward, done, info = env.step(data)
+                if done:
+                    observation = env.reset()
+                pipe.send(((observation, reward, done, info), True))
+            elif command == 'rollout':
+                rewards = []
+                infos = []
+                for p, c in zip(*data):
+                    reward, info = env.rollout(p, c)
+                    rewards.append(reward)
+                    infos.append(info)
+                pipe.send(((rewards, infos), (True, ) * len(rewards)))
+            elif command == 'seed':
+                env.seed(data)
+                pipe.send((None, True))
+            elif command == 'close':
+                env.close()
+                pipe.send((None, True))
+                break
+            elif command == 'idle':
+                pipe.send((None, True))
+            elif command == '_check_observation_space':
+                pipe.send((data == env.observation_space, True))
+            else:
+                raise RuntimeError('Received unknown command `{0}`. Must '
+                                   'be one of {`reset`, `step`, `seed`, `close`, '
+                                   '`_check_observation_space`}.'.format(command))
+    except (KeyboardInterrupt, Exception):
+        error_queue.put((index,) + sys.exc_info()[:2])
+        pipe.send((None, False))
+    finally:
+        env.close()
+
+
 class DmpAsyncVectorEnv(gym.vector.AsyncVectorEnv):
     def __init__(self, env_fns, n_samples, observation_space=None, action_space=None,
-                 shared_memory=True, copy=True, context=None, daemon=True, worker=None):
+                 shared_memory=False, copy=True, context="spawn", daemon=True, worker=_worker):
         super(DmpAsyncVectorEnv, self).__init__(env_fns,
                                                 observation_space=observation_space,
                                                 action_space=action_space,
@@ -91,7 +136,7 @@ class DmpAsyncVectorEnv(gym.vector.AsyncVectorEnv):
         self._raise_if_errors(successes)
         self._state = AsyncState.DEFAULT
 
-        observations_list, rewards, dones, infos = [_flatten_list(r) for r in zip(*results)]
+        rewards, infos = [_flatten_list(r) for r in zip(*results)]
 
         # for now, we ignore the observations and only return the rewards
 
@@ -109,55 +154,6 @@ class DmpAsyncVectorEnv(gym.vector.AsyncVectorEnv):
         return self.rollout_wait()
 
 
-def _worker(index, env_fn, pipe, parent_pipe, shared_memory, error_queue):
-    assert shared_memory is None
-    env = env_fn()
-    parent_pipe.close()
-    try:
-        while True:
-            command, data = pipe.recv()
-            if command == 'reset':
-                observation = env.reset()
-                pipe.send((observation, True))
-            elif command == 'step':
-                observation, reward, done, info = env.step(data)
-                if done:
-                    observation = env.reset()
-                pipe.send(((observation, reward, done, info), True))
-            elif command == 'rollout':
-                observations = []
-                rewards = []
-                dones = []
-                infos = []
-                for p, c in zip(*data):
-                    observation, reward, done, info = env.rollout(p, c)
-                    observations.append(observation)
-                    rewards.append(reward)
-                    dones.append(done)
-                    infos.append(info)
-                pipe.send(((observations, rewards, dones, infos), (True, ) * len(rewards)))
-            elif command == 'seed':
-                env.seed(data)
-                pipe.send((None, True))
-            elif command == 'close':
-                env.close()
-                pipe.send((None, True))
-                break
-            elif command == 'idle':
-                pipe.send((None, True))
-            elif command == '_check_observation_space':
-                pipe.send((data == env.observation_space, True))
-            else:
-                raise RuntimeError('Received unknown command `{0}`. Must '
-                                   'be one of {`reset`, `step`, `seed`, `close`, '
-                                   '`_check_observation_space`}.'.format(command))
-    except (KeyboardInterrupt, Exception):
-        error_queue.put((index,) + sys.exc_info()[:2])
-        pipe.send((None, False))
-    finally:
-        env.close()
-
-
 def _flatten_obs(obs):
     assert isinstance(obs, (list, tuple))
     assert len(obs) > 0
diff --git a/alr_envs/utils/dmp_env_wrapper.py b/alr_envs/utils/dmp_env_wrapper.py
index 3b461f7..3edc69f 100644
--- a/alr_envs/utils/dmp_env_wrapper.py
+++ b/alr_envs/utils/dmp_env_wrapper.py
@@ -69,15 +69,11 @@ class DmpEnvWrapper(gym.Wrapper):
 
     def __call__(self, params, contexts=None):
         params = np.atleast_2d(params)
-        observations = []
         rewards = []
-        dones = []
         infos = []
         for p, c in zip(params, contexts):
-            observation, reward, done, info = self.rollout(p, c)
-            observations.append(observation)
+            reward, info = self.rollout(p, c)
             rewards.append(reward)
-            dones.append(done)
             infos.append(info)
 
         return np.array(rewards), infos
@@ -116,9 +112,8 @@ class DmpEnvWrapper(gym.Wrapper):
         rews = []
         infos = []
 
+        self.env.configure(context)
         self.env.reset()
-        if context is not None:
-            self.env.configure(context)
 
         for t, pos_vel in enumerate(zip(trajectory, velocity)):
             ac = self.policy.get_action(pos_vel[0], pos_vel[1])
@@ -132,4 +127,4 @@ class DmpEnvWrapper(gym.Wrapper):
 
         reward = np.sum(rews)
 
-        return obs, reward, done, info
+        return reward, info
diff --git a/dmp_env_wrapper_example.py b/dmp_env_wrapper_example.py
index fcde324..6ed73e7 100644
--- a/dmp_env_wrapper_example.py
+++ b/dmp_env_wrapper_example.py
@@ -1,4 +1,4 @@
-from alr_envs.utils.dmp_env_wrapper import DmpEnvWrapperVel
+from alr_envs.utils.dmp_env_wrapper import DmpEnvWrapper
 from alr_envs.utils.dmp_async_vec_env import DmpAsyncVectorEnv, _worker
 from alr_envs.classic_control.hole_reacher import HoleReacher
 import numpy as np
@@ -16,21 +16,25 @@ if __name__ == "__main__":
         :param rank: (int) index of the subprocess
         """
         def _init():
-            env = HoleReacher(num_links=5,
-                              allow_self_collision=False,
-                              allow_wall_collision=False,
-                              hole_width=0.15,
-                              hole_depth=1,
-                              hole_x=1)
+            _env = HoleReacher(num_links=5,
+                               allow_self_collision=False,
+                               allow_wall_collision=False,
+                               hole_width=0.15,
+                               hole_depth=1,
+                               hole_x=1)
 
-            env = DmpEnvWrapperVel(env,
-                                   num_dof=5,
-                                   num_basis=5,
-                                   duration=2,
-                                   dt=env._dt,
-                                   learn_goal=True)
-            env.seed(seed + rank)
-            return env
+            _env = DmpEnvWrapper(_env,
+                                 num_dof=5,
+                                 num_basis=5,
+                                 duration=2,
+                                 dt=_env.dt,
+                                 learn_goal=True,
+                                 alpha_phase=2,
+                                 start_pos=_env.start_pos,
+                                 policy_type="velocity"
+                                 )
+            _env.seed(seed + rank)
+            return _env
         return _init
 
     n_samples = 4
@@ -45,6 +49,6 @@ if __name__ == "__main__":
     params = np.hstack([50 * np.random.randn(n_samples, 25),
                         np.tile(np.array([np.pi/2, -np.pi/4, -np.pi/4, -np.pi/4, -np.pi/4]), [n_samples, 1])])
 
     # env.reset()
-    out = env.rollout(params)
+    out = env(params)
     print(out)
diff --git a/dmp_pd_control_example.py b/dmp_pd_control_example.py
index 9c4bec0..023080a 100644
--- a/dmp_pd_control_example.py
+++ b/dmp_pd_control_example.py
@@ -1,7 +1,6 @@
 from alr_envs.utils.dmp_env_wrapper import DmpEnvWrapper
 from alr_envs.utils.dmp_async_vec_env import DmpAsyncVectorEnv, _worker
 from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_simple import ALRBallInACupEnv
-from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward import BallInACupReward
 import numpy as np
 
 
@@ -17,7 +16,7 @@ if __name__ == "__main__":
         :param rank: (int) index of the subprocess
         """
        def _init():
-            _env = ALRBallInACupEnv(BallInACupReward)
+            _env = ALRBallInACupEnv()
 
             _env = DmpEnvWrapper(_env,
                                  num_dof=3,
@@ -29,7 +28,6 @@ if __name__ == "__main__":
                                  start_pos=_env.start_pos[1::2],
                                  final_pos=_env.start_pos[1::2],
                                  policy_type="motor"
-                                 # contextual=True
                                  )
             _env.seed(seed + rank)
             return _env