From c81378b9e754927f871f71e81e0565a8f5026e2c Mon Sep 17 00:00:00 2001 From: Maximilian Huettenrauch Date: Thu, 11 Feb 2021 10:49:57 +0100 Subject: [PATCH] support for contexts, policy classes, pd controller example, breaking changes etc --- alr_envs/mujoco/alr_mujoco_env.py | 247 ++++++++++++ alr_envs/mujoco/alr_reward_fct.py | 21 + .../mujoco/ball_in_a_cup/assets/biac_base.xml | 360 ++++++++++++++++++ .../ball_in_a_cup/ball_in_a_cup_reward.py | 44 +-- .../ball_in_a_cup/ball_in_a_cup_simple.py | 155 +++----- alr_envs/utils/dmp_async_vec_env.py | 32 +- alr_envs/utils/dmp_env_wrapper.py | 110 ++---- alr_envs/utils/policies.py | 40 +- dmp_pd_control_example.py | 60 +-- 9 files changed, 807 insertions(+), 262 deletions(-) create mode 100644 alr_envs/mujoco/alr_mujoco_env.py create mode 100644 alr_envs/mujoco/alr_reward_fct.py create mode 100644 alr_envs/mujoco/ball_in_a_cup/assets/biac_base.xml diff --git a/alr_envs/mujoco/alr_mujoco_env.py b/alr_envs/mujoco/alr_mujoco_env.py new file mode 100644 index 0000000..c94beeb --- /dev/null +++ b/alr_envs/mujoco/alr_mujoco_env.py @@ -0,0 +1,247 @@ +from collections import OrderedDict +import os + + +from gym import error, spaces +from gym.utils import seeding +import numpy as np +from os import path +import gym + +try: + import mujoco_py +except ImportError as e: + raise error.DependencyNotInstalled("{}. (HINT: you need to install mujoco_py, and also perform the setup instructions here: https://github.com/openai/mujoco-py/.)".format(e)) + +DEFAULT_SIZE = 500 + + +def convert_observation_to_space(observation): + if isinstance(observation, dict): + space = spaces.Dict(OrderedDict([ + (key, convert_observation_to_space(value)) + for key, value in observation.items() + ])) + elif isinstance(observation, np.ndarray): + low = np.full(observation.shape, -float('inf'), dtype=np.float32) + high = np.full(observation.shape, float('inf'), dtype=np.float32) + space = spaces.Box(low, high, dtype=observation.dtype) + else: + raise NotImplementedError(type(observation), observation) + + return space + + +class AlrMujocoEnv(gym.Env): + """ + Superclass for all MuJoCo environments. + """ + + def __init__(self, model_path, n_substeps, apply_gravity_comp=True): + """ + + Args: + model_path: path to xml file + n_substeps: how many steps mujoco does per call to env.step + use_servo: use actuator defined in xml, use False for direct torque control + """ + if model_path.startswith("/"): + fullpath = model_path + else: + fullpath = os.path.join(os.path.dirname(__file__), "assets", model_path) + if not path.exists(fullpath): + raise IOError("File %s does not exist" % fullpath) + self.n_substeps = n_substeps + self.apply_gravity_comp = apply_gravity_comp + self.model = mujoco_py.load_model_from_path(fullpath) + self.sim = mujoco_py.MjSim(self.model, nsubsteps=n_substeps) + self.data = self.sim.data + self.viewer = None + self._viewers = {} + + self.metadata = { + 'render.modes': ['human', 'rgb_array', 'depth_array'], + 'video.frames_per_second': int(np.round(1.0 / self.dt)) + } + + self.init_qpos = self.sim.data.qpos.ravel().copy() + self.init_qvel = self.sim.data.qvel.ravel().copy() + + self._set_action_space() + + # action = self.action_space.sample() + # observation, _reward, done, _info = self.step(action) + # assert not done + + observation = self.reset() + + self._set_observation_space(observation) + + self.seed() + + @property + def current_pos(self): + """ + By default returns the joint positions of all simulated objects. May be overriden in subclass. 
+ """ + return self.sim.data.qpos + + @property + def current_vel(self): + """ + By default returns the joint velocities of all simulated objects. May be overriden in subclass. + """ + return self.sim.data.qvel + + def extend_des_pos(self, des_pos): + """ + In a simplified environment, the actions may only control a subset of all the joints. + Extend the trajectory to match the environments full action space + Args: + des_pos: + + Returns: + + """ + pass + + def extend_des_vel(self, des_vel): + pass + + def _set_action_space(self): + bounds = self.model.actuator_ctrlrange.copy().astype(np.float32) + low, high = bounds.T + self.action_space = spaces.Box(low=low, high=high, dtype=np.float32) + return self.action_space + + def _set_observation_space(self, observation): + self.observation_space = convert_observation_to_space(observation) + return self.observation_space + + def seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + return [seed] + + # methods to override: + # ---------------------------- + + def configure(self, *args, **kwargs): + """ + Helper method to set certain environment properties such as contexts in contextual environments since reset() + doesn't take arguments. Should be called before/after reset(). TODO: before or after? + """ + pass + + def reset_model(self): + """ + Reset the robot degrees of freedom (qpos and qvel). + Implement this in each subclass. + """ + raise NotImplementedError + + def viewer_setup(self): + """ + This method is called when the viewer is initialized. + Optionally implement this method, if you need to tinker with camera position + and so forth. + """ + pass + + # ----------------------------- + + def reset(self): + self.sim.reset() + ob = self.reset_model() + return ob + + def set_state(self, qpos, qvel): + assert qpos.shape == (self.model.nq,) and qvel.shape == (self.model.nv,) + old_state = self.sim.get_state() + new_state = mujoco_py.MjSimState(old_state.time, qpos, qvel, + old_state.act, old_state.udd_state) + self.sim.set_state(new_state) + self.sim.forward() + + @property + def dt(self): + return self.model.opt.timestep * self.n_substeps + + def do_simulation(self, ctrl): + """ + Additionally returns whether there was an error while stepping the simulation + """ + error_in_sim = False + num_actuations = len(ctrl) + if self.apply_gravity_comp: + self.sim.data.ctrl[:num_actuations] = ctrl + self.sim.data.qfrc_bias[:num_actuations].copy() / self.model.actuator_gear[:, 0] + else: + self.sim.data.ctrl[:num_actuations] = ctrl + + try: + self.sim.step() + except mujoco_py.builder.MujocoException as e: + error_in_sim = True + + return error_in_sim + + def render(self, + mode='human', + width=DEFAULT_SIZE, + height=DEFAULT_SIZE, + camera_id=None, + camera_name=None): + if mode == 'rgb_array' or mode == 'depth_array': + if camera_id is not None and camera_name is not None: + raise ValueError("Both `camera_id` and `camera_name` cannot be" + " specified at the same time.") + + no_camera_specified = camera_name is None and camera_id is None + if no_camera_specified: + camera_name = 'track' + + if camera_id is None and camera_name in self.model._camera_name2id: + camera_id = self.model.camera_name2id(camera_name) + + self._get_viewer(mode).render(width, height, camera_id=camera_id) + + if mode == 'rgb_array': + # window size used for old mujoco-py: + data = self._get_viewer(mode).read_pixels(width, height, depth=False) + # original image is upside-down, so flip it + return data[::-1, :, :] + elif mode == 'depth_array': + 
self._get_viewer(mode).render(width, height) + # window size used for old mujoco-py: + # Extract depth part of the read_pixels() tuple + data = self._get_viewer(mode).read_pixels(width, height, depth=True)[1] + # original image is upside-down, so flip it + return data[::-1, :] + elif mode == 'human': + self._get_viewer(mode).render() + + def close(self): + if self.viewer is not None: + # self.viewer.finish() + self.viewer = None + self._viewers = {} + + def _get_viewer(self, mode): + self.viewer = self._viewers.get(mode) + if self.viewer is None: + if mode == 'human': + self.viewer = mujoco_py.MjViewer(self.sim) + elif mode == 'rgb_array' or mode == 'depth_array': + self.viewer = mujoco_py.MjRenderContextOffscreen(self.sim, -1) + + self.viewer_setup() + self._viewers[mode] = self.viewer + return self.viewer + + def get_body_com(self, body_name): + return self.data.get_body_xpos(body_name) + + def state_vector(self): + return np.concatenate([ + self.sim.data.qpos.flat, + self.sim.data.qvel.flat + ]) diff --git a/alr_envs/mujoco/alr_reward_fct.py b/alr_envs/mujoco/alr_reward_fct.py new file mode 100644 index 0000000..c96dd07 --- /dev/null +++ b/alr_envs/mujoco/alr_reward_fct.py @@ -0,0 +1,21 @@ +class AlrReward: + """ + A base class for non-Markovian reward functions which may need trajectory information to calculate an episodic + reward. Call the methods in reset() and step() of the environment. + """ + + # methods to override: + # ---------------------------- + def reset(self, *args, **kwargs): + """ + Reset the reward function, empty state buffers before an episode, set contexts that influence reward, etc. + """ + raise NotImplementedError + + def compute_reward(self, *args, **kwargs): + """ + + Returns: Useful things to return are reward values, success flags or crash flags + + """ + raise NotImplementedError diff --git a/alr_envs/mujoco/ball_in_a_cup/assets/biac_base.xml b/alr_envs/mujoco/ball_in_a_cup/assets/biac_base.xml new file mode 100644 index 0000000..c8c2643 --- /dev/null +++ b/alr_envs/mujoco/ball_in_a_cup/assets/biac_base.xml @@ -0,0 +1,360 @@ + + + diff --git a/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup_reward.py b/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup_reward.py index 69932db..32310b3 100644 --- a/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup_reward.py +++ b/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup_reward.py @@ -1,7 +1,8 @@ import numpy as np +from alr_envs.mujoco import alr_reward_fct -class BallInACupReward: +class BallInACupReward(alr_reward_fct.AlrReward): def __init__(self, sim_time): self.sim_time = sim_time @@ -14,7 +15,6 @@ class BallInACupReward: "forearm_link_convex_decomposition_p1_geom", "forearm_link_convex_decomposition_p2_geom"] - self.ctxt_id = None self.ball_id = None self.ball_collision_id = None self.goal_id = None @@ -34,8 +34,7 @@ class BallInACupReward: self.dists_final = [] self.costs = [] - def compute_reward(self, action, sim, step): - self.ctxt_id = sim.model._site_name2id['context_point'] + def compute_reward(self, action, sim, step, context=None): self.ball_id = sim.model._body_name2id["ball"] self.ball_collision_id = sim.model._geom_name2id["ball_geom"] self.goal_id = sim.model._site_name2id["cup_goal"] @@ -50,59 +49,26 @@ class BallInACupReward: goal_final_pos = sim.data.site_xpos[self.goal_final_id] self.dists.append(np.linalg.norm(goal_pos - ball_pos)) self.dists_final.append(np.linalg.norm(goal_final_pos - ball_pos)) - # dists_ctxt.append(np.linalg.norm(ball_pos - ctxt)) self.ball_traj[step, :] = ball_pos if self.check_collision(sim): 
return -1000, False, True - # self._get_cost(ball_pos, goal_pos, goal_final_pos, action, - # sim.data.get_site_xpos('context_point').copy(), step) - - # min_dist = np.min(self.dists) - # dist_final = self.dists_final[-1] action_cost = np.sum(np.square(action)) - # cost = self.get_stage_wise_cost(ball_in_cup, min_dist, self.dists_final[-1]) # , self.dists_ctxt[-1]) if step == self.sim_time - 1: min_dist = np.min(self.dists) dist_final = self.dists_final[-1] cost = 0.5 * min_dist + 0.5 * dist_final - # cost = 3 + 2 * (0.5 * min_dist ** 2 + 0.5 * dist_final ** 2) - reward = np.exp(-2 * min_dist) - 1e-5 * action_cost - success = dist_final < 0.05 and min_dist < 0.05 + reward = np.exp(-2 * cost) - 1e-5 * action_cost + success = dist_final < 0.05 and ball_in_cup else: - cost = 0 reward = - 1e-5 * action_cost success = False - # action_cost = np.mean(np.sum(np.square(torques), axis=1), axis=0) return reward, success, False - def get_stage_wise_cost(self, ball_in_cup, min_dist, dist_final): #, dist_to_ctxt): - # stop_sim = False - cost = 3 + 2 * (0.5 * min_dist ** 2 + 0.5 * dist_final ** 2) - # if not ball_in_cup: - # # cost = 3 + 2*(0.5 * min_dist + 0.5 * dist_final) - # cost = 3 + 2*(0.5 * min_dist**2 + 0.5 * dist_final**2) - # else: - # # cost = 2*dist_to_ctxt - # cost = 2*dist_to_ctxt**2 - # stop_sim = True - # # print(dist_to_ctxt-0.02) - # print('Context Distance:', dist_to_ctxt) - return cost - - def _get_cost(self, ball_pos, goal_pos, goal_pos_final, u, ctxt, t): - - cost = 0 - if t == self.sim_time*0.8: - dist = 0.5*np.linalg.norm(goal_pos-ball_pos)**2 + 0.5*np.linalg.norm(goal_pos_final-ball_pos)**2 - # dist_ctxt = np.linalg.norm(ctxt-goal_pos)**2 - cost = dist # +dist_ctxt - return cost - def check_ball_in_cup(self, sim, ball_collision_id): cup_base_collision_id = sim.model._geom_name2id["cup_base_contact"] for coni in range(0, sim.data.ncon): diff --git a/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup_simple.py b/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup_simple.py index 9161f17..1a25211 100644 --- a/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup_simple.py +++ b/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup_simple.py @@ -1,38 +1,41 @@ -from gym.envs.mujoco import mujoco_env +from alr_envs.mujoco import alr_mujoco_env from gym import utils, spaces import os import numpy as np from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward import BallInACupReward -import mujoco_py -class ALRBallInACupEnv(mujoco_env.MujocoEnv, utils.EzPickle): - def __init__(self, pd_control=True): +class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle): + def __init__(self, reward_function=None): self._steps = 0 - self.pd_control = pd_control self.xml_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", - "ball-in-a-cup_base" + ".xml") + "biac_base" + ".xml") self.sim_time = 8 # seconds self.sim_steps = int(self.sim_time / (0.0005 * 4)) # circular dependency.. 
sim.dt <-> mujocoenv init <-> reward fct - self.reward_function = BallInACupReward(self.sim_steps) + self.reward_function = reward_function(self.sim_steps) self.start_pos = np.array([0.0, 0.58760536, 0.0, 1.36004913, 0.0, -0.32072943, -1.57]) self.start_vel = np.zeros(7) - # self.start_pos = np.array([0.58760536, 1.36004913, -0.32072943]) + self._q_pos = [] self._q_vel = [] # self.weight_matrix_scale = 50 - self.p_gains = 1*np.array([200, 300, 100, 100, 10, 10, 2.5]) - self.d_gains = 1*np.array([7, 15, 5, 2.5, 0.3, 0.3, 0.05]) + self.max_ctrl = np.array([150., 125., 40., 60., 5., 5., 2.]) + self.p_gains = 1 / self.max_ctrl * np.array([200, 300, 100, 100, 10, 10, 2.5]) + self.d_gains = 1 / self.max_ctrl * np.array([7, 15, 5, 2.5, 0.3, 0.3, 0.05]) self.j_min = np.array([-2.6, -1.985, -2.8, -0.9, -4.55, -1.5707, -2.7]) self.j_max = np.array([2.6, 1.985, 2.8, 3.14159, 1.25, 1.5707, 2.7]) + self.context = None + utils.EzPickle.__init__(self) - mujoco_env.MujocoEnv.__init__(self, os.path.join(os.path.dirname(__file__), "assets", "ball-in-a-cup_base.xml"), - frame_skip=4) + alr_mujoco_env.AlrMujocoEnv.__init__(self, + self.xml_path, + apply_gravity_comp=True, + n_substeps=4) @property def current_pos(self): @@ -42,96 +45,68 @@ class ALRBallInACupEnv(mujoco_env.MujocoEnv, utils.EzPickle): def current_vel(self): return self.sim.data.qvel[0:7].copy() + def configure(self, context): + self.context = context + def reset_model(self): init_pos_all = self.init_qpos.copy() init_pos_robot = self.start_pos init_vel = np.zeros_like(init_pos_all) ball_id = self.sim.model._body_name2id["ball"] goal_final_id = self.sim.model._site_name2id["cup_goal_final"] - # self.set_state(start_pos, start_vel) + self._steps = 0 self.reward_function.reset() self._q_pos = [] self._q_vel = [] + start_pos = init_pos_all + start_pos[0:7] = init_pos_robot + + self.set_state(start_pos, init_vel) + # Reset the system - self.sim.data.qpos[:] = init_pos_all - self.sim.data.qvel[:] = init_vel - self.sim.data.qpos[0:7] = init_pos_robot + # self.sim.data.qpos[:] = init_pos_all + # self.sim.data.qvel[:] = init_vel + # self.sim.data.qpos[0:7] = init_pos_robot + # + # self.sim.step() + # + # self.sim.data.qpos[:] = init_pos_all + # self.sim.data.qvel[:] = init_vel + # self.sim.data.qpos[0:7] = init_pos_robot + # self.sim.data.body_xpos[ball_id, :] = np.copy(self.sim.data.site_xpos[goal_final_id, :]) - np.array([0., 0., 0.329]) + # + # # Stabilize the system around the initial position + # for i in range(0, 500): + # self.sim.data.qpos[7:] = 0. + # self.sim.data.qvel[7:] = 0. 
+ # # self.sim.data.qpos[7] = -0.2 + # cur_pos = self.sim.data.qpos[0:7].copy() + # cur_vel = self.sim.data.qvel[0:7].copy() + # trq = self.p_gains * (init_pos_robot - cur_pos) + self.d_gains * (np.zeros_like(init_pos_robot) - cur_vel) + # self.sim.data.qfrc_applied[0:7] = trq + self.sim.data.qfrc_bias[:7].copy() + # self.sim.step() + # self.render() + # + # for i in range(0, 500): + # cur_pos = self.sim.data.qpos[0:7].copy() + # cur_vel = self.sim.data.qvel[0:7].copy() + # trq = self.p_gains * (init_pos_robot - cur_pos) + self.d_gains * (np.zeros_like(init_pos_robot) - cur_vel) + # self.sim.data.qfrc_applied[0:7] = trq + self.sim.data.qfrc_bias[:7].copy() + # self.sim.step() + # self.render() - self.sim.step() - - self.sim.data.qpos[:] = init_pos_all - self.sim.data.qvel[:] = init_vel - self.sim.data.qpos[0:7] = init_pos_robot - self.sim.data.body_xpos[ball_id, :] = np.copy(self.sim.data.site_xpos[goal_final_id, :]) - np.array([0., 0., 0.329]) - - # Stabilize the system around the initial position - for i in range(0, 2000): - self.sim.data.qpos[7:] = 0. - self.sim.data.qvel[7:] = 0. - # self.sim.data.qpos[7] = -0.2 - cur_pos = self.sim.data.qpos[0:7].copy() - cur_vel = self.sim.data.qvel[0:7].copy() - trq = self.p_gains * (init_pos_robot - cur_pos) + self.d_gains * (np.zeros_like(init_pos_robot) - cur_vel) - self.sim.data.qfrc_applied[0:7] = trq + self.sim.data.qfrc_bias[:7].copy() - self.sim.step() - # self.render() - - for i in range(0, 2000): - cur_pos = self.sim.data.qpos[0:7].copy() - cur_vel = self.sim.data.qvel[0:7].copy() - trq = self.p_gains * (init_pos_robot - cur_pos) + self.d_gains * (np.zeros_like(init_pos_robot) - cur_vel) - self.sim.data.qfrc_applied[0:7] = trq + self.sim.data.qfrc_bias[:7].copy() - self.sim.step() - # self.render() - - def do_simulation(self, ctrl, n_frames): - # cur_pos = self.sim.data.qpos[0:7].copy() - # cur_vel = self.sim.data.qvel[0:7].copy() - # des_pos = ctrl[:7] - # des_vel = ctrl[7:] - # trq = self.p_gains * (des_pos - cur_pos) + self.d_gains * (des_vel - cur_vel) - if self.pd_control: - self.sim.data.qfrc_applied[0:7] = ctrl + self.sim.data.qfrc_bias[:7].copy() - else: - self.sim.data.ctrl[:] = ctrl - - for _ in range(n_frames): - try: - self.sim.step() - except mujoco_py.builder.MujocoException as e: - print("Error in simulation: " + str(e)) - # error = True - # Copy the current torque as if it would have been applied until the end of the trajectory - # for i in range(k + 1, sim_time): - # torques.append(trq) - return True - - return False + return self._get_obs() def step(self, a): - # Apply gravity compensation - # if not np.all(self.sim.data.qfrc_applied[:7] == self.sim.data.qfrc_bias[:7]): - # self.sim.data.qfrc_applied[:7] = self.sim.data.qfrc_bias[:7] - reward_dist = 0.0 angular_vel = 0.0 - # if self._steps >= self.steps_before_reward: - # vec = self.get_body_com("fingertip") - self.get_body_com("target") - # reward_dist -= self.reward_weight * np.linalg.norm(vec) - # angular_vel -= np.linalg.norm(self.sim.data.qvel.flat[:self.n_links]) reward_ctrl = - np.square(a).sum() - # reward_balance = - self.balance_weight * np.abs( - # angle_normalize(np.sum(self.sim.data.qpos.flat[:self.n_links]), type="rad")) - # - # reward = reward_dist + reward_ctrl + angular_vel + reward_balance - # self.do_simulation(a, self.frame_skip) + crash = self.do_simulation(a) joint_cons_viol = self.check_traj_in_joint_limits() - crash = self.do_simulation(a, self.frame_skip) - self._q_pos.append(self.sim.data.qpos[0:7].ravel().copy()) 
self._q_vel.append(self.sim.data.qvel[0:7].ravel().copy()) @@ -153,17 +128,7 @@ class ALRBallInACupEnv(mujoco_env.MujocoEnv, utils.EzPickle): def check_traj_in_joint_limits(self): return any(self.current_pos > self.j_max) or any(self.current_pos < self.j_min) - def check_collision(self): - for coni in range(0, sim.data.ncon): - con = sim.data.contact[coni] - - collision = con.geom1 in self.collision_ids and con.geom2 == self.ball_collision_id - collision_trans = con.geom1 == self.ball_collision_id and con.geom2 in self.collision_ids - - if collision or collision_trans: - return True - return False - + # TODO: extend observation space def _get_obs(self): theta = self.sim.data.qpos.flat[:7] return np.concatenate([ @@ -187,8 +152,9 @@ class ALRBallInACupEnv(mujoco_env.MujocoEnv, utils.EzPickle): des_vel_full[5] = des_vel[2] return des_vel_full + if __name__ == "__main__": - env = ALRBallInACupEnv() + env = ALRBallInACupEnv(reward_function=BallInACupReward) env.reset() env.render() for i in range(4000): @@ -196,7 +162,8 @@ if __name__ == "__main__": # test with random actions # ac = 0.1 * env.action_space.sample() # ac = -np.array([i, i, i]) / 10000 + np.array([env.start_pos[1], env.start_pos[3], env.start_pos[5]]) - ac = np.array([0., -0.1, 0, 0, 0, 0, 0]) + # ac = np.array([0., -10, 0, -1, 0, 1, 0]) + ac = np.array([0.,0., 0, 0, 0, 0, 0]) # ac[0] += np.pi/2 obs, rew, d, info = env.step(ac) env.render() diff --git a/alr_envs/utils/dmp_async_vec_env.py b/alr_envs/utils/dmp_async_vec_env.py index 56f4ca7..771b0d8 100644 --- a/alr_envs/utils/dmp_async_vec_env.py +++ b/alr_envs/utils/dmp_async_vec_env.py @@ -24,14 +24,14 @@ class DmpAsyncVectorEnv(gym.vector.AsyncVectorEnv): n=n_samples, fn=np.zeros) - def __call__(self, params): - return self.rollout(params) + def __call__(self, params, contexts=None): + return self.rollout(params, contexts) - def rollout_async(self, actions): + def rollout_async(self, params, contexts): """ Parameters ---------- - actions : iterable of samples from `action_space` + params : iterable of samples from `action_space` List of actions. 
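+        contexts : iterable or None
+            Optional per-rollout contexts. Each entry is forwarded to the
+            corresponding wrapped environment's `configure()` before its
+            rollout; `None` (the default via `__call__`) gives the
+            non-contextual behaviour.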
""" self._assert_is_running() @@ -40,11 +40,17 @@ class DmpAsyncVectorEnv(gym.vector.AsyncVectorEnv): 'for a pending call to `{0}` to complete.'.format( self._state.value), self._state.value) - actions = np.atleast_2d(actions) - split_actions = np.array_split(actions, np.minimum(len(actions), self.num_envs)) - for pipe, action in zip(self.parent_pipes, split_actions): - pipe.send(('rollout', action)) - for pipe in self.parent_pipes[len(split_actions):]: + params = np.atleast_2d(params) + split_params = np.array_split(params, np.minimum(len(params), self.num_envs)) + if contexts is None: + split_contexts = np.array_split([None, ] * len(params), np.minimum(len(params), self.num_envs)) + else: + split_contexts = np.array_split(contexts, np.minimum(len(contexts), self.num_envs)) + + assert np.all([len(p) == len(c) for p, c in zip(split_params, split_contexts)]) + for pipe, param, context in zip(self.parent_pipes, split_params, split_contexts): + pipe.send(('rollout', (param, context))) + for pipe in self.parent_pipes[len(split_params):]: pipe.send(('idle', None)) self._state = AsyncState.WAITING_ROLLOUT @@ -98,8 +104,8 @@ class DmpAsyncVectorEnv(gym.vector.AsyncVectorEnv): return np.array(rewards), infos - def rollout(self, actions): - self.rollout_async(actions) + def rollout(self, actions, contexts): + self.rollout_async(actions, contexts) return self.rollout_wait() @@ -123,8 +129,8 @@ def _worker(index, env_fn, pipe, parent_pipe, shared_memory, error_queue): rewards = [] dones = [] infos = [] - for d in data: - observation, reward, done, info = env.rollout(d) + for p, c in zip(*data): + observation, reward, done, info = env.rollout(p, c) observations.append(observation) rewards.append(reward) dones.append(done) diff --git a/alr_envs/utils/dmp_env_wrapper.py b/alr_envs/utils/dmp_env_wrapper.py index 823df83..3b461f7 100644 --- a/alr_envs/utils/dmp_env_wrapper.py +++ b/alr_envs/utils/dmp_env_wrapper.py @@ -5,7 +5,19 @@ import numpy as np import gym -class DmpEnvWrapperBase(gym.Wrapper): +def get_policy_class(policy_type): + if policy_type == "motor": + from alr_envs.utils.policies import PDController + return PDController + elif policy_type == "velocity": + from alr_envs.utils.policies import VelController + return VelController + elif policy_type == "position": + from alr_envs.utils.policies import PosController + return PosController + + +class DmpEnvWrapper(gym.Wrapper): def __init__(self, env, num_dof, @@ -17,8 +29,9 @@ class DmpEnvWrapperBase(gym.Wrapper): dt=0.01, learn_goal=False, post_traj_time=0., - policy=None): - super(DmpEnvWrapperBase, self).__init__(env) + policy_type=None, + weights_scale=1.): + super(DmpEnvWrapper, self).__init__(env) self.num_dof = num_dof self.num_basis = num_basis self.dim = num_dof * num_basis @@ -49,17 +62,19 @@ class DmpEnvWrapperBase(gym.Wrapper): dmp_goal_pos = final_pos self.dmp.set_weights(dmp_weights, dmp_goal_pos) + self.weights_scale = weights_scale - self.policy = policy + policy_class = get_policy_class(policy_type) + self.policy = policy_class(env) - def __call__(self, params): + def __call__(self, params, contexts=None): params = np.atleast_2d(params) observations = [] rewards = [] dones = [] infos = [] - for p in params: - observation, reward, done, info = self.rollout(p) + for p, c in zip(params, contexts): + observation, reward, done, info = self.rollout(p, c) observations.append(observation) rewards.append(reward) dones.append(done) @@ -81,82 +96,11 @@ class DmpEnvWrapperBase(gym.Wrapper): goal_pos = None weight_matrix = np.reshape(params, 
[self.num_basis, self.num_dof]) - return goal_pos, weight_matrix + return goal_pos, weight_matrix * self.weights_scale - def rollout(self, params, render=False): + def rollout(self, params, context=None, render=False): """ This function generates a trajectory based on a DMP and then does the usual loop over reset and step""" - raise NotImplementedError - - -class DmpEnvWrapperPos(DmpEnvWrapperBase): - """ - Wrapper for gym environments which creates a trajectory in joint angle space - """ - def rollout(self, action, render=False): - goal_pos, weight_matrix = self.goal_and_weights(action) - if hasattr(self.env, "weight_matrix_scale"): - weight_matrix = weight_matrix * self.env.weight_matrix_scale - self.dmp.set_weights(weight_matrix, goal_pos) - trajectory, _ = self.dmp.reference_trajectory(self.t) - - if self.post_traj_steps > 0: - trajectory = np.vstack([trajectory, np.tile(trajectory[-1, :], [self.post_traj_steps, 1])]) - - self._trajectory = trajectory - - rews = [] - - self.env.reset() - - for t, traj in enumerate(trajectory): - obs, rew, done, info = self.env.step(traj) - rews.append(rew) - if render: - self.env.render(mode="human") - if done: - break - - reward = np.sum(rews) - - return obs, reward, done, info - - -class DmpEnvWrapperVel(DmpEnvWrapperBase): - """ - Wrapper for gym environments which creates a trajectory in joint velocity space - """ - def rollout(self, action, render=False): - goal_pos, weight_matrix = self.goal_and_weights(action) - if hasattr(self.env, "weight_matrix_scale"): - weight_matrix = weight_matrix * self.env.weight_matrix_scale - self.dmp.set_weights(weight_matrix, goal_pos) - _, velocities = self.dmp.reference_trajectory(self.t) - - rews = [] - infos = [] - - self.env.reset() - - for t, vel in enumerate(velocities): - obs, rew, done, info = self.env.step(vel) - rews.append(rew) - infos.append(info) - if render: - self.env.render(mode="human") - if done: - break - - reward = np.sum(rews) - - return obs, reward, done, info - - -class DmpEnvWrapperPD(DmpEnvWrapperBase): - """ - Wrapper for gym environments which creates a trajectory in joint velocity space - """ - def rollout(self, action, render=False): - goal_pos, weight_matrix = self.goal_and_weights(action) + goal_pos, weight_matrix = self.goal_and_weights(params) if hasattr(self.env, "weight_matrix_scale"): weight_matrix = weight_matrix * self.env.weight_matrix_scale self.dmp.set_weights(weight_matrix, goal_pos) @@ -173,9 +117,11 @@ class DmpEnvWrapperPD(DmpEnvWrapperBase): infos = [] self.env.reset() + if context is not None: + self.env.configure(context) for t, pos_vel in enumerate(zip(trajectory, velocity)): - ac = self.policy.get_action(self.env, pos_vel[0], pos_vel[1]) + ac = self.policy.get_action(pos_vel[0], pos_vel[1]) obs, rew, done, info = self.env.step(ac) rews.append(rew) infos.append(info) diff --git a/alr_envs/utils/policies.py b/alr_envs/utils/policies.py index 5d251fa..0a4481c 100644 --- a/alr_envs/utils/policies.py +++ b/alr_envs/utils/policies.py @@ -1,15 +1,37 @@ -class PDController: - def __init__(self, p_gains, d_gains): - self.p_gains = p_gains - self.d_gains = d_gains +from alr_envs.mujoco.alr_mujoco_env import AlrMujocoEnv - def get_action(self, env, des_pos, des_vel): + +class BaseController: + def __init__(self, env: AlrMujocoEnv): + self.env = env + + def get_action(self, des_pos, des_vel): + raise NotImplementedError + + +class PosController(BaseController): + def get_action(self, des_pos, des_vel): + return des_pos + + +class VelController(BaseController): + def 
get_action(self, des_pos, des_vel): + return des_vel + + +class PDController(BaseController): + def __init__(self, env): + self.p_gains = env.p_gains + self.d_gains = env.d_gains + super(PDController, self).__init__(env) + + def get_action(self, des_pos, des_vel): # TODO: make standardized ALRenv such that all of them have current_pos/vel attributes - cur_pos = env.current_pos - cur_vel = env.current_vel + cur_pos = self.env.current_pos + cur_vel = self.env.current_vel if len(des_pos) != len(cur_pos): - des_pos = env.extend_des_pos(des_pos) + des_pos = self.env.extend_des_pos(des_pos) if len(des_vel) != len(cur_vel): - des_vel = env.extend_des_vel(des_vel) + des_vel = self.env.extend_des_vel(des_vel) trq = self.p_gains * (des_pos - cur_pos) + self.d_gains * (des_vel - cur_vel) return trq diff --git a/dmp_pd_control_example.py b/dmp_pd_control_example.py index 29e7cda..9c4bec0 100644 --- a/dmp_pd_control_example.py +++ b/dmp_pd_control_example.py @@ -1,7 +1,7 @@ -from alr_envs.utils.dmp_env_wrapper import DmpEnvWrapperPD -from alr_envs.utils.policies import PDController +from alr_envs.utils.dmp_env_wrapper import DmpEnvWrapper from alr_envs.utils.dmp_async_vec_env import DmpAsyncVectorEnv, _worker -from alr_envs.mujoco.reacher.alr_reacher import ALRReacherEnv +from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_simple import ALRBallInACupEnv +from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward import BallInACupReward import numpy as np @@ -17,32 +17,42 @@ if __name__ == "__main__": :param rank: (int) index of the subprocess """ def _init(): - p_gains = np.array([100, 100, 100, 100, 100]) - d_gains = np.array([1, 1, 1, 1, 1]) - policy = PDController(p_gains, d_gains) + _env = ALRBallInACupEnv(BallInACupReward) - env = ALRReacherEnv() - - env = DmpEnvWrapperPD(env, - num_dof=5, - num_basis=5, - duration=4, - dt=env.dt, - learn_goal=False, - start_pos=env.init_qpos[:5], - final_pos=env.init_qpos[:5], - alpha_phase=2, - policy=policy - ) - env.seed(seed + rank) - return env + _env = DmpEnvWrapper(_env, + num_dof=3, + num_basis=8, + duration=3.5, + alpha_phase=3, + dt=_env.dt, + learn_goal=False, + start_pos=_env.start_pos[1::2], + final_pos=_env.start_pos[1::2], + policy_type="motor" + # contextual=True + ) + _env.seed(seed + rank) + return _env return _init - dim = 25 - env = make_env(0, 0)() + dim = 24 + + n_samples = 10 + + vec_env = DmpAsyncVectorEnv([make_env(i) for i in range(4)], + n_samples=n_samples, + context="spawn", + shared_memory=False, + worker=_worker) + + params = 10 * np.random.randn(n_samples, dim) + + out = vec_env(params) + + non_vec_env = make_env(0, 0)() params = 10 * np.random.randn(dim) - out = env.rollout(params, render=True) + out2 = non_vec_env.rollout(params, render=True) - print(out) + print(out2)
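
Note on the contextual path: the example above only exercises non-contextual rollouts (`vec_env(params)` and `non_vec_env.rollout(params, render=True)`). The following is a minimal sketch, not part of the patch, of how per-rollout contexts could be threaded through the new API. It assumes the `vec_env`, `non_vec_env`, `n_samples` and `dim` objects constructed in the `__main__` block of `dmp_pd_control_example.py` above; the variable names `batch_params`, `single_params` and the placeholder `contexts` list are introduced here for illustration, and the contexts are left as `None` since `ALRBallInACupEnv.configure()` currently only stores the context and the reward function merely accepts it as an optional argument.

    # hypothetical continuation of dmp_pd_control_example.py (not part of the patch)
    contexts = [None] * n_samples              # placeholder contexts; replace with task-specific values
    batch_params = 10 * np.random.randn(n_samples, dim)

    # vectorized: params and contexts are split across the worker environments;
    # each worker calls DmpEnvWrapper.rollout(p, c), which forwards c to env.configure()
    rewards, infos = vec_env(batch_params, contexts)

    # single environment: the context is passed explicitly to rollout()
    single_params = 10 * np.random.randn(dim)
    obs, reward, done, info = non_vec_env.rollout(single_params, context=contexts[0], render=True)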