diff --git a/alr_envs/__init__.py b/alr_envs/__init__.py
index fd79c25..2d118e4 100644
--- a/alr_envs/__init__.py
+++ b/alr_envs/__init__.py
@@ -71,6 +71,24 @@ register(
     }
 )
 
+register(
+    id='ALRBallInACupSimple-v0',
+    entry_point='alr_envs.mujoco:ALRBallInACupEnv',
+    max_episode_steps=4000,
+    kwargs={
+        "reward_type": "simple"
+    }
+)
+
+register(
+    id='ALRBallInACupGoal-v0',
+    entry_point='alr_envs.mujoco:ALRBallInACupEnv',
+    max_episode_steps=4000,
+    kwargs={
+        "reward_type": "contextual_goal"
+    }
+)
+
 # Classic control
 
 register(
@@ -180,18 +198,36 @@ register(
     }
 )
 
+# register(
+#     id='HoleReacherDetPMP-v0',
+#     entry_point='alr_envs.classic_control.hole_reacher:holereacher_detpmp',
+#     # max_episode_steps=1,
+#     # TODO: add mp kwargs
+# )
+
 register(
-    id='HoleReacherDetPMP-v0',
-    entry_point='alr_envs.classic_control.hole_reacher:holereacher_detpmp',
-    # max_episode_steps=1,
-    # TODO: add mp kwargs
+    id='ALRBallInACupSimpleDMP-v0',
+    entry_point='alr_envs.utils.make_env_helpers:make_dmp_env',
+    kwargs={
+        "name": "alr_envs:ALRBallInACupSimple-v0",
+        "num_dof": 3,
+        "num_basis": 5,
+        "duration": 3.5,
+        "post_traj_time": 4.5,
+        "learn_goal": False,
+        "alpha_phase": 3,
+        "bandwidth_factor": 2.5,
+        "policy_type": "motor",
+        "weights_scale": 100,
+        "return_to_start": True
+    }
 )
 
 register(
-    id='BiacSimpleDMP-v0',
+    id='ALRBallInACupGoalDMP-v0',
    entry_point='alr_envs.utils.make_env_helpers:make_dmp_env',
     kwargs={
-        "name": "alr_envs:HoleReacher-v0",
+        "name": "alr_envs:ALRBallInACupGoal-v0",
         "num_dof": 5,
         "num_basis": 5,
         "duration": 2,
diff --git a/alr_envs/mujoco/__init__.py b/alr_envs/mujoco/__init__.py
index 185f0ed..eb7b41f 100644
--- a/alr_envs/mujoco/__init__.py
+++ b/alr_envs/mujoco/__init__.py
@@ -1,2 +1,3 @@
 from alr_envs.mujoco.reacher.alr_reacher import ALRReacherEnv
 from alr_envs.mujoco.balancing import BalancingEnv
+from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup import ALRBallInACupEnv
diff --git a/alr_envs/mujoco/alr_mujoco_env.py b/alr_envs/mujoco/alr_mujoco_env.py
index c58dfe7..edc90b6 100644
--- a/alr_envs/mujoco/alr_mujoco_env.py
+++ b/alr_envs/mujoco/alr_mujoco_env.py
@@ -67,6 +67,9 @@ class AlrMujocoEnv(gym.Env):
         self.init_qpos = self.sim.data.qpos.ravel().copy()
         self.init_qvel = self.sim.data.qvel.ravel().copy()
 
+        self._start_pos = None
+        self._start_vel = None
+
         self._set_action_space()
 
         # action = self.action_space.sample()
@@ -93,6 +96,20 @@ class AlrMujocoEnv(gym.Env):
         """
         return self.sim.data.qvel
 
+    @property
+    def start_pos(self):
+        """
+        Start position of the agent, for example joint angles of a Panda robot. Necessary for MP wrapped envs.
+        """
+        return self._start_pos
+
+    @property
+    def start_vel(self):
+        """
+        Start velocity of the agent. Necessary for MP wrapped envs.
+        """
+        return self._start_vel
+
     def extend_des_pos(self, des_pos):
         """
         In a simplified environment, the actions may only control a subset of all the joints.
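
Note on the start_pos / start_vel properties added above: they only expose private attributes, so every concrete environment is expected to fill these in itself (ALRBallInACupEnv does so further down in this diff). A minimal sketch of that pattern follows; the class name, XML path handling and joint values are illustrative only, not part of this change:

import numpy as np

from alr_envs.mujoco import alr_mujoco_env


class MyMPReadyEnv(alr_mujoco_env.AlrMujocoEnv):
    def __init__(self, xml_path, n_substeps=4):
        super().__init__(xml_path, apply_gravity_comp=True, n_substeps=n_substeps)
        # The MP wrappers read these via the start_pos / start_vel properties.
        self._start_pos = np.zeros(7)  # e.g. the robot's initial joint configuration
        self._start_vel = np.zeros(7)
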
diff --git a/alr_envs/mujoco/ball_in_a_cup/__init__.py b/alr_envs/mujoco/ball_in_a_cup/__init__.py
index b884e38..8b13789 100644
--- a/alr_envs/mujoco/ball_in_a_cup/__init__.py
+++ b/alr_envs/mujoco/ball_in_a_cup/__init__.py
@@ -1 +1 @@
-from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_simple import ALRBallInACupEnv
+
diff --git a/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup.py b/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup.py
index fa894e5..19c6295 100644
--- a/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup.py
+++ b/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup.py
@@ -5,15 +5,12 @@ from alr_envs.mujoco import alr_mujoco_env
 
 
 class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
-    def __init__(self, n_substeps=4, apply_gravity_comp=True, reward_function=None):
+    def __init__(self, n_substeps=4, apply_gravity_comp=True, reward_type: str = None, context: np.ndarray = None):
         self._steps = 0
 
         self.xml_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets",
                                      "biac_base" + ".xml")
 
-        self.start_pos = np.array([0.0, 0.58760536, 0.0, 1.36004913, 0.0, -0.32072943, -1.57])
-        self.start_vel = np.zeros(7)
-
         self._q_pos = []
         self._q_vel = []
         # self.weight_matrix_scale = 50
@@ -31,13 +28,21 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
                                         self.xml_path,
                                         apply_gravity_comp=apply_gravity_comp,
                                         n_substeps=n_substeps)
+        self._start_pos = np.array([0.0, 0.58760536, 0.0, 1.36004913, 0.0, -0.32072943, -1.57])
+        self._start_vel = np.zeros(7)
 
         self.sim_time = 8  # seconds
         self.sim_steps = int(self.sim_time / self.dt)
-        if reward_function is None:
+        if reward_type == "simple":
+            from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward_simple import BallInACupReward
+            reward_function = BallInACupReward
+        elif reward_type == "contextual_goal":
             from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward import BallInACupReward
             reward_function = BallInACupReward
+        else:
+            raise ValueError("Unknown reward type")
         self.reward_function = reward_function(self.sim_steps)
+        self.configure(context)
 
     @property
     def current_pos(self):
@@ -97,6 +102,7 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
     def check_traj_in_joint_limits(self):
         return any(self.current_pos > self.j_max) or any(self.current_pos < self.j_min)
 
+    # TODO: extend observation space
     def _get_obs(self):
         theta = self.sim.data.qpos.flat[:7]
         return np.concatenate([
@@ -106,6 +112,20 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
             np.cos(theta),
             np.sin(theta),
             # self.get_body_com("target"),  # only return target to make problem harder
             [self._steps],
         ])
+    # These functions are for the task with 3 joint actuations
+    def extend_des_pos(self, des_pos):
+        des_pos_full = self.start_pos.copy()
+        des_pos_full[1] = des_pos[0]
+        des_pos_full[3] = des_pos[1]
+        des_pos_full[5] = des_pos[2]
+        return des_pos_full
+
+    def extend_des_vel(self, des_vel):
+        des_vel_full = self.start_vel.copy()
+        des_vel_full[1] = des_vel[0]
+        des_vel_full[3] = des_vel[1]
+        des_vel_full[5] = des_vel[2]
+        return des_vel_full
 
 if __name__ == "__main__":
diff --git a/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup_reward.py b/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup_reward.py
index 6a9a574..e34aecf 100644
--- a/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup_reward.py
+++ b/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup_reward.py
@@ -40,9 +40,12 @@ class BallInACupReward(alr_reward_fct.AlrReward):
         self.ball_in_cup = False
         self.ball_above_threshold = False
         self.dist_ctxt = 3
+        self.action_costs = []
+        self.cup_angles = []
 
     def compute_reward(self, action, sim, step):
         action_cost = np.sum(np.square(action))
+        self.action_costs.append(action_cost)
 
         stop_sim = False
         success = False
 
@@ -54,7 +57,7 @@ class BallInACupReward(alr_reward_fct.AlrReward):
             self.collision_ids = [sim.model._geom_name2id[name] for name in self.collision_objects]
 
         if self.check_collision(sim):
-            reward = - 1e-4 * action_cost - 1000
+            reward = - 1e-3 * action_cost - 1000
             stop_sim = True
             return reward, success, stop_sim
 
@@ -65,8 +68,11 @@ class BallInACupReward(alr_reward_fct.AlrReward):
         self.dists.append(np.linalg.norm(goal_pos - ball_pos))
         self.dists_final.append(np.linalg.norm(goal_final_pos - ball_pos))
         self.dists_ctxt.append(np.linalg.norm(ball_pos - self.context))
-        self.ball_traj[step, :] = ball_pos
-        self.cup_traj[step, :] = goal_pos
+        self.ball_traj[step, :] = np.copy(ball_pos)
+        self.cup_traj[step, :] = np.copy(goal_pos)  # ?
+        cup_quat = np.copy(sim.data.body_xquat[sim.model._body_name2id["cup"]])
+        self.cup_angles.append(np.arctan2(2 * (cup_quat[0] * cup_quat[1] + cup_quat[2] * cup_quat[3]),
+                                          1 - 2 * (cup_quat[1] ** 2 + cup_quat[2] ** 2)))
 
         # Determine the first time when ball is in cup
         if not self.ball_in_cup:
@@ -77,25 +83,29 @@ class BallInACupReward(alr_reward_fct.AlrReward):
                 self.dist_ctxt = dist_to_ctxt
 
         if step == self.sim_time - 1:
+            t_min_dist = np.argmin(self.dists)
+            angle_min_dist = self.cup_angles[t_min_dist]
+            cost_angle = (angle_min_dist - np.pi / 2) ** 2
+
             min_dist = np.min(self.dists)
             dist_final = self.dists_final[-1]
             # dist_ctxt = self.dists_ctxt[-1]
 
-            # max distance between ball and cup and cup height at that time
-            ball_to_cup_diff = self.ball_traj[:, 2] - self.cup_traj[:, 2]
-            t_max_diff = np.argmax(ball_to_cup_diff)
-            t_max_ball_height = np.argmax(self.ball_traj[:, 2])
-            max_ball_height = np.max(self.ball_traj[:, 2])
+            # # max distance between ball and cup and cup height at that time
+            # ball_to_cup_diff = self.ball_traj[:, 2] - self.cup_traj[:, 2]
+            # t_max_diff = np.argmax(ball_to_cup_diff)
+            # t_max_ball_height = np.argmax(self.ball_traj[:, 2])
+            # max_ball_height = np.max(self.ball_traj[:, 2])
 
             # cost = self._get_stage_wise_cost(ball_in_cup, min_dist, dist_final, dist_ctxt)
-            cost = 0.3 * min_dist + 0.3 * dist_final + 0.3 * np.minimum(self.dist_ctxt, 3)
-            reward = np.exp(-1 * cost) - 1e-4 * action_cost
-            if max_ball_height < self.context[2] or ball_to_cup_diff[t_max_ball_height] < 0:
-                reward -= 1
+            cost = 0.5 * min_dist + 0.5 * dist_final + 0.3 * np.minimum(self.dist_ctxt, 3) + 0.01 * cost_angle
+            reward = np.exp(-2 * cost) - 1e-3 * action_cost
+            # if max_ball_height < self.context[2] or ball_to_cup_diff[t_max_ball_height] < 0:
+            #     reward -= 1
 
             success = dist_final < 0.05 and self.dist_ctxt < 0.05
         else:
-            reward = - 1e-4 * action_cost
+            reward = - 1e-3 * action_cost
             success = False
 
         return reward, success, stop_sim
diff --git a/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup_simple.py b/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup_simple.py
deleted file mode 100644
index 2356d9e..0000000
--- a/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup_simple.py
+++ /dev/null
@@ -1,151 +0,0 @@
-from alr_envs.mujoco import alr_mujoco_env
-from gym import utils
-import os
-import numpy as np
-
-
-class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
-    def __init__(self, n_substeps=4, apply_gravity_comp=True, reward_function=None):
-        self._steps = 0
-
-        self.xml_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets",
-                                     "biac_base" + ".xml")
-
-        self.start_pos = np.array([0.0, 0.58760536, 0.0, 1.36004913, 0.0, -0.32072943, -1.57])
-        self.start_vel = np.zeros(7)
-
-        self._q_pos = []
-        self._q_vel = []
-        # self.weight_matrix_scale = 50
-        self.max_ctrl = np.array([150., 125., 40., 60., 5., 5., 2.])
-        self.p_gains = 1 / self.max_ctrl * np.array([200, 300, 100, 100, 10, 10, 2.5])
-        self.d_gains = 1 / self.max_ctrl * np.array([7, 15, 5, 2.5, 0.3, 0.3, 0.05])
-
-        self.j_min = np.array([-2.6, -1.985, -2.8, -0.9, -4.55, -1.5707, -2.7])
-        self.j_max = np.array([2.6, 1.985, 2.8, 3.14159, 1.25, 1.5707, 2.7])
-
-        self.context = None
-
-        utils.EzPickle.__init__(self)
-        alr_mujoco_env.AlrMujocoEnv.__init__(self,
-                                             self.xml_path,
-                                             apply_gravity_comp=apply_gravity_comp,
-                                             n_substeps=n_substeps)
-
-        self.sim_time = 8  # seconds
-        self.sim_steps = int(self.sim_time / self.dt)
-        if reward_function is None:
-            from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward_simple import BallInACupReward
-            reward_function = BallInACupReward
-        self.reward_function = reward_function(self.sim_steps)
-
-    @property
-    def current_pos(self):
-        return self.sim.data.qpos[0:7].copy()
-
-    @property
-    def current_vel(self):
-        return self.sim.data.qvel[0:7].copy()
-
-    def configure(self, context):
-        self.context = context
-        self.reward_function.reset(context)
-
-    def reset_model(self):
-        init_pos_all = self.init_qpos.copy()
-        init_pos_robot = self.start_pos
-        init_vel = np.zeros_like(init_pos_all)
-        ball_id = self.sim.model._body_name2id["ball"]
-        goal_final_id = self.sim.model._site_name2id["cup_goal_final"]
-
-        self._steps = 0
-        self._q_pos = []
-        self._q_vel = []
-
-        start_pos = init_pos_all
-        start_pos[0:7] = init_pos_robot
-
-        self.set_state(start_pos, init_vel)
-
-        return self._get_obs()
-
-    def step(self, a):
-        reward_dist = 0.0
-        angular_vel = 0.0
-        reward_ctrl = - np.square(a).sum()
-
-        crash = self.do_simulation(a)
-        joint_cons_viol = self.check_traj_in_joint_limits()
-
-        self._q_pos.append(self.sim.data.qpos[0:7].ravel().copy())
-        self._q_vel.append(self.sim.data.qvel[0:7].ravel().copy())
-
-        ob = self._get_obs()
-
-        if not crash and not joint_cons_viol:
-            reward, success, collision = self.reward_function.compute_reward(a, self.sim, self._steps)
-            done = success or self._steps == self.sim_steps - 1 or collision
-            self._steps += 1
-        else:
-            reward = -1000
-            success = False
-            done = True
-        return ob, reward, done, dict(reward_dist=reward_dist,
-                                      reward_ctrl=reward_ctrl,
-                                      velocity=angular_vel,
-                                      traj=self._q_pos, is_success=success,
-                                      is_collided=crash or joint_cons_viol)
-
-    def check_traj_in_joint_limits(self):
-        return any(self.current_pos > self.j_max) or any(self.current_pos < self.j_min)
-
-    # TODO: extend observation space
-    def _get_obs(self):
-        theta = self.sim.data.qpos.flat[:7]
-        return np.concatenate([
-            np.cos(theta),
-            np.sin(theta),
-            # self.get_body_com("target"),  # only return target to make problem harder
-            [self._steps],
-        ])
-
-    def extend_des_pos(self, des_pos):
-        des_pos_full = self.start_pos.copy()
-        des_pos_full[1] = des_pos[0]
-        des_pos_full[3] = des_pos[1]
-        des_pos_full[5] = des_pos[2]
-        return des_pos_full
-
-    def extend_des_vel(self, des_vel):
-        des_vel_full = self.start_vel.copy()
-        des_vel_full[1] = des_vel[0]
-        des_vel_full[3] = des_vel[1]
-        des_vel_full[5] = des_vel[2]
-        return des_vel_full
-
-
-if __name__ == "__main__":
-    from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward_simple import BallInACupReward
-
-    env = ALRBallInACupEnv(reward_function=BallInACupReward)
-    env.configure(None)
-    env.reset()
-    env.render()
-    for i in range(4000):
-        # objective.load_result("/tmp/cma")
-        # test with random actions
-        # ac = 0.1 * env.action_space.sample()
-        # ac = -np.array([i, i, i]) / 10000 + np.array([env.start_pos[1], env.start_pos[3], env.start_pos[5]])
-        # ac = np.array([0., -10, 0, -1, 0, 1, 0])
-        ac = np.array([0., 0., 0, 0, 0, 0, 0])
-        # ac[0] += np.pi/2
-        obs, rew, d, info = env.step(ac)
-        env.render()
-
-        print(rew)
-
-        if d:
-            break
-
-    env.close()
-
diff --git a/alr_envs/mujoco/ball_in_a_cup/utils.py b/alr_envs/mujoco/ball_in_a_cup/utils.py
index 1f421f5..bfec3cf 100644
--- a/alr_envs/mujoco/ball_in_a_cup/utils.py
+++ b/alr_envs/mujoco/ball_in_a_cup/utils.py
@@ -1,7 +1,6 @@
 from alr_envs.utils.wrapper.detpmp_wrapper import DetPMPWrapper
 from alr_envs.utils.wrapper.dmp_wrapper import DmpWrapper
 from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup import ALRBallInACupEnv
-from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_simple import ALRBallInACupEnv as ALRBallInACupEnvSimple
 
 
 def make_contextual_env(rank, seed=0):
@@ -16,7 +15,7 @@ def make_contextual_env(rank, seed=0):
     """
 
     def _init():
-        env = ALRBallInACupEnv()
+        env = ALRBallInACupEnv(reward_type="contextual_goal")
 
         env = DetPMPWrapper(env,
                             num_dof=7,
@@ -50,7 +49,7 @@ def make_env(rank, seed=0):
     """
 
     def _init():
-        env = ALRBallInACupEnvSimple()
+        env = ALRBallInACupEnv(reward_type="simple")
 
         env = DetPMPWrapper(env,
                             num_dof=7,
@@ -84,7 +83,7 @@ def make_simple_env(rank, seed=0):
     """
 
     def _init():
-        env = ALRBallInACupEnvSimple()
+        env = ALRBallInACupEnv(reward_type="simple")
 
         env = DetPMPWrapper(env,
                             num_dof=3,
@@ -119,7 +118,7 @@ def make_simple_dmp_env(rank, seed=0):
     """
 
    def _init():
-        _env = ALRBallInACupEnvSimple()
+        _env = ALRBallInACupEnv(reward_type="simple")
 
         _env = DmpWrapper(_env,
                           num_dof=3,
diff --git a/alr_envs/utils/mp_env_async_sampler.py b/alr_envs/utils/mp_env_async_sampler.py
new file mode 100644
index 0000000..68d7e2c
--- /dev/null
+++ b/alr_envs/utils/mp_env_async_sampler.py
@@ -0,0 +1,82 @@
+import gym
+from gym.vector.async_vector_env import AsyncVectorEnv
+import numpy as np
+from collections import defaultdict
+
+
+def make_env(env_id, rank, seed=0):
+    env = gym.make(env_id)
+    env.seed(seed + rank)
+    return lambda: env
+
+
+def split_array(ary, size):
+    n_samples = len(ary)
+    if n_samples < size:
+        tmp = np.zeros((size, ary.shape[1]))
+        tmp[0:n_samples] = ary
+        return [tmp]
+    elif n_samples == size:
+        return [ary]
+    else:
+        repeat = int(np.ceil(n_samples / size))
+        split = [k * size for k in range(1, repeat)]
+        sub_arys = np.split(ary, split)
+
+        # pad the last chunk with zeros whenever it is shorter than size
+        if n_samples % size != 0:
+            tmp = np.zeros_like(sub_arys[0])
+            last = sub_arys[-1]
+            tmp[0: len(last)] = last
+            sub_arys[-1] = tmp
+
+        return sub_arys
+
+
+def _flatten_list(l):
+    assert isinstance(l, (list, tuple))
+    assert len(l) > 0
+    assert all([len(l_) > 0 for l_ in l])
+
+    return [l__ for l_ in l for l__ in l_]
+
+
+class AlrMpEnvSampler:
+    """
+    An asynchronous sampler for MPWrapper environments. A sampler object can be called with a set of parameters and
+    returns the corresponding final obs, rewards, dones and info dicts.
+ """ + def __init__(self, env_id, num_envs, seed=0): + self.num_envs = num_envs + self.env = AsyncVectorEnv([make_env(env_id, seed, i) for i in range(num_envs)]) + + def __call__(self, params): + params = np.atleast_2d(params) + n_samples = params.shape[0] + split_params = split_array(params, self.num_envs) + + vals = defaultdict(list) + for p in split_params: + obs, reward, done, info = self.env.step(p) + vals['obs'].append(obs) + vals['reward'].append(reward) + vals['done'].append(done) + vals['info'].append(info) + + # do not return values above threshold + return np.vstack(vals['obs'])[:n_samples], np.hstack(vals['reward'])[:n_samples],\ + _flatten_list(vals['done'])[:n_samples], _flatten_list(vals['info'])[:n_samples] + + +if __name__ == "__main__": + env_name = "alr_envs:HoleReacherDMP-v0" + n_cpu = 8 + dim = 30 + n_samples = 20 + + sampler = AlrMpEnvSampler(env_name, num_envs=n_cpu) + + thetas = np.random.randn(n_samples, dim) # usually form a search distribution + + _, rewards, __, ___ = sampler(thetas) + + print(rewards) diff --git a/alr_envs/utils/wrapper/dmp_wrapper.py b/alr_envs/utils/wrapper/dmp_wrapper.py index 7333278..d6c8f92 100644 --- a/alr_envs/utils/wrapper/dmp_wrapper.py +++ b/alr_envs/utils/wrapper/dmp_wrapper.py @@ -11,8 +11,9 @@ class DmpWrapper(MPWrapper): def __init__(self, env: gym.Env, num_dof: int, num_basis: int, start_pos: np.ndarray = None, final_pos: np.ndarray = None, duration: int = 1, alpha_phase: float = 2., dt: float = None, - learn_goal: bool = False, post_traj_time: float = 0., policy_type: str = None, - weights_scale: float = 1., goal_scale: float = 1., bandwidth_factor: float = 3.): + learn_goal: bool = False, return_to_start: bool = False, post_traj_time: float = 0., + weights_scale: float = 1., goal_scale: float = 1., bandwidth_factor: float = 3., + policy_type: str = None): """ This Wrapper generates a trajectory based on a DMP and will only return episodic performances. @@ -34,8 +35,13 @@ class DmpWrapper(MPWrapper): self.learn_goal = learn_goal dt = env.dt if hasattr(env, "dt") else dt assert dt is not None - start_pos = env.start_pos if hasattr(env, "start_pos") else start_pos + start_pos = start_pos if start_pos is not None else env.start_pos if hasattr(env, "start_pos") else None assert start_pos is not None + if learn_goal: + final_pos = np.zeros_like(start_pos) # arbitrary, will be learned + else: + final_pos = final_pos if final_pos is not None else start_pos if return_to_start else None + assert final_pos is not None self.t = np.linspace(0, duration, int(duration / dt)) self.goal_scale = goal_scale diff --git a/alr_envs/utils/wrapper/mp_wrapper.py b/alr_envs/utils/wrapper/mp_wrapper.py index c75134c..769a868 100644 --- a/alr_envs/utils/wrapper/mp_wrapper.py +++ b/alr_envs/utils/wrapper/mp_wrapper.py @@ -46,8 +46,9 @@ class MPWrapper(gym.Wrapper, ABC): rewards = [] dones = [] infos = [] - for p, c in zip(params, contexts): - self.configure(c) + # for p, c in zip(params, contexts): + for p in params: + # self.configure(c) ob, reward, done, info = self.step(p) obs.append(ob) rewards.append(reward)