diff --git a/alr_envs/mujoco/ball_in_a_cup/assets/biac_base.xml b/alr_envs/mujoco/ball_in_a_cup/assets/biac_base.xml
index c8c2643..490c853 100644
--- a/alr_envs/mujoco/ball_in_a_cup/assets/biac_base.xml
+++ b/alr_envs/mujoco/ball_in_a_cup/assets/biac_base.xml
@@ -327,7 +327,7 @@
-
+
diff --git a/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup.py b/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup.py
index 646c433..e59c87d 100644
--- a/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup.py
+++ b/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup.py
@@ -41,54 +41,63 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
         reward_function = BallInACupReward
         self.reward_function = reward_function(self.sim_steps)
 
+    @property
+    def current_pos(self):
+        return self.sim.data.qpos[0:7].copy()
+
+    @property
+    def current_vel(self):
+        return self.sim.data.qvel[0:7].copy()
+
     def configure(self, context):
         self.context = context
         self.reward_function.reset(context)
 
     def reset_model(self):
-        start_pos = self.init_qpos.copy()
-        start_pos[0:7] = self.start_pos
-        start_vel = np.zeros_like(start_pos)
-        self.set_state(start_pos, start_vel)
+        init_pos_all = self.init_qpos.copy()
+        init_pos_robot = self.start_pos
+        init_vel = np.zeros_like(init_pos_all)
+
         self._steps = 0
         self._q_pos = []
+        self._q_vel = []
+
+        start_pos = init_pos_all
+        start_pos[0:7] = init_pos_robot
+
+        self.set_state(start_pos, init_vel)
+
+        return self._get_obs()
 
     def step(self, a):
-        # Apply gravity compensation
-        if not np.all(self.sim.data.qfrc_applied[:7] == self.sim.data.qfrc_bias[:7]):
-            self.sim.data.qfrc_applied[:7] = self.sim.data.qfrc_bias[:7]
-
         reward_dist = 0.0
         angular_vel = 0.0
-        # if self._steps >= self.steps_before_reward:
-        #     vec = self.get_body_com("fingertip") - self.get_body_com("target")
-        #     reward_dist -= self.reward_weight * np.linalg.norm(vec)
-        #     angular_vel -= np.linalg.norm(self.sim.data.qvel.flat[:self.n_links])
         reward_ctrl = - np.square(a).sum()
-        # reward_balance = - self.balance_weight * np.abs(
-        #     angle_normalize(np.sum(self.sim.data.qpos.flat[:self.n_links]), type="rad"))
-        #
-        # reward = reward_dist + reward_ctrl + angular_vel + reward_balance
-        # self.do_simulation(a, self.frame_skip)
-        crash = self.do_simulation(a, self.frame_skip)
+        crash = self.do_simulation(a)
+        joint_cons_viol = self.check_traj_in_joint_limits()
 
         self._q_pos.append(self.sim.data.qpos[0:7].ravel().copy())
+        self._q_vel.append(self.sim.data.qvel[0:7].ravel().copy())
 
         ob = self._get_obs()
 
-        if not crash:
-            reward, success, collision = self.reward_function.compute_reward(a, self.sim, self._steps)
-            done = success or self._steps == self.sim_steps - 1 or collision
+        if not crash and not joint_cons_viol:
+            reward, success, stop_sim = self.reward_function.compute_reward(a, self.sim, self._steps)
+            done = success or self._steps == self.sim_steps - 1 or stop_sim
             self._steps += 1
         else:
            reward = -1000
+            success = False
            done = True
 
-        return ob, reward, done, dict(reward_dist=reward_dist, reward_ctrl=reward_ctrl,
-                                      velocity=angular_vel, # reward_balance=reward_balance,
-                                      # end_effector=self.get_body_com("fingertip").copy(),
-                                      goal=self.goal if hasattr(self, "goal") else None,
-                                      traj=self._q_pos)
+        return ob, reward, done, dict(reward_dist=reward_dist,
+                                      reward_ctrl=reward_ctrl,
+                                      velocity=angular_vel,
+                                      traj=self._q_pos, is_success=success,
+                                      is_collided=crash or joint_cons_viol)
+
+    def check_traj_in_joint_limits(self):
+        return any(self.current_pos > self.j_max) or any(self.current_pos < self.j_min)
 
     def _get_obs(self):
         theta = self.sim.data.qpos.flat[:7]
@@ -103,13 +112,15 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
 
 if __name__ == "__main__":
     env = ALRBallInACupEnv()
-    env.configure(None)
+    ctxt = np.array([-0.20869846, -0.66376693, 1.18088501])
+
+    env.configure(ctxt)
     env.reset()
+    env.render()
     for i in range(2000):
-        # objective.load_result("/tmp/cma")
         # test with random actions
-        # ac = 0.0 * env.action_space.sample()
-        ac = env.start_pos
+        ac = 0.01 * env.action_space.sample()[0:7]
+        # ac = env.start_pos
         # ac[0] += np.pi/2
         obs, rew, d, info = env.step(ac)
         env.render()
diff --git a/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup_reward.py b/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup_reward.py
index 76a4ec6..525fe34 100644
--- a/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup_reward.py
+++ b/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup_reward.py
@@ -23,6 +23,7 @@ class BallInACupReward(alr_reward_fct.AlrReward):
 
         self.ball_traj = None
         self.dists = None
+        self.dists_ctxt = None
         self.dists_final = None
         self.costs = None
 
@@ -31,11 +32,12 @@ class BallInACupReward(alr_reward_fct.AlrReward):
     def reset(self, context):
         self.ball_traj = np.zeros(shape=(self.sim_time, 3))
         self.dists = []
+        self.dists_ctxt = []
         self.dists_final = []
         self.costs = []
         self.context = context
 
-    def compute_reward(self, action, sim, step, context=None):
+    def compute_reward(self, action, sim, step):
         self.ball_id = sim.model._body_name2id["ball"]
         self.ball_collision_id = sim.model._geom_name2id["ball_geom"]
         self.goal_id = sim.model._site_name2id["cup_goal"]
@@ -49,27 +51,42 @@ class BallInACupReward(alr_reward_fct.AlrReward):
         ball_pos = sim.data.body_xpos[self.ball_id]
         goal_final_pos = sim.data.site_xpos[self.goal_final_id]
         self.dists.append(np.linalg.norm(goal_pos - ball_pos))
+        self.dists_ctxt.append(np.linalg.norm(ball_pos - self.context))
         self.dists_final.append(np.linalg.norm(goal_final_pos - ball_pos))
         self.ball_traj[step, :] = ball_pos
 
         action_cost = np.sum(np.square(action))
 
+        stop_sim = False
+        success = False
+
         if self.check_collision(sim):
             reward = - 1e-5 * action_cost - 1000
-            return reward, False, True
+            stop_sim = True
+            return reward, success, stop_sim
 
-        if step == self.sim_time - 1:
+        if ball_in_cup or step == self.sim_time - 1:
             min_dist = np.min(self.dists)
             dist_final = self.dists_final[-1]
+            dist_ctxt = self.dists_ctxt[-1]
 
-            cost = 0.5 * min_dist + 0.5 * dist_final
-            reward = np.exp(-2 * cost) - 1e-5 * action_cost
+            cost = self._get_stage_wise_cost(ball_in_cup, min_dist, dist_final, dist_ctxt)
+            reward = np.exp(-1 * cost) - 1e-5 * action_cost
+            stop_sim = True
             success = dist_final < 0.05 and ball_in_cup
         else:
             reward = - 1e-5 * action_cost
             success = False
 
-        return reward, success, False
+        return reward, success, stop_sim
+
+    def _get_stage_wise_cost(self, ball_in_cup, min_dist, dist_final, dist_to_ctxt):
+        if not ball_in_cup:
+            cost = 3 + 2 * (0.5 * min_dist ** 2 + 0.5 * dist_final ** 2)
+        else:
+            cost = 2 * dist_to_ctxt ** 2
+            print('Context Distance:', dist_to_ctxt)
+        return cost
 
     def check_ball_in_cup(self, sim, ball_collision_id):
         cup_base_collision_id = sim.model._geom_name2id["cup_base_contact"]
diff --git a/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup_simple.py b/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup_simple.py
index ab7e332..2356d9e 100644
--- a/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup_simple.py
+++ b/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup_simple.py
@@ -88,12 +88,12 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
             self._steps += 1
         else:
             reward = -1000
+            success = False
             done = True
 
-        return ob, reward, done, dict(reward_dist=reward_dist, reward_ctrl=reward_ctrl,
-                                      velocity=angular_vel, # reward_balance=reward_balance,
-                                      # end_effector=self.get_body_com("fingertip").copy(),
-                                      goal=self.goal if hasattr(self, "goal") else None,
-                                      traj=self._q_pos,
+        return ob, reward, done, dict(reward_dist=reward_dist,
+                                      reward_ctrl=reward_ctrl,
+                                      velocity=angular_vel,
+                                      traj=self._q_pos, is_success=success,
                                       is_collided=crash or joint_cons_viol)
 
     def check_traj_in_joint_limits(self):
diff --git a/alr_envs/mujoco/ball_in_a_cup/utils.py b/alr_envs/mujoco/ball_in_a_cup/utils.py
index 27df36e..8e94670 100644
--- a/alr_envs/mujoco/ball_in_a_cup/utils.py
+++ b/alr_envs/mujoco/ball_in_a_cup/utils.py
@@ -3,7 +3,39 @@ from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup import ALRBallInACupEnv
 from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_simple import ALRBallInACupEnv as ALRBallInACupEnvSimple
 
 
-# TODO: add make_env for standard biac
+def make_env(rank, seed=0):
+    """
+    Utility function for multiprocessed env.
+
+    :param env_id: (str) the environment ID
+    :param num_env: (int) the number of environments you wish to have in subprocesses
+    :param seed: (int) the initial seed for RNG
+    :param rank: (int) index of the subprocess
+    :returns a function that generates an environment
+    """
+
+    def _init():
+        env = ALRBallInACupEnv()
+
+        env = DetPMPEnvWrapper(env,
+                               num_dof=7,
+                               num_basis=5,
+                               width=0.005,
+                               policy_type="motor",
+                               start_pos=env.start_pos,
+                               duration=3.5,
+                               post_traj_time=4.5,
+                               dt=env.dt,
+                               weights_scale=0.1,
+                               zero_start=True,
+                               zero_goal=False
+                               )
+
+        env.seed(seed + rank)
+        return env
+
+    return _init
+
 
 def make_simple_env(rank, seed=0):
@@ -30,7 +62,8 @@ def make_simple_env(rank, seed=0):
                                post_traj_time=4.5,
                                dt=env.dt,
                                weights_scale=0.1,
-                               zero_centered=True
+                               zero_start=True,
+                               zero_goal=True
                                )
 
         env.seed(seed + rank)
diff --git a/alr_envs/utils/detpmp_env_wrapper.py b/alr_envs/utils/detpmp_env_wrapper.py
index 8d80c0f..3e72c06 100644
--- a/alr_envs/utils/detpmp_env_wrapper.py
+++ b/alr_envs/utils/detpmp_env_wrapper.py
@@ -16,13 +16,15 @@ class DetPMPEnvWrapper(gym.Wrapper):
                  post_traj_time=0.,
                  policy_type=None,
                  weights_scale=1,
-                 zero_centered=False):
+                 zero_start=False,
+                 zero_goal=False,
+                 ):
         super(DetPMPEnvWrapper, self).__init__(env)
         self.num_dof = num_dof
         self.num_basis = num_basis
         self.dim = num_dof * num_basis
         self.pmp = det_promp.DeterministicProMP(n_basis=num_basis, n_dof=num_dof, width=width, off=0.01,
-                                                zero_centered=zero_centered)
+                                                zero_start=zero_start, zero_goal=zero_goal)
         weights = np.zeros(shape=(num_basis, num_dof))
         self.pmp.set_weights(duration, weights)
         self.weights_scale = weights_scale
@@ -32,7 +34,7 @@ class DetPMPEnvWrapper(gym.Wrapper):
         self.post_traj_steps = int(post_traj_time / dt)
 
         self.start_pos = start_pos
-        self.zero_centered = zero_centered
+        self.zero_centered = zero_start
 
         policy_class = get_policy_class(policy_type)
         self.policy = policy_class(env)
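Below is a minimal, hypothetical usage sketch for the new make_env factory added in alr_envs/mujoco/ball_in_a_cup/utils.py. It is not part of the patch; it assumes that DetPMPEnvWrapper.step consumes a flat ProMP weight vector of length env.dim = num_dof * num_basis (suggested by self.dim and weights_scale) and rolls out one full episode per call.

import numpy as np
from alr_envs.mujoco.ball_in_a_cup.utils import make_env

# Build a few environment factories, e.g. for a multiprocessed / vectorized setup.
env_fns = [make_env(rank=i, seed=0) for i in range(4)]
env = env_fns[0]()  # instantiate one DetPMP-wrapped BallInACup environment

env.reset()
# Assumption: one step = one episode rollout parameterized by 7 dofs * 5 basis = 35 weights.
params = 0.1 * np.random.randn(env.dim)
obs, reward, done, info = env.step(params)
print(reward, info.get("is_success"), info.get("is_collided"))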