diff --git a/alr_envs/__init__.py b/alr_envs/__init__.py
index 2a8054c..986265c 100644
--- a/alr_envs/__init__.py
+++ b/alr_envs/__init__.py
@@ -280,7 +280,7 @@ for v in versions:
             "num_dof": 5,
             "num_basis": 5,
             "duration": 2,
-            "width": 0.01,
+            "width": 0.025,
             "policy_type": "velocity",
             "weights_scale": 0.2,
             "zero_start": True
@@ -352,7 +352,8 @@ register(
         "num_basis": 5,
         "duration": 3.5,
         "post_traj_time": 4.5,
-        "width": 0.005,
+        "width": 0.0035,
+        # "off": -0.05,
         "policy_type": "motor",
         "weights_scale": 0.2,
         "zero_start": True,
diff --git a/alr_envs/classic_control/hole_reacher.py b/alr_envs/classic_control/hole_reacher.py
index 1880ce3..730e7bf 100644
--- a/alr_envs/classic_control/hole_reacher.py
+++ b/alr_envs/classic_control/hole_reacher.py
@@ -73,12 +73,13 @@ class HoleReacherEnv(AlrEnv):
         acc = (action - self._angle_velocity) / self.dt
         self._angle_velocity = action
-        self._joint_angles = self._joint_angles + self.dt * self._angle_velocity
+        self._joint_angles = self._joint_angles + self.dt * self._angle_velocity  # + 0.001 * np.random.randn(5)
         self._update_joints()

         reward, info = self._get_reward(acc)
         info.update({"is_collided": self._is_collided})
+        self.end_effector_traj.append(np.copy(self.end_effector))

         self._steps += 1
         done = self._is_collided
@@ -101,6 +102,7 @@ class HoleReacherEnv(AlrEnv):
         self._joints = np.zeros((self.n_links + 1, 2))
         self._update_joints()
         self._steps = 0
+        self.end_effector_traj = []

         return self._get_obs().copy()
diff --git a/alr_envs/mujoco/ball_in_a_cup/assets/biac_base.xml b/alr_envs/mujoco/ball_in_a_cup/assets/biac_base.xml
index 58f0ac6..9229ad5 100644
--- a/alr_envs/mujoco/ball_in_a_cup/assets/biac_base.xml
+++ b/alr_envs/mujoco/ball_in_a_cup/assets/biac_base.xml
@@ -126,6 +126,7 @@
+
diff --git a/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup.py b/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup.py
index 613b631..345b3ce 100644
--- a/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup.py
+++ b/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup.py
@@ -90,26 +90,27 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
         reward_ctrl = - np.square(a).sum()

         crash = self.do_simulation(a)
-        joint_cons_viol = self.check_traj_in_joint_limits()
+        # joint_cons_viol = self.check_traj_in_joint_limits()
         self._q_pos.append(self.sim.data.qpos[0:7].ravel().copy())
         self._q_vel.append(self.sim.data.qvel[0:7].ravel().copy())

         ob = self._get_obs()

-        if not crash and not joint_cons_viol:
-            reward, success, stop_sim = self.reward_function.compute_reward(a, self.sim, self._steps)
-            done = success or self._steps == self.sim_steps - 1 or stop_sim
+        if not crash:
+            reward, success, is_collided = self.reward_function.compute_reward(a, self)
+            done = success or self._steps == self.sim_steps - 1 or is_collided
             self._steps += 1
         else:
-            reward = -1000
+            reward = -2
             success = False
+            is_collided = False
             done = True

         return ob, reward, done, dict(reward_dist=reward_dist,
                                       reward_ctrl=reward_ctrl,
                                       velocity=angular_vel,
                                       traj=self._q_pos, is_success=success,
-                                      is_collided=crash or joint_cons_viol)
+                                      is_collided=is_collided, sim_crash=crash)

     def check_traj_in_joint_limits(self):
         return any(self.current_pos > self.j_max) or any(self.current_pos < self.j_min)
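Note on the `ball_in_a_cup.py` hunk above: `compute_reward` now receives the whole env instead of `(sim, step)`, collisions are reported through the third return value rather than terminating with a `-1000` reward, and the info dict separates `is_collided` from `sim_crash`. A minimal sketch of that contract, with `DummyEnv`/`DummyReward` as hypothetical stand-ins for the MuJoCo pieces (not part of the diff):

```python
class DummyReward:
    """Hypothetical stand-in for BallInACupReward: reads what it needs off the env."""
    def compute_reward(self, action, env):
        is_collided = env.check_traj_in_joint_limits()   # collisions are flagged, not -1000'd
        reward = -1e-5 * sum(a * a for a in action)      # running action cost only
        success = False
        return reward, success, is_collided


class DummyEnv:
    """Hypothetical stand-in for ALRBallInACupEnv, showing only the reward plumbing."""
    sim_steps = 3

    def __init__(self):
        self._steps = 0
        self.reward_function = DummyReward()

    def check_traj_in_joint_limits(self):
        return False

    def step(self, action):
        crash = False                                    # stands in for do_simulation()
        if not crash:
            reward, success, is_collided = self.reward_function.compute_reward(action, self)
            done = success or self._steps == self.sim_steps - 1 or is_collided
            self._steps += 1
        else:
            reward, success, is_collided, done = -2, False, False, True
        return None, reward, done, dict(is_success=success, is_collided=is_collided, sim_crash=crash)


env = DummyEnv()
done = False
while not done:
    _, reward, done, info = env.step([0.0, 0.1])
print(info)  # {'is_success': False, 'is_collided': False, 'sim_crash': False}
```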
diff --git a/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup_reward_simple.py b/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup_reward_simple.py
index 13053eb..79987d6 100644
--- a/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup_reward_simple.py
+++ b/alr_envs/mujoco/ball_in_a_cup/ball_in_a_cup_reward_simple.py
@@ -6,7 +6,8 @@ class BallInACupReward(alr_reward_fct.AlrReward):
     def __init__(self, sim_time):
         self.sim_time = sim_time

-        self.collision_objects = ["cup_geom1", "cup_geom2", "wrist_palm_link_convex_geom",
+        self.collision_objects = ["cup_geom1", "cup_geom2", "cup_base_contact_below",
+                                  "wrist_palm_link_convex_geom",
                                   "wrist_pitch_link_convex_decomposition_p1_geom",
                                   "wrist_pitch_link_convex_decomposition_p2_geom",
                                   "wrist_pitch_link_convex_decomposition_p3_geom",
@@ -20,6 +21,8 @@ class BallInACupReward(alr_reward_fct.AlrReward):
         self.goal_id = None
         self.goal_final_id = None
         self.collision_ids = None
+        self._is_collided = False
+        self.collision_penalty = 1

         self.ball_traj = None
         self.dists = None
@@ -36,49 +39,52 @@ class BallInACupReward(alr_reward_fct.AlrReward):
         self.action_costs = []
         self.cup_angles = []

-    def compute_reward(self, action, sim, step, context=None):
-        self.ball_id = sim.model._body_name2id["ball"]
-        self.ball_collision_id = sim.model._geom_name2id["ball_geom"]
-        self.goal_id = sim.model._site_name2id["cup_goal"]
-        self.goal_final_id = sim.model._site_name2id["cup_goal_final"]
-        self.collision_ids = [sim.model._geom_name2id[name] for name in self.collision_objects]
+    def compute_reward(self, action, env):
+        self.ball_id = env.sim.model._body_name2id["ball"]
+        self.ball_collision_id = env.sim.model._geom_name2id["ball_geom"]
+        self.goal_id = env.sim.model._site_name2id["cup_goal"]
+        self.goal_final_id = env.sim.model._site_name2id["cup_goal_final"]
+        self.collision_ids = [env.sim.model._geom_name2id[name] for name in self.collision_objects]

-        ball_in_cup = self.check_ball_in_cup(sim, self.ball_collision_id)
+        ball_in_cup = self.check_ball_in_cup(env.sim, self.ball_collision_id)

         # Compute the current distance from the ball to the inner part of the cup
-        goal_pos = sim.data.site_xpos[self.goal_id]
-        ball_pos = sim.data.body_xpos[self.ball_id]
-        goal_final_pos = sim.data.site_xpos[self.goal_final_id]
+        goal_pos = env.sim.data.site_xpos[self.goal_id]
+        ball_pos = env.sim.data.body_xpos[self.ball_id]
+        goal_final_pos = env.sim.data.site_xpos[self.goal_final_id]
         self.dists.append(np.linalg.norm(goal_pos - ball_pos))
         self.dists_final.append(np.linalg.norm(goal_final_pos - ball_pos))
-        self.ball_traj[step, :] = ball_pos
-        cup_quat = np.copy(sim.data.body_xquat[sim.model._body_name2id["cup"]])
+        self.ball_traj[env._steps, :] = ball_pos
+        cup_quat = np.copy(env.sim.data.body_xquat[env.sim.model._body_name2id["cup"]])
         self.cup_angles.append(np.arctan2(2 * (cup_quat[0] * cup_quat[1] + cup_quat[2] * cup_quat[3]),
                                           1 - 2 * (cup_quat[1]**2 + cup_quat[2]**2)))

         action_cost = np.sum(np.square(action))
         self.action_costs.append(action_cost)

-        if self.check_collision(sim):
-            reward = - 1000
-            return reward, False, True
+        self._is_collided = self.check_collision(env.sim) or env.check_traj_in_joint_limits()

-        if step == self.sim_time - 1:
+        if env._steps == env.sim_steps - 1 or self._is_collided:
             t_min_dist = np.argmin(self.dists)
             angle_min_dist = self.cup_angles[t_min_dist]
             cost_angle = (angle_min_dist - np.pi / 2)**2

             min_dist = self.dists[t_min_dist]
             dist_final = self.dists_final[-1]
+            min_dist_final = np.min(self.dists_final)

-            cost = 0.5 * min_dist + 0.5 * dist_final + 0.01 * cost_angle
-            reward = np.exp(-2 * cost) - 1e-3 * action_cost
-            success = dist_final < 0.05 and ball_in_cup
+            cost = 0.5 * dist_final + 0.05 * cost_angle  # TODO: Increase cost_angle weight  # 0.5 * min_dist +
+            # reward = np.exp(-2 * cost) - 1e-2 * action_cost - self.collision_penalty * int(self._is_collided)
+            # reward = - dist_final**2 - 1e-4 * cost_angle - 1e-5 * action_cost - self.collision_penalty * int(self._is_collided)
+            reward = - dist_final**2 - min_dist_final**2 - 1e-4 * cost_angle - 1e-5 * action_cost - self.collision_penalty * int(self._is_collided)
+            success = dist_final < 0.05 and ball_in_cup and not self._is_collided
+            crash = self._is_collided
         else:
-            reward = - 1e-3 * action_cost
+            reward = - 1e-5 * action_cost  # TODO: increase action_cost weight
             success = False
+            crash = False

-        return reward, success, False
+        return reward, success, crash

     def check_ball_in_cup(self, sim, ball_collision_id):
         cup_base_collision_id = sim.model._geom_name2id["cup_base_contact"]
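Note on the reward hunk above: the episode-end reward switches from the previous `np.exp(-2 * cost)` shaping to a purely distance-based penalty, and collisions are folded in via `collision_penalty` instead of an early `-1000` return. A minimal standalone NumPy sketch of that arithmetic; all numbers are made-up placeholders for illustration:

```python
import numpy as np

# Hypothetical per-step logs, as accumulated by BallInACupReward over a rollout.
dists = np.array([0.60, 0.35, 0.12, 0.08])        # ball -> cup_goal distance per step
dists_final = np.array([0.70, 0.40, 0.15, 0.06])  # ball -> cup_goal_final distance per step
cup_angles = np.array([0.3, 0.9, 1.4, 1.5])       # cup tilt per step (rad), from the quaternion via arctan2
action_cost = 0.02                                 # sum of squared actions at the final step
is_collided = False
collision_penalty = 1

# Episode-end terms, mirroring the updated compute_reward branch.
t_min_dist = np.argmin(dists)
cost_angle = (cup_angles[t_min_dist] - np.pi / 2) ** 2
dist_final = dists_final[-1]
min_dist_final = np.min(dists_final)

reward = (- dist_final ** 2 - min_dist_final ** 2 - 1e-4 * cost_angle
          - 1e-5 * action_cost - collision_penalty * int(is_collided))
print(reward)  # ~ -0.0072 for the numbers above; -1.0072 if is_collided were True
```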
diff --git a/alr_envs/utils/mps/detpmp_wrapper.py b/alr_envs/utils/mps/detpmp_wrapper.py
index 98060e2..8661781 100644
--- a/alr_envs/utils/mps/detpmp_wrapper.py
+++ b/alr_envs/utils/mps/detpmp_wrapper.py
@@ -12,16 +12,16 @@ class DetPMPWrapper(MPWrapper):
                  zero_start: bool = False, zero_goal: bool = False, **mp_kwargs):
         self.duration = duration  # seconds

+        dt = env.dt if hasattr(env, "dt") else dt
+        assert dt is not None
+        self.dt = dt
+
         super().__init__(env, num_dof, dt, duration, post_traj_time, policy_type, weights_scale,
                          num_basis=num_basis, width=width, zero_start=zero_start, zero_goal=zero_goal,
                          **mp_kwargs)

-        self.dt = env.dt if hasattr(env, "dt") else dt
-        assert self.dt is not None
-
         action_bounds = np.inf * np.ones((self.mp.n_basis * self.mp.n_dof))
         self.action_space = gym.spaces.Box(low=-action_bounds, high=action_bounds, dtype=np.float32)
-
     def initialize_mp(self, num_dof: int, duration: int, dt: float, num_basis: int = 5, width: float = None, off: float = 0.01,
                       zero_start: bool = False, zero_goal: bool = False):
         pmp = det_promp.DeterministicProMP(n_basis=num_basis, n_dof=num_dof, width=width, off=off,
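Note on the `detpmp_wrapper.py` hunk above: `dt` is now resolved from the wrapped env (falling back to the explicit argument) *before* `super().__init__` runs, so the base `MPWrapper` already sees the effective value rather than the raw argument. A minimal standalone sketch of that ordering; `Base`, `Wrapper`, and `FakeEnv` are hypothetical stand-ins, not the library classes:

```python
class Base:
    def __init__(self, env, dt):
        # The base class needs the effective dt, e.g. to discretize the trajectory.
        self.n_steps = int(round(1.0 / dt))


class Wrapper(Base):
    def __init__(self, env, dt=None):
        # Resolve dt *before* delegating, mirroring the reordered constructor above.
        dt = env.dt if hasattr(env, "dt") else dt
        assert dt is not None
        self.dt = dt
        super().__init__(env, dt)


class FakeEnv:
    dt = 0.02


print(Wrapper(FakeEnv()).n_steps)          # 50: env.dt takes precedence over the (missing) argument
print(Wrapper(object(), dt=0.01).n_steps)  # 100: falls back to the explicit dt argument
```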