biac updates

parent 0916daf3b5
commit 7eef78d620
@@ -327,7 +327,7 @@
                 </body>
             </body>
         </body>
-        <site name="context_point" pos="0 0.05 0.1165" euler="-1.57 0 0" size="0.015" rgba="1 0 0 0.6" type="sphere"/>
+        <site name="context_point" pos="-0.20869846 -0.66376693 1.18088501" euler="-1.57 0 0" size="0.015" rgba="1 0 0 0.6" type="sphere"/>
         <site name="context_point1" pos="-0.5 -0.85 0.8165" euler="-1.57 0 0" size="0.015" rgba="0 1 0 0.3" type="sphere"/>
         <site name="context_point2" pos="-0.5 -0.85 1.4165" euler="-1.57 0 0" size="0.015" rgba="0 1 0 0.3" type="sphere"/>
         <site name="context_point3" pos="-0.5 -0.35 0.8165" euler="-1.57 0 0" size="0.015" rgba="0 1 0 0.3" type="sphere"/>
@@ -41,54 +41,63 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
         reward_function = BallInACupReward
         self.reward_function = reward_function(self.sim_steps)
 
+    @property
+    def current_pos(self):
+        return self.sim.data.qpos[0:7].copy()
+
+    @property
+    def current_vel(self):
+        return self.sim.data.qvel[0:7].copy()
+
     def configure(self, context):
         self.context = context
         self.reward_function.reset(context)
 
     def reset_model(self):
-        start_pos = self.init_qpos.copy()
-        start_pos[0:7] = self.start_pos
-        start_vel = np.zeros_like(start_pos)
-        self.set_state(start_pos, start_vel)
+        init_pos_all = self.init_qpos.copy()
+        init_pos_robot = self.start_pos
+        init_vel = np.zeros_like(init_pos_all)
         self._steps = 0
         self._q_pos = []
+        self._q_vel = []
+
+        start_pos = init_pos_all
+        start_pos[0:7] = init_pos_robot
+
+        self.set_state(start_pos, init_vel)
+
+        return self._get_obs()
 
     def step(self, a):
-        # Apply gravity compensation
-        if not np.all(self.sim.data.qfrc_applied[:7] == self.sim.data.qfrc_bias[:7]):
-            self.sim.data.qfrc_applied[:7] = self.sim.data.qfrc_bias[:7]
 
         reward_dist = 0.0
         angular_vel = 0.0
-        # if self._steps >= self.steps_before_reward:
-        #     vec = self.get_body_com("fingertip") - self.get_body_com("target")
-        #     reward_dist -= self.reward_weight * np.linalg.norm(vec)
-        #     angular_vel -= np.linalg.norm(self.sim.data.qvel.flat[:self.n_links])
         reward_ctrl = - np.square(a).sum()
-        # reward_balance = - self.balance_weight * np.abs(
-        #     angle_normalize(np.sum(self.sim.data.qpos.flat[:self.n_links]), type="rad"))
-        #
-        # reward = reward_dist + reward_ctrl + angular_vel + reward_balance
-        # self.do_simulation(a, self.frame_skip)
 
-        crash = self.do_simulation(a, self.frame_skip)
+        crash = self.do_simulation(a)
+        joint_cons_viol = self.check_traj_in_joint_limits()
 
         self._q_pos.append(self.sim.data.qpos[0:7].ravel().copy())
+        self._q_vel.append(self.sim.data.qvel[0:7].ravel().copy())
 
         ob = self._get_obs()
 
-        if not crash:
-            reward, success, collision = self.reward_function.compute_reward(a, self.sim, self._steps)
-            done = success or self._steps == self.sim_steps - 1 or collision
+        if not crash and not joint_cons_viol:
+            reward, success, stop_sim = self.reward_function.compute_reward(a, self.sim, self._steps)
+            done = success or self._steps == self.sim_steps - 1 or stop_sim
             self._steps += 1
         else:
             reward = -1000
+            success = False
             done = True
-        return ob, reward, done, dict(reward_dist=reward_dist, reward_ctrl=reward_ctrl,
-                                      velocity=angular_vel, # reward_balance=reward_balance,
-                                      # end_effector=self.get_body_com("fingertip").copy(),
-                                      goal=self.goal if hasattr(self, "goal") else None,
-                                      traj=self._q_pos)
+        return ob, reward, done, dict(reward_dist=reward_dist,
+                                      reward_ctrl=reward_ctrl,
+                                      velocity=angular_vel,
+                                      traj=self._q_pos, is_success=success,
+                                      is_collided=crash or joint_cons_viol)
 
+    def check_traj_in_joint_limits(self):
+        return any(self.current_pos > self.j_max) or any(self.current_pos < self.j_min)
+
     def _get_obs(self):
         theta = self.sim.data.qpos.flat[:7]
@@ -103,13 +112,15 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
 
 if __name__ == "__main__":
     env = ALRBallInACupEnv()
-    env.configure(None)
+    ctxt = np.array([-0.20869846, -0.66376693, 1.18088501])
+
+    env.configure(ctxt)
     env.reset()
+    env.render()
     for i in range(2000):
-        # objective.load_result("/tmp/cma")
         # test with random actions
-        # ac = 0.0 * env.action_space.sample()
-        ac = env.start_pos
+        ac = 0.01 * env.action_space.sample()[0:7]
+        # ac = env.start_pos
         # ac[0] += np.pi/2
         obs, rew, d, info = env.step(ac)
         env.render()
@@ -23,6 +23,7 @@ class BallInACupReward(alr_reward_fct.AlrReward):
 
         self.ball_traj = None
         self.dists = None
+        self.dists_ctxt = None
         self.dists_final = None
         self.costs = None
 
@@ -31,11 +32,12 @@ class BallInACupReward(alr_reward_fct.AlrReward):
     def reset(self, context):
         self.ball_traj = np.zeros(shape=(self.sim_time, 3))
         self.dists = []
+        self.dists_ctxt = []
         self.dists_final = []
         self.costs = []
         self.context = context
 
-    def compute_reward(self, action, sim, step, context=None):
+    def compute_reward(self, action, sim, step):
         self.ball_id = sim.model._body_name2id["ball"]
         self.ball_collision_id = sim.model._geom_name2id["ball_geom"]
         self.goal_id = sim.model._site_name2id["cup_goal"]
@@ -49,27 +51,42 @@ class BallInACupReward(alr_reward_fct.AlrReward):
         ball_pos = sim.data.body_xpos[self.ball_id]
         goal_final_pos = sim.data.site_xpos[self.goal_final_id]
         self.dists.append(np.linalg.norm(goal_pos - ball_pos))
+        self.dists_ctxt.append(np.linalg.norm(ball_pos - self.context))
         self.dists_final.append(np.linalg.norm(goal_final_pos - ball_pos))
         self.ball_traj[step, :] = ball_pos
 
         action_cost = np.sum(np.square(action))
 
+        stop_sim = False
+        success = False
+
         if self.check_collision(sim):
             reward = - 1e-5 * action_cost - 1000
-            return reward, False, True
+            stop_sim = True
+            return reward, success, stop_sim
 
-        if step == self.sim_time - 1:
+        if ball_in_cup or step == self.sim_time - 1:
             min_dist = np.min(self.dists)
             dist_final = self.dists_final[-1]
+            dist_ctxt = self.dists_ctxt[-1]
 
-            cost = 0.5 * min_dist + 0.5 * dist_final
-            reward = np.exp(-2 * cost) - 1e-5 * action_cost
+            cost = self._get_stage_wise_cost(ball_in_cup, min_dist, dist_final, dist_ctxt)
+            reward = np.exp(-1 * cost) - 1e-5 * action_cost
+            stop_sim = True
             success = dist_final < 0.05 and ball_in_cup
         else:
             reward = - 1e-5 * action_cost
             success = False
 
-        return reward, success, False
+        return reward, success, stop_sim
 
+    def _get_stage_wise_cost(self, ball_in_cup, min_dist, dist_final, dist_to_ctxt):
+        if not ball_in_cup:
+            cost = 3 + 2*(0.5 * min_dist**2 + 0.5 * dist_final**2)
+        else:
+            cost = 2 * dist_to_ctxt ** 2
+            print('Context Distance:', dist_to_ctxt)
+        return cost
+
     def check_ball_in_cup(self, sim, ball_collision_id):
         cup_base_collision_id = sim.model._geom_name2id["cup_base_contact"]
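For reference, a minimal standalone sketch of the stage-wise reward used in the hunk above, mirroring _get_stage_wise_cost and the final-step reward in compute_reward (the numeric inputs below are illustrative, not taken from the diff):

import numpy as np

def stage_wise_cost(ball_in_cup, min_dist, dist_final, dist_to_ctxt):
    # Before the ball lands in the cup, the cost penalizes distance to the cup goal;
    # once it is in, only the distance to the desired context position matters.
    if not ball_in_cup:
        return 3 + 2 * (0.5 * min_dist ** 2 + 0.5 * dist_final ** 2)
    return 2 * dist_to_ctxt ** 2

# Final-step reward, as in compute_reward above: exp(-cost) minus a small action penalty.
cost = stage_wise_cost(ball_in_cup=True, min_dist=0.1, dist_final=0.02, dist_to_ctxt=0.05)
reward = np.exp(-1 * cost) - 1e-5 * 0.0  # roughly 0.995 for a successful, contextual catch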
@@ -88,12 +88,12 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
             self._steps += 1
         else:
             reward = -1000
+            success = False
             done = True
-        return ob, reward, done, dict(reward_dist=reward_dist, reward_ctrl=reward_ctrl,
-                                      velocity=angular_vel, # reward_balance=reward_balance,
-                                      # end_effector=self.get_body_com("fingertip").copy(),
-                                      goal=self.goal if hasattr(self, "goal") else None,
-                                      traj=self._q_pos,
+        return ob, reward, done, dict(reward_dist=reward_dist,
+                                      reward_ctrl=reward_ctrl,
+                                      velocity=angular_vel,
+                                      traj=self._q_pos, is_success=success,
                                       is_collided=crash or joint_cons_viol)
 
     def check_traj_in_joint_limits(self):
@@ -3,7 +3,39 @@ from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup import ALRBallInACupEnv
 from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_simple import ALRBallInACupEnv as ALRBallInACupEnvSimple
 
 
-# TODO: add make_env for standard biac
+def make_env(rank, seed=0):
+    """
+    Utility function for multiprocessed env.
+
+    :param env_id: (str) the environment ID
+    :param num_env: (int) the number of environments you wish to have in subprocesses
+    :param seed: (int) the initial seed for RNG
+    :param rank: (int) index of the subprocess
+    :returns a function that generates an environment
+    """
+
+    def _init():
+        env = ALRBallInACupEnv()
+
+        env = DetPMPEnvWrapper(env,
+                               num_dof=7,
+                               num_basis=5,
+                               width=0.005,
+                               policy_type="motor",
+                               start_pos=env.start_pos,
+                               duration=3.5,
+                               post_traj_time=4.5,
+                               dt=env.dt,
+                               weights_scale=0.1,
+                               zero_start=True,
+                               zero_goal=False
+                               )
+
+        env.seed(seed + rank)
+        return env
+
+    return _init
+
 
 
 def make_simple_env(rank, seed=0):
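A hypothetical usage sketch for the make_env factory added above; the loop and the SubprocVecEnv mention are illustrative assumptions, not part of this commit:

# Each call to make_env(rank, seed) returns a thunk; calling the thunk builds
# and seeds an independent DetPMP-wrapped environment.
env_fns = [make_env(rank=i, seed=0) for i in range(4)]
envs = [fn() for fn in env_fns]   # instantiate the wrapped envs in this process
# With stable-baselines-style tooling, env_fns could instead be handed to
# SubprocVecEnv(env_fns) to run each environment in its own subprocess (assumption).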
@@ -30,7 +62,8 @@ def make_simple_env(rank, seed=0):
                               post_traj_time=4.5,
                               dt=env.dt,
                               weights_scale=0.1,
-                              zero_centered=True
+                              zero_start=True,
+                              zero_goal=True
                               )
 
        env.seed(seed + rank)
@@ -16,13 +16,15 @@ class DetPMPEnvWrapper(gym.Wrapper):
                 post_traj_time=0.,
                 policy_type=None,
                 weights_scale=1,
-                 zero_centered=False):
+                 zero_start=False,
+                 zero_goal=False,
+                 ):
        super(DetPMPEnvWrapper, self).__init__(env)
        self.num_dof = num_dof
        self.num_basis = num_basis
        self.dim = num_dof * num_basis
        self.pmp = det_promp.DeterministicProMP(n_basis=num_basis, n_dof=num_dof, width=width, off=0.01,
-                                                zero_centered=zero_centered)
+                                                zero_start=zero_start, zero_goal=zero_goal)
        weights = np.zeros(shape=(num_basis, num_dof))
        self.pmp.set_weights(duration, weights)
        self.weights_scale = weights_scale
@@ -32,7 +34,7 @@ class DetPMPEnvWrapper(gym.Wrapper):
        self.post_traj_steps = int(post_traj_time / dt)
 
        self.start_pos = start_pos
-        self.zero_centered = zero_centered
+        self.zero_centered = zero_start
 
        policy_class = get_policy_class(policy_type)
        self.policy = policy_class(env)