updates
This commit is contained in:
parent
e482fc09f0
commit
db001c411f
@ -71,6 +71,24 @@ register(
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
register(
|
||||||
|
id='ALRBallInACupSimple-v0',
|
||||||
|
entry_point='alr_envs.mujoco:ALRBallInACupEnv',
|
||||||
|
max_episode_steps=4000,
|
||||||
|
kwargs={
|
||||||
|
"reward_type": "simple"
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
register(
|
||||||
|
id='ALRBallInACupGoal-v0',
|
||||||
|
entry_point='alr_envs.mujoco:ALRBallInACupEnv',
|
||||||
|
max_episode_steps=4000,
|
||||||
|
kwargs={
|
||||||
|
"reward_type": "contextual_goal"
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
# Classic control
|
# Classic control
|
||||||
|
|
||||||
register(
|
register(
|
||||||
@ -180,18 +198,36 @@ register(
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# register(
|
||||||
|
# id='HoleReacherDetPMP-v0',
|
||||||
|
# entry_point='alr_envs.classic_control.hole_reacher:holereacher_detpmp',
|
||||||
|
# # max_episode_steps=1,
|
||||||
|
# # TODO: add mp kwargs
|
||||||
|
# )
|
||||||
|
|
||||||
register(
|
register(
|
||||||
id='HoleReacherDetPMP-v0',
|
id='ALRBallInACupSimpleDMP-v0',
|
||||||
entry_point='alr_envs.classic_control.hole_reacher:holereacher_detpmp',
|
entry_point='alr_envs.utils.make_env_helpers:make_dmp_env',
|
||||||
# max_episode_steps=1,
|
kwargs={
|
||||||
# TODO: add mp kwargs
|
"name": "alr_envs:ALRBallInACupSimple-v0",
|
||||||
|
"num_dof": 3,
|
||||||
|
"num_basis": 5,
|
||||||
|
"duration": 3.5,
|
||||||
|
"post_traj_time": 4.5,
|
||||||
|
"learn_goal": False,
|
||||||
|
"alpha_phase": 3,
|
||||||
|
"bandwidth_factor": 2.5,
|
||||||
|
"policy_type": "motor",
|
||||||
|
"weights_scale": 100,
|
||||||
|
"return_to_start": True
|
||||||
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
register(
|
register(
|
||||||
id='BiacSimpleDMP-v0',
|
id='ALRBallInACupGoalDMP-v0',
|
||||||
entry_point='alr_envs.utils.make_env_helpers:make_dmp_env',
|
entry_point='alr_envs.utils.make_env_helpers:make_dmp_env',
|
||||||
kwargs={
|
kwargs={
|
||||||
"name": "alr_envs:HoleReacher-v0",
|
"name": "alr_envs:ALRBallInACupGoal-v0",
|
||||||
"num_dof": 5,
|
"num_dof": 5,
|
||||||
"num_basis": 5,
|
"num_basis": 5,
|
||||||
"duration": 2,
|
"duration": 2,
|
||||||
|
@ -1,2 +1,3 @@
|
|||||||
from alr_envs.mujoco.reacher.alr_reacher import ALRReacherEnv
|
from alr_envs.mujoco.reacher.alr_reacher import ALRReacherEnv
|
||||||
from alr_envs.mujoco.balancing import BalancingEnv
|
from alr_envs.mujoco.balancing import BalancingEnv
|
||||||
|
from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup import ALRBallInACupEnv
|
||||||
|
@ -67,6 +67,9 @@ class AlrMujocoEnv(gym.Env):
|
|||||||
self.init_qpos = self.sim.data.qpos.ravel().copy()
|
self.init_qpos = self.sim.data.qpos.ravel().copy()
|
||||||
self.init_qvel = self.sim.data.qvel.ravel().copy()
|
self.init_qvel = self.sim.data.qvel.ravel().copy()
|
||||||
|
|
||||||
|
self._start_pos = None
|
||||||
|
self._start_vel = None
|
||||||
|
|
||||||
self._set_action_space()
|
self._set_action_space()
|
||||||
|
|
||||||
# action = self.action_space.sample()
|
# action = self.action_space.sample()
|
||||||
@ -93,6 +96,20 @@ class AlrMujocoEnv(gym.Env):
|
|||||||
"""
|
"""
|
||||||
return self.sim.data.qvel
|
return self.sim.data.qvel
|
||||||
|
|
||||||
|
@property
|
||||||
|
def start_pos(self):
|
||||||
|
"""
|
||||||
|
Start position of the agent, for example joint angles of a Panda robot. Necessary for MP wrapped envs.
|
||||||
|
"""
|
||||||
|
return self._start_pos
|
||||||
|
|
||||||
|
@property
|
||||||
|
def start_vel(self):
|
||||||
|
"""
|
||||||
|
Start velocity of the agent. Necessary for MP wrapped envs.
|
||||||
|
"""
|
||||||
|
return self._start_vel
|
||||||
|
|
||||||
def extend_des_pos(self, des_pos):
|
def extend_des_pos(self, des_pos):
|
||||||
"""
|
"""
|
||||||
In a simplified environment, the actions may only control a subset of all the joints.
|
In a simplified environment, the actions may only control a subset of all the joints.
|
||||||
|
@ -1 +1 @@
|
|||||||
from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_simple import ALRBallInACupEnv
|
|
||||||
|
@ -5,15 +5,12 @@ from alr_envs.mujoco import alr_mujoco_env
|
|||||||
|
|
||||||
|
|
||||||
class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
|
class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
|
||||||
def __init__(self, n_substeps=4, apply_gravity_comp=True, reward_function=None):
|
def __init__(self, n_substeps=4, apply_gravity_comp=True, reward_type: str = None, context: np.ndarray = None):
|
||||||
self._steps = 0
|
self._steps = 0
|
||||||
|
|
||||||
self.xml_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets",
|
self.xml_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets",
|
||||||
"biac_base" + ".xml")
|
"biac_base" + ".xml")
|
||||||
|
|
||||||
self.start_pos = np.array([0.0, 0.58760536, 0.0, 1.36004913, 0.0, -0.32072943, -1.57])
|
|
||||||
self.start_vel = np.zeros(7)
|
|
||||||
|
|
||||||
self._q_pos = []
|
self._q_pos = []
|
||||||
self._q_vel = []
|
self._q_vel = []
|
||||||
# self.weight_matrix_scale = 50
|
# self.weight_matrix_scale = 50
|
||||||
@ -31,13 +28,21 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
|
|||||||
self.xml_path,
|
self.xml_path,
|
||||||
apply_gravity_comp=apply_gravity_comp,
|
apply_gravity_comp=apply_gravity_comp,
|
||||||
n_substeps=n_substeps)
|
n_substeps=n_substeps)
|
||||||
|
self._start_pos = np.array([0.0, 0.58760536, 0.0, 1.36004913, 0.0, -0.32072943, -1.57])
|
||||||
|
self._start_vel = np.zeros(7)
|
||||||
|
|
||||||
self.sim_time = 8 # seconds
|
self.sim_time = 8 # seconds
|
||||||
self.sim_steps = int(self.sim_time / self.dt)
|
self.sim_steps = int(self.sim_time / self.dt)
|
||||||
if reward_function is None:
|
if reward_type == "simple":
|
||||||
|
from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward_simple import BallInACupReward
|
||||||
|
reward_function = BallInACupReward
|
||||||
|
elif reward_type == "contextual_goal":
|
||||||
from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward import BallInACupReward
|
from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward import BallInACupReward
|
||||||
reward_function = BallInACupReward
|
reward_function = BallInACupReward
|
||||||
|
else:
|
||||||
|
raise ValueError("Unknown reward type")
|
||||||
self.reward_function = reward_function(self.sim_steps)
|
self.reward_function = reward_function(self.sim_steps)
|
||||||
|
self.configure(context)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def current_pos(self):
|
def current_pos(self):
|
||||||
@ -97,6 +102,7 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
|
|||||||
def check_traj_in_joint_limits(self):
|
def check_traj_in_joint_limits(self):
|
||||||
return any(self.current_pos > self.j_max) or any(self.current_pos < self.j_min)
|
return any(self.current_pos > self.j_max) or any(self.current_pos < self.j_min)
|
||||||
|
|
||||||
|
# TODO: extend observation space
|
||||||
def _get_obs(self):
|
def _get_obs(self):
|
||||||
theta = self.sim.data.qpos.flat[:7]
|
theta = self.sim.data.qpos.flat[:7]
|
||||||
return np.concatenate([
|
return np.concatenate([
|
||||||
@ -106,6 +112,20 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
|
|||||||
[self._steps],
|
[self._steps],
|
||||||
])
|
])
|
||||||
|
|
||||||
|
# These functions are for the task with 3 joint actuations
|
||||||
|
def extend_des_pos(self, des_pos):
|
||||||
|
des_pos_full = self.start_pos.copy()
|
||||||
|
des_pos_full[1] = des_pos[0]
|
||||||
|
des_pos_full[3] = des_pos[1]
|
||||||
|
des_pos_full[5] = des_pos[2]
|
||||||
|
return des_pos_full
|
||||||
|
|
||||||
|
def extend_des_vel(self, des_vel):
|
||||||
|
des_vel_full = self.start_vel.copy()
|
||||||
|
des_vel_full[1] = des_vel[0]
|
||||||
|
des_vel_full[3] = des_vel[1]
|
||||||
|
des_vel_full[5] = des_vel[2]
|
||||||
|
return des_vel_full
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
@ -40,9 +40,12 @@ class BallInACupReward(alr_reward_fct.AlrReward):
|
|||||||
self.ball_in_cup = False
|
self.ball_in_cup = False
|
||||||
self.ball_above_threshold = False
|
self.ball_above_threshold = False
|
||||||
self.dist_ctxt = 3
|
self.dist_ctxt = 3
|
||||||
|
self.action_costs = []
|
||||||
|
self.cup_angles = []
|
||||||
|
|
||||||
def compute_reward(self, action, sim, step):
|
def compute_reward(self, action, sim, step):
|
||||||
action_cost = np.sum(np.square(action))
|
action_cost = np.sum(np.square(action))
|
||||||
|
self.action_costs.append(action_cost)
|
||||||
|
|
||||||
stop_sim = False
|
stop_sim = False
|
||||||
success = False
|
success = False
|
||||||
@ -54,7 +57,7 @@ class BallInACupReward(alr_reward_fct.AlrReward):
|
|||||||
self.collision_ids = [sim.model._geom_name2id[name] for name in self.collision_objects]
|
self.collision_ids = [sim.model._geom_name2id[name] for name in self.collision_objects]
|
||||||
|
|
||||||
if self.check_collision(sim):
|
if self.check_collision(sim):
|
||||||
reward = - 1e-4 * action_cost - 1000
|
reward = - 1e-3 * action_cost - 1000
|
||||||
stop_sim = True
|
stop_sim = True
|
||||||
return reward, success, stop_sim
|
return reward, success, stop_sim
|
||||||
|
|
||||||
@ -65,8 +68,11 @@ class BallInACupReward(alr_reward_fct.AlrReward):
|
|||||||
self.dists.append(np.linalg.norm(goal_pos - ball_pos))
|
self.dists.append(np.linalg.norm(goal_pos - ball_pos))
|
||||||
self.dists_final.append(np.linalg.norm(goal_final_pos - ball_pos))
|
self.dists_final.append(np.linalg.norm(goal_final_pos - ball_pos))
|
||||||
self.dists_ctxt.append(np.linalg.norm(ball_pos - self.context))
|
self.dists_ctxt.append(np.linalg.norm(ball_pos - self.context))
|
||||||
self.ball_traj[step, :] = ball_pos
|
self.ball_traj[step, :] = np.copy(ball_pos)
|
||||||
self.cup_traj[step, :] = goal_pos
|
self.cup_traj[step, :] = np.copy(goal_pos) # ?
|
||||||
|
cup_quat = np.copy(sim.data.body_xquat[sim.model._body_name2id["cup"]])
|
||||||
|
self.cup_angles.append(np.arctan2(2 * (cup_quat[0] * cup_quat[1] + cup_quat[2] * cup_quat[3]),
|
||||||
|
1 - 2 * (cup_quat[1] ** 2 + cup_quat[2] ** 2)))
|
||||||
|
|
||||||
# Determine the first time when ball is in cup
|
# Determine the first time when ball is in cup
|
||||||
if not self.ball_in_cup:
|
if not self.ball_in_cup:
|
||||||
@ -77,25 +83,29 @@ class BallInACupReward(alr_reward_fct.AlrReward):
|
|||||||
self.dist_ctxt = dist_to_ctxt
|
self.dist_ctxt = dist_to_ctxt
|
||||||
|
|
||||||
if step == self.sim_time - 1:
|
if step == self.sim_time - 1:
|
||||||
|
t_min_dist = np.argmin(self.dists)
|
||||||
|
angle_min_dist = self.cup_angles[t_min_dist]
|
||||||
|
cost_angle = (angle_min_dist - np.pi / 2) ** 2
|
||||||
|
|
||||||
min_dist = np.min(self.dists)
|
min_dist = np.min(self.dists)
|
||||||
dist_final = self.dists_final[-1]
|
dist_final = self.dists_final[-1]
|
||||||
# dist_ctxt = self.dists_ctxt[-1]
|
# dist_ctxt = self.dists_ctxt[-1]
|
||||||
|
|
||||||
# max distance between ball and cup and cup height at that time
|
# # max distance between ball and cup and cup height at that time
|
||||||
ball_to_cup_diff = self.ball_traj[:, 2] - self.cup_traj[:, 2]
|
# ball_to_cup_diff = self.ball_traj[:, 2] - self.cup_traj[:, 2]
|
||||||
t_max_diff = np.argmax(ball_to_cup_diff)
|
# t_max_diff = np.argmax(ball_to_cup_diff)
|
||||||
t_max_ball_height = np.argmax(self.ball_traj[:, 2])
|
# t_max_ball_height = np.argmax(self.ball_traj[:, 2])
|
||||||
max_ball_height = np.max(self.ball_traj[:, 2])
|
# max_ball_height = np.max(self.ball_traj[:, 2])
|
||||||
|
|
||||||
# cost = self._get_stage_wise_cost(ball_in_cup, min_dist, dist_final, dist_ctxt)
|
# cost = self._get_stage_wise_cost(ball_in_cup, min_dist, dist_final, dist_ctxt)
|
||||||
cost = 0.3 * min_dist + 0.3 * dist_final + 0.3 * np.minimum(self.dist_ctxt, 3)
|
cost = 0.5 * min_dist + 0.5 * dist_final + 0.3 * np.minimum(self.dist_ctxt, 3) + 0.01 * cost_angle
|
||||||
reward = np.exp(-1 * cost) - 1e-4 * action_cost
|
reward = np.exp(-2 * cost) - 1e-3 * action_cost
|
||||||
if max_ball_height < self.context[2] or ball_to_cup_diff[t_max_ball_height] < 0:
|
# if max_ball_height < self.context[2] or ball_to_cup_diff[t_max_ball_height] < 0:
|
||||||
reward -= 1
|
# reward -= 1
|
||||||
|
|
||||||
success = dist_final < 0.05 and self.dist_ctxt < 0.05
|
success = dist_final < 0.05 and self.dist_ctxt < 0.05
|
||||||
else:
|
else:
|
||||||
reward = - 1e-4 * action_cost
|
reward = - 1e-3 * action_cost
|
||||||
success = False
|
success = False
|
||||||
|
|
||||||
return reward, success, stop_sim
|
return reward, success, stop_sim
|
||||||
|
@ -1,151 +0,0 @@
|
|||||||
from alr_envs.mujoco import alr_mujoco_env
|
|
||||||
from gym import utils
|
|
||||||
import os
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
|
|
||||||
class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
|
|
||||||
def __init__(self, n_substeps=4, apply_gravity_comp=True, reward_function=None):
|
|
||||||
self._steps = 0
|
|
||||||
|
|
||||||
self.xml_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets",
|
|
||||||
"biac_base" + ".xml")
|
|
||||||
|
|
||||||
self.start_pos = np.array([0.0, 0.58760536, 0.0, 1.36004913, 0.0, -0.32072943, -1.57])
|
|
||||||
self.start_vel = np.zeros(7)
|
|
||||||
|
|
||||||
self._q_pos = []
|
|
||||||
self._q_vel = []
|
|
||||||
# self.weight_matrix_scale = 50
|
|
||||||
self.max_ctrl = np.array([150., 125., 40., 60., 5., 5., 2.])
|
|
||||||
self.p_gains = 1 / self.max_ctrl * np.array([200, 300, 100, 100, 10, 10, 2.5])
|
|
||||||
self.d_gains = 1 / self.max_ctrl * np.array([7, 15, 5, 2.5, 0.3, 0.3, 0.05])
|
|
||||||
|
|
||||||
self.j_min = np.array([-2.6, -1.985, -2.8, -0.9, -4.55, -1.5707, -2.7])
|
|
||||||
self.j_max = np.array([2.6, 1.985, 2.8, 3.14159, 1.25, 1.5707, 2.7])
|
|
||||||
|
|
||||||
self.context = None
|
|
||||||
|
|
||||||
utils.EzPickle.__init__(self)
|
|
||||||
alr_mujoco_env.AlrMujocoEnv.__init__(self,
|
|
||||||
self.xml_path,
|
|
||||||
apply_gravity_comp=apply_gravity_comp,
|
|
||||||
n_substeps=n_substeps)
|
|
||||||
|
|
||||||
self.sim_time = 8 # seconds
|
|
||||||
self.sim_steps = int(self.sim_time / self.dt)
|
|
||||||
if reward_function is None:
|
|
||||||
from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward_simple import BallInACupReward
|
|
||||||
reward_function = BallInACupReward
|
|
||||||
self.reward_function = reward_function(self.sim_steps)
|
|
||||||
|
|
||||||
@property
|
|
||||||
def current_pos(self):
|
|
||||||
return self.sim.data.qpos[0:7].copy()
|
|
||||||
|
|
||||||
@property
|
|
||||||
def current_vel(self):
|
|
||||||
return self.sim.data.qvel[0:7].copy()
|
|
||||||
|
|
||||||
def configure(self, context):
|
|
||||||
self.context = context
|
|
||||||
self.reward_function.reset(context)
|
|
||||||
|
|
||||||
def reset_model(self):
|
|
||||||
init_pos_all = self.init_qpos.copy()
|
|
||||||
init_pos_robot = self.start_pos
|
|
||||||
init_vel = np.zeros_like(init_pos_all)
|
|
||||||
ball_id = self.sim.model._body_name2id["ball"]
|
|
||||||
goal_final_id = self.sim.model._site_name2id["cup_goal_final"]
|
|
||||||
|
|
||||||
self._steps = 0
|
|
||||||
self._q_pos = []
|
|
||||||
self._q_vel = []
|
|
||||||
|
|
||||||
start_pos = init_pos_all
|
|
||||||
start_pos[0:7] = init_pos_robot
|
|
||||||
|
|
||||||
self.set_state(start_pos, init_vel)
|
|
||||||
|
|
||||||
return self._get_obs()
|
|
||||||
|
|
||||||
def step(self, a):
|
|
||||||
reward_dist = 0.0
|
|
||||||
angular_vel = 0.0
|
|
||||||
reward_ctrl = - np.square(a).sum()
|
|
||||||
|
|
||||||
crash = self.do_simulation(a)
|
|
||||||
joint_cons_viol = self.check_traj_in_joint_limits()
|
|
||||||
|
|
||||||
self._q_pos.append(self.sim.data.qpos[0:7].ravel().copy())
|
|
||||||
self._q_vel.append(self.sim.data.qvel[0:7].ravel().copy())
|
|
||||||
|
|
||||||
ob = self._get_obs()
|
|
||||||
|
|
||||||
if not crash and not joint_cons_viol:
|
|
||||||
reward, success, collision = self.reward_function.compute_reward(a, self.sim, self._steps)
|
|
||||||
done = success or self._steps == self.sim_steps - 1 or collision
|
|
||||||
self._steps += 1
|
|
||||||
else:
|
|
||||||
reward = -1000
|
|
||||||
success = False
|
|
||||||
done = True
|
|
||||||
return ob, reward, done, dict(reward_dist=reward_dist,
|
|
||||||
reward_ctrl=reward_ctrl,
|
|
||||||
velocity=angular_vel,
|
|
||||||
traj=self._q_pos, is_success=success,
|
|
||||||
is_collided=crash or joint_cons_viol)
|
|
||||||
|
|
||||||
def check_traj_in_joint_limits(self):
|
|
||||||
return any(self.current_pos > self.j_max) or any(self.current_pos < self.j_min)
|
|
||||||
|
|
||||||
# TODO: extend observation space
|
|
||||||
def _get_obs(self):
|
|
||||||
theta = self.sim.data.qpos.flat[:7]
|
|
||||||
return np.concatenate([
|
|
||||||
np.cos(theta),
|
|
||||||
np.sin(theta),
|
|
||||||
# self.get_body_com("target"), # only return target to make problem harder
|
|
||||||
[self._steps],
|
|
||||||
])
|
|
||||||
|
|
||||||
def extend_des_pos(self, des_pos):
|
|
||||||
des_pos_full = self.start_pos.copy()
|
|
||||||
des_pos_full[1] = des_pos[0]
|
|
||||||
des_pos_full[3] = des_pos[1]
|
|
||||||
des_pos_full[5] = des_pos[2]
|
|
||||||
return des_pos_full
|
|
||||||
|
|
||||||
def extend_des_vel(self, des_vel):
|
|
||||||
des_vel_full = self.start_vel.copy()
|
|
||||||
des_vel_full[1] = des_vel[0]
|
|
||||||
des_vel_full[3] = des_vel[1]
|
|
||||||
des_vel_full[5] = des_vel[2]
|
|
||||||
return des_vel_full
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward_simple import BallInACupReward
|
|
||||||
|
|
||||||
env = ALRBallInACupEnv(reward_function=BallInACupReward)
|
|
||||||
env.configure(None)
|
|
||||||
env.reset()
|
|
||||||
env.render()
|
|
||||||
for i in range(4000):
|
|
||||||
# objective.load_result("/tmp/cma")
|
|
||||||
# test with random actions
|
|
||||||
# ac = 0.1 * env.action_space.sample()
|
|
||||||
# ac = -np.array([i, i, i]) / 10000 + np.array([env.start_pos[1], env.start_pos[3], env.start_pos[5]])
|
|
||||||
# ac = np.array([0., -10, 0, -1, 0, 1, 0])
|
|
||||||
ac = np.array([0.,0., 0, 0, 0, 0, 0])
|
|
||||||
# ac[0] += np.pi/2
|
|
||||||
obs, rew, d, info = env.step(ac)
|
|
||||||
env.render()
|
|
||||||
|
|
||||||
print(rew)
|
|
||||||
|
|
||||||
if d:
|
|
||||||
break
|
|
||||||
|
|
||||||
env.close()
|
|
||||||
|
|
@ -1,7 +1,6 @@
|
|||||||
from alr_envs.utils.wrapper.detpmp_wrapper import DetPMPWrapper
|
from alr_envs.utils.wrapper.detpmp_wrapper import DetPMPWrapper
|
||||||
from alr_envs.utils.wrapper.dmp_wrapper import DmpWrapper
|
from alr_envs.utils.wrapper.dmp_wrapper import DmpWrapper
|
||||||
from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup import ALRBallInACupEnv
|
from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup import ALRBallInACupEnv
|
||||||
from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_simple import ALRBallInACupEnv as ALRBallInACupEnvSimple
|
|
||||||
|
|
||||||
|
|
||||||
def make_contextual_env(rank, seed=0):
|
def make_contextual_env(rank, seed=0):
|
||||||
@ -16,7 +15,7 @@ def make_contextual_env(rank, seed=0):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def _init():
|
def _init():
|
||||||
env = ALRBallInACupEnv()
|
env = ALRBallInACupEnv(reward_type="contextual_goal")
|
||||||
|
|
||||||
env = DetPMPWrapper(env,
|
env = DetPMPWrapper(env,
|
||||||
num_dof=7,
|
num_dof=7,
|
||||||
@ -50,7 +49,7 @@ def make_env(rank, seed=0):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def _init():
|
def _init():
|
||||||
env = ALRBallInACupEnvSimple()
|
env = ALRBallInACupEnv(reward_type="simple")
|
||||||
|
|
||||||
env = DetPMPWrapper(env,
|
env = DetPMPWrapper(env,
|
||||||
num_dof=7,
|
num_dof=7,
|
||||||
@ -84,7 +83,7 @@ def make_simple_env(rank, seed=0):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def _init():
|
def _init():
|
||||||
env = ALRBallInACupEnvSimple()
|
env = ALRBallInACupEnv(reward_type="simple")
|
||||||
|
|
||||||
env = DetPMPWrapper(env,
|
env = DetPMPWrapper(env,
|
||||||
num_dof=3,
|
num_dof=3,
|
||||||
@ -119,7 +118,7 @@ def make_simple_dmp_env(rank, seed=0):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def _init():
|
def _init():
|
||||||
_env = ALRBallInACupEnvSimple()
|
_env = ALRBallInACupEnv(reward_type="simple")
|
||||||
|
|
||||||
_env = DmpWrapper(_env,
|
_env = DmpWrapper(_env,
|
||||||
num_dof=3,
|
num_dof=3,
|
||||||
|
82
alr_envs/utils/mp_env_async_sampler.py
Normal file
82
alr_envs/utils/mp_env_async_sampler.py
Normal file
@ -0,0 +1,82 @@
|
|||||||
|
import gym
|
||||||
|
from gym.vector.async_vector_env import AsyncVectorEnv
|
||||||
|
import numpy as np
|
||||||
|
from _collections import defaultdict
|
||||||
|
|
||||||
|
|
||||||
|
def make_env(env_id, rank, seed=0):
|
||||||
|
env = gym.make(env_id)
|
||||||
|
env.seed(seed + rank)
|
||||||
|
return lambda: env
|
||||||
|
|
||||||
|
|
||||||
|
def split_array(ary, size):
|
||||||
|
n_samples = len(ary)
|
||||||
|
if n_samples < size:
|
||||||
|
tmp = np.zeros((size, ary.shape[1]))
|
||||||
|
tmp[0:n_samples] = ary
|
||||||
|
return [tmp]
|
||||||
|
elif n_samples == size:
|
||||||
|
return [ary]
|
||||||
|
else:
|
||||||
|
repeat = int(np.ceil(n_samples / size))
|
||||||
|
split = [k * size for k in range(1, repeat)]
|
||||||
|
sub_arys = np.split(ary, split)
|
||||||
|
|
||||||
|
if n_samples % repeat != 0:
|
||||||
|
tmp = np.zeros_like(sub_arys[0])
|
||||||
|
last = sub_arys[-1]
|
||||||
|
tmp[0: len(last)] = last
|
||||||
|
sub_arys[-1] = tmp
|
||||||
|
|
||||||
|
return sub_arys
|
||||||
|
|
||||||
|
|
||||||
|
def _flatten_list(l):
|
||||||
|
assert isinstance(l, (list, tuple))
|
||||||
|
assert len(l) > 0
|
||||||
|
assert all([len(l_) > 0 for l_ in l])
|
||||||
|
|
||||||
|
return [l__ for l_ in l for l__ in l_]
|
||||||
|
|
||||||
|
|
||||||
|
class AlrMpEnvSampler:
|
||||||
|
"""
|
||||||
|
An asynchronous sampler for MPWrapper environments. A sampler object can be called with a set of parameters and
|
||||||
|
returns the corresponding final obs, rewards, dones and info dicts.
|
||||||
|
"""
|
||||||
|
def __init__(self, env_id, num_envs, seed=0):
|
||||||
|
self.num_envs = num_envs
|
||||||
|
self.env = AsyncVectorEnv([make_env(env_id, seed, i) for i in range(num_envs)])
|
||||||
|
|
||||||
|
def __call__(self, params):
|
||||||
|
params = np.atleast_2d(params)
|
||||||
|
n_samples = params.shape[0]
|
||||||
|
split_params = split_array(params, self.num_envs)
|
||||||
|
|
||||||
|
vals = defaultdict(list)
|
||||||
|
for p in split_params:
|
||||||
|
obs, reward, done, info = self.env.step(p)
|
||||||
|
vals['obs'].append(obs)
|
||||||
|
vals['reward'].append(reward)
|
||||||
|
vals['done'].append(done)
|
||||||
|
vals['info'].append(info)
|
||||||
|
|
||||||
|
# do not return values above threshold
|
||||||
|
return np.vstack(vals['obs'])[:n_samples], np.hstack(vals['reward'])[:n_samples],\
|
||||||
|
_flatten_list(vals['done'])[:n_samples], _flatten_list(vals['info'])[:n_samples]
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
env_name = "alr_envs:HoleReacherDMP-v0"
|
||||||
|
n_cpu = 8
|
||||||
|
dim = 30
|
||||||
|
n_samples = 20
|
||||||
|
|
||||||
|
sampler = AlrMpEnvSampler(env_name, num_envs=n_cpu)
|
||||||
|
|
||||||
|
thetas = np.random.randn(n_samples, dim) # usually form a search distribution
|
||||||
|
|
||||||
|
_, rewards, __, ___ = sampler(thetas)
|
||||||
|
|
||||||
|
print(rewards)
|
@ -11,8 +11,9 @@ class DmpWrapper(MPWrapper):
|
|||||||
|
|
||||||
def __init__(self, env: gym.Env, num_dof: int, num_basis: int, start_pos: np.ndarray = None,
|
def __init__(self, env: gym.Env, num_dof: int, num_basis: int, start_pos: np.ndarray = None,
|
||||||
final_pos: np.ndarray = None, duration: int = 1, alpha_phase: float = 2., dt: float = None,
|
final_pos: np.ndarray = None, duration: int = 1, alpha_phase: float = 2., dt: float = None,
|
||||||
learn_goal: bool = False, post_traj_time: float = 0., policy_type: str = None,
|
learn_goal: bool = False, return_to_start: bool = False, post_traj_time: float = 0.,
|
||||||
weights_scale: float = 1., goal_scale: float = 1., bandwidth_factor: float = 3.):
|
weights_scale: float = 1., goal_scale: float = 1., bandwidth_factor: float = 3.,
|
||||||
|
policy_type: str = None):
|
||||||
|
|
||||||
"""
|
"""
|
||||||
This Wrapper generates a trajectory based on a DMP and will only return episodic performances.
|
This Wrapper generates a trajectory based on a DMP and will only return episodic performances.
|
||||||
@ -34,8 +35,13 @@ class DmpWrapper(MPWrapper):
|
|||||||
self.learn_goal = learn_goal
|
self.learn_goal = learn_goal
|
||||||
dt = env.dt if hasattr(env, "dt") else dt
|
dt = env.dt if hasattr(env, "dt") else dt
|
||||||
assert dt is not None
|
assert dt is not None
|
||||||
start_pos = env.start_pos if hasattr(env, "start_pos") else start_pos
|
start_pos = start_pos if start_pos is not None else env.start_pos if hasattr(env, "start_pos") else None
|
||||||
assert start_pos is not None
|
assert start_pos is not None
|
||||||
|
if learn_goal:
|
||||||
|
final_pos = np.zeros_like(start_pos) # arbitrary, will be learned
|
||||||
|
else:
|
||||||
|
final_pos = final_pos if final_pos is not None else start_pos if return_to_start else None
|
||||||
|
assert final_pos is not None
|
||||||
self.t = np.linspace(0, duration, int(duration / dt))
|
self.t = np.linspace(0, duration, int(duration / dt))
|
||||||
self.goal_scale = goal_scale
|
self.goal_scale = goal_scale
|
||||||
|
|
||||||
|
@ -46,8 +46,9 @@ class MPWrapper(gym.Wrapper, ABC):
|
|||||||
rewards = []
|
rewards = []
|
||||||
dones = []
|
dones = []
|
||||||
infos = []
|
infos = []
|
||||||
for p, c in zip(params, contexts):
|
# for p, c in zip(params, contexts):
|
||||||
self.configure(c)
|
for p in params:
|
||||||
|
# self.configure(c)
|
||||||
ob, reward, done, info = self.step(p)
|
ob, reward, done, info = self.step(p)
|
||||||
obs.append(ob)
|
obs.append(ob)
|
||||||
rewards.append(reward)
|
rewards.append(reward)
|
||||||
|
Loading…
Reference in New Issue
Block a user