This commit is contained in:
Maximilian Huettenrauch 2021-04-23 11:37:42 +02:00
parent e482fc09f0
commit db001c411f
11 changed files with 207 additions and 186 deletions

View File

@ -71,6 +71,24 @@ register(
} }
) )
register(
id='ALRBallInACupSimple-v0',
entry_point='alr_envs.mujoco:ALRBallInACupEnv',
max_episode_steps=4000,
kwargs={
"reward_type": "simple"
}
)
register(
id='ALRBallInACupGoal-v0',
entry_point='alr_envs.mujoco:ALRBallInACupEnv',
max_episode_steps=4000,
kwargs={
"reward_type": "contextual_goal"
}
)
# Classic control # Classic control
register( register(
@ -180,18 +198,36 @@ register(
} }
) )
# register(
# id='HoleReacherDetPMP-v0',
# entry_point='alr_envs.classic_control.hole_reacher:holereacher_detpmp',
# # max_episode_steps=1,
# # TODO: add mp kwargs
# )
register( register(
id='HoleReacherDetPMP-v0', id='ALRBallInACupSimpleDMP-v0',
entry_point='alr_envs.classic_control.hole_reacher:holereacher_detpmp', entry_point='alr_envs.utils.make_env_helpers:make_dmp_env',
# max_episode_steps=1, kwargs={
# TODO: add mp kwargs "name": "alr_envs:ALRBallInACupSimple-v0",
"num_dof": 3,
"num_basis": 5,
"duration": 3.5,
"post_traj_time": 4.5,
"learn_goal": False,
"alpha_phase": 3,
"bandwidth_factor": 2.5,
"policy_type": "motor",
"weights_scale": 100,
"return_to_start": True
}
) )
register( register(
id='BiacSimpleDMP-v0', id='ALRBallInACupGoalDMP-v0',
entry_point='alr_envs.utils.make_env_helpers:make_dmp_env', entry_point='alr_envs.utils.make_env_helpers:make_dmp_env',
kwargs={ kwargs={
"name": "alr_envs:HoleReacher-v0", "name": "alr_envs:ALRBallInACupGoal-v0",
"num_dof": 5, "num_dof": 5,
"num_basis": 5, "num_basis": 5,
"duration": 2, "duration": 2,

View File

@ -1,2 +1,3 @@
from alr_envs.mujoco.reacher.alr_reacher import ALRReacherEnv from alr_envs.mujoco.reacher.alr_reacher import ALRReacherEnv
from alr_envs.mujoco.balancing import BalancingEnv from alr_envs.mujoco.balancing import BalancingEnv
from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup import ALRBallInACupEnv

View File

@ -67,6 +67,9 @@ class AlrMujocoEnv(gym.Env):
self.init_qpos = self.sim.data.qpos.ravel().copy() self.init_qpos = self.sim.data.qpos.ravel().copy()
self.init_qvel = self.sim.data.qvel.ravel().copy() self.init_qvel = self.sim.data.qvel.ravel().copy()
self._start_pos = None
self._start_vel = None
self._set_action_space() self._set_action_space()
# action = self.action_space.sample() # action = self.action_space.sample()
@ -93,6 +96,20 @@ class AlrMujocoEnv(gym.Env):
""" """
return self.sim.data.qvel return self.sim.data.qvel
@property
def start_pos(self):
"""
Start position of the agent, for example joint angles of a Panda robot. Necessary for MP wrapped envs.
"""
return self._start_pos
@property
def start_vel(self):
"""
Start velocity of the agent. Necessary for MP wrapped envs.
"""
return self._start_vel
def extend_des_pos(self, des_pos): def extend_des_pos(self, des_pos):
""" """
In a simplified environment, the actions may only control a subset of all the joints. In a simplified environment, the actions may only control a subset of all the joints.

View File

@ -1 +1 @@
from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_simple import ALRBallInACupEnv

View File

@ -5,15 +5,12 @@ from alr_envs.mujoco import alr_mujoco_env
class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle): class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
def __init__(self, n_substeps=4, apply_gravity_comp=True, reward_function=None): def __init__(self, n_substeps=4, apply_gravity_comp=True, reward_type: str = None, context: np.ndarray = None):
self._steps = 0 self._steps = 0
self.xml_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", self.xml_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets",
"biac_base" + ".xml") "biac_base" + ".xml")
self.start_pos = np.array([0.0, 0.58760536, 0.0, 1.36004913, 0.0, -0.32072943, -1.57])
self.start_vel = np.zeros(7)
self._q_pos = [] self._q_pos = []
self._q_vel = [] self._q_vel = []
# self.weight_matrix_scale = 50 # self.weight_matrix_scale = 50
@ -31,13 +28,21 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
self.xml_path, self.xml_path,
apply_gravity_comp=apply_gravity_comp, apply_gravity_comp=apply_gravity_comp,
n_substeps=n_substeps) n_substeps=n_substeps)
self._start_pos = np.array([0.0, 0.58760536, 0.0, 1.36004913, 0.0, -0.32072943, -1.57])
self._start_vel = np.zeros(7)
self.sim_time = 8 # seconds self.sim_time = 8 # seconds
self.sim_steps = int(self.sim_time / self.dt) self.sim_steps = int(self.sim_time / self.dt)
if reward_function is None: if reward_type == "simple":
from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward_simple import BallInACupReward
reward_function = BallInACupReward
elif reward_type == "contextual_goal":
from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward import BallInACupReward from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward import BallInACupReward
reward_function = BallInACupReward reward_function = BallInACupReward
else:
raise ValueError("Unknown reward type")
self.reward_function = reward_function(self.sim_steps) self.reward_function = reward_function(self.sim_steps)
self.configure(context)
@property @property
def current_pos(self): def current_pos(self):
@ -97,6 +102,7 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
def check_traj_in_joint_limits(self): def check_traj_in_joint_limits(self):
return any(self.current_pos > self.j_max) or any(self.current_pos < self.j_min) return any(self.current_pos > self.j_max) or any(self.current_pos < self.j_min)
# TODO: extend observation space
def _get_obs(self): def _get_obs(self):
theta = self.sim.data.qpos.flat[:7] theta = self.sim.data.qpos.flat[:7]
return np.concatenate([ return np.concatenate([
@ -106,6 +112,20 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
[self._steps], [self._steps],
]) ])
# These functions are for the task with 3 joint actuations
def extend_des_pos(self, des_pos):
des_pos_full = self.start_pos.copy()
des_pos_full[1] = des_pos[0]
des_pos_full[3] = des_pos[1]
des_pos_full[5] = des_pos[2]
return des_pos_full
def extend_des_vel(self, des_vel):
des_vel_full = self.start_vel.copy()
des_vel_full[1] = des_vel[0]
des_vel_full[3] = des_vel[1]
des_vel_full[5] = des_vel[2]
return des_vel_full
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -40,9 +40,12 @@ class BallInACupReward(alr_reward_fct.AlrReward):
self.ball_in_cup = False self.ball_in_cup = False
self.ball_above_threshold = False self.ball_above_threshold = False
self.dist_ctxt = 3 self.dist_ctxt = 3
self.action_costs = []
self.cup_angles = []
def compute_reward(self, action, sim, step): def compute_reward(self, action, sim, step):
action_cost = np.sum(np.square(action)) action_cost = np.sum(np.square(action))
self.action_costs.append(action_cost)
stop_sim = False stop_sim = False
success = False success = False
@ -54,7 +57,7 @@ class BallInACupReward(alr_reward_fct.AlrReward):
self.collision_ids = [sim.model._geom_name2id[name] for name in self.collision_objects] self.collision_ids = [sim.model._geom_name2id[name] for name in self.collision_objects]
if self.check_collision(sim): if self.check_collision(sim):
reward = - 1e-4 * action_cost - 1000 reward = - 1e-3 * action_cost - 1000
stop_sim = True stop_sim = True
return reward, success, stop_sim return reward, success, stop_sim
@ -65,8 +68,11 @@ class BallInACupReward(alr_reward_fct.AlrReward):
self.dists.append(np.linalg.norm(goal_pos - ball_pos)) self.dists.append(np.linalg.norm(goal_pos - ball_pos))
self.dists_final.append(np.linalg.norm(goal_final_pos - ball_pos)) self.dists_final.append(np.linalg.norm(goal_final_pos - ball_pos))
self.dists_ctxt.append(np.linalg.norm(ball_pos - self.context)) self.dists_ctxt.append(np.linalg.norm(ball_pos - self.context))
self.ball_traj[step, :] = ball_pos self.ball_traj[step, :] = np.copy(ball_pos)
self.cup_traj[step, :] = goal_pos self.cup_traj[step, :] = np.copy(goal_pos) # ?
cup_quat = np.copy(sim.data.body_xquat[sim.model._body_name2id["cup"]])
self.cup_angles.append(np.arctan2(2 * (cup_quat[0] * cup_quat[1] + cup_quat[2] * cup_quat[3]),
1 - 2 * (cup_quat[1] ** 2 + cup_quat[2] ** 2)))
# Determine the first time when ball is in cup # Determine the first time when ball is in cup
if not self.ball_in_cup: if not self.ball_in_cup:
@ -77,25 +83,29 @@ class BallInACupReward(alr_reward_fct.AlrReward):
self.dist_ctxt = dist_to_ctxt self.dist_ctxt = dist_to_ctxt
if step == self.sim_time - 1: if step == self.sim_time - 1:
t_min_dist = np.argmin(self.dists)
angle_min_dist = self.cup_angles[t_min_dist]
cost_angle = (angle_min_dist - np.pi / 2) ** 2
min_dist = np.min(self.dists) min_dist = np.min(self.dists)
dist_final = self.dists_final[-1] dist_final = self.dists_final[-1]
# dist_ctxt = self.dists_ctxt[-1] # dist_ctxt = self.dists_ctxt[-1]
# max distance between ball and cup and cup height at that time # # max distance between ball and cup and cup height at that time
ball_to_cup_diff = self.ball_traj[:, 2] - self.cup_traj[:, 2] # ball_to_cup_diff = self.ball_traj[:, 2] - self.cup_traj[:, 2]
t_max_diff = np.argmax(ball_to_cup_diff) # t_max_diff = np.argmax(ball_to_cup_diff)
t_max_ball_height = np.argmax(self.ball_traj[:, 2]) # t_max_ball_height = np.argmax(self.ball_traj[:, 2])
max_ball_height = np.max(self.ball_traj[:, 2]) # max_ball_height = np.max(self.ball_traj[:, 2])
# cost = self._get_stage_wise_cost(ball_in_cup, min_dist, dist_final, dist_ctxt) # cost = self._get_stage_wise_cost(ball_in_cup, min_dist, dist_final, dist_ctxt)
cost = 0.3 * min_dist + 0.3 * dist_final + 0.3 * np.minimum(self.dist_ctxt, 3) cost = 0.5 * min_dist + 0.5 * dist_final + 0.3 * np.minimum(self.dist_ctxt, 3) + 0.01 * cost_angle
reward = np.exp(-1 * cost) - 1e-4 * action_cost reward = np.exp(-2 * cost) - 1e-3 * action_cost
if max_ball_height < self.context[2] or ball_to_cup_diff[t_max_ball_height] < 0: # if max_ball_height < self.context[2] or ball_to_cup_diff[t_max_ball_height] < 0:
reward -= 1 # reward -= 1
success = dist_final < 0.05 and self.dist_ctxt < 0.05 success = dist_final < 0.05 and self.dist_ctxt < 0.05
else: else:
reward = - 1e-4 * action_cost reward = - 1e-3 * action_cost
success = False success = False
return reward, success, stop_sim return reward, success, stop_sim

View File

@ -1,151 +0,0 @@
from alr_envs.mujoco import alr_mujoco_env
from gym import utils
import os
import numpy as np
class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
def __init__(self, n_substeps=4, apply_gravity_comp=True, reward_function=None):
self._steps = 0
self.xml_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets",
"biac_base" + ".xml")
self.start_pos = np.array([0.0, 0.58760536, 0.0, 1.36004913, 0.0, -0.32072943, -1.57])
self.start_vel = np.zeros(7)
self._q_pos = []
self._q_vel = []
# self.weight_matrix_scale = 50
self.max_ctrl = np.array([150., 125., 40., 60., 5., 5., 2.])
self.p_gains = 1 / self.max_ctrl * np.array([200, 300, 100, 100, 10, 10, 2.5])
self.d_gains = 1 / self.max_ctrl * np.array([7, 15, 5, 2.5, 0.3, 0.3, 0.05])
self.j_min = np.array([-2.6, -1.985, -2.8, -0.9, -4.55, -1.5707, -2.7])
self.j_max = np.array([2.6, 1.985, 2.8, 3.14159, 1.25, 1.5707, 2.7])
self.context = None
utils.EzPickle.__init__(self)
alr_mujoco_env.AlrMujocoEnv.__init__(self,
self.xml_path,
apply_gravity_comp=apply_gravity_comp,
n_substeps=n_substeps)
self.sim_time = 8 # seconds
self.sim_steps = int(self.sim_time / self.dt)
if reward_function is None:
from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward_simple import BallInACupReward
reward_function = BallInACupReward
self.reward_function = reward_function(self.sim_steps)
@property
def current_pos(self):
return self.sim.data.qpos[0:7].copy()
@property
def current_vel(self):
return self.sim.data.qvel[0:7].copy()
def configure(self, context):
self.context = context
self.reward_function.reset(context)
def reset_model(self):
init_pos_all = self.init_qpos.copy()
init_pos_robot = self.start_pos
init_vel = np.zeros_like(init_pos_all)
ball_id = self.sim.model._body_name2id["ball"]
goal_final_id = self.sim.model._site_name2id["cup_goal_final"]
self._steps = 0
self._q_pos = []
self._q_vel = []
start_pos = init_pos_all
start_pos[0:7] = init_pos_robot
self.set_state(start_pos, init_vel)
return self._get_obs()
def step(self, a):
reward_dist = 0.0
angular_vel = 0.0
reward_ctrl = - np.square(a).sum()
crash = self.do_simulation(a)
joint_cons_viol = self.check_traj_in_joint_limits()
self._q_pos.append(self.sim.data.qpos[0:7].ravel().copy())
self._q_vel.append(self.sim.data.qvel[0:7].ravel().copy())
ob = self._get_obs()
if not crash and not joint_cons_viol:
reward, success, collision = self.reward_function.compute_reward(a, self.sim, self._steps)
done = success or self._steps == self.sim_steps - 1 or collision
self._steps += 1
else:
reward = -1000
success = False
done = True
return ob, reward, done, dict(reward_dist=reward_dist,
reward_ctrl=reward_ctrl,
velocity=angular_vel,
traj=self._q_pos, is_success=success,
is_collided=crash or joint_cons_viol)
def check_traj_in_joint_limits(self):
return any(self.current_pos > self.j_max) or any(self.current_pos < self.j_min)
# TODO: extend observation space
def _get_obs(self):
theta = self.sim.data.qpos.flat[:7]
return np.concatenate([
np.cos(theta),
np.sin(theta),
# self.get_body_com("target"), # only return target to make problem harder
[self._steps],
])
def extend_des_pos(self, des_pos):
des_pos_full = self.start_pos.copy()
des_pos_full[1] = des_pos[0]
des_pos_full[3] = des_pos[1]
des_pos_full[5] = des_pos[2]
return des_pos_full
def extend_des_vel(self, des_vel):
des_vel_full = self.start_vel.copy()
des_vel_full[1] = des_vel[0]
des_vel_full[3] = des_vel[1]
des_vel_full[5] = des_vel[2]
return des_vel_full
if __name__ == "__main__":
from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward_simple import BallInACupReward
env = ALRBallInACupEnv(reward_function=BallInACupReward)
env.configure(None)
env.reset()
env.render()
for i in range(4000):
# objective.load_result("/tmp/cma")
# test with random actions
# ac = 0.1 * env.action_space.sample()
# ac = -np.array([i, i, i]) / 10000 + np.array([env.start_pos[1], env.start_pos[3], env.start_pos[5]])
# ac = np.array([0., -10, 0, -1, 0, 1, 0])
ac = np.array([0.,0., 0, 0, 0, 0, 0])
# ac[0] += np.pi/2
obs, rew, d, info = env.step(ac)
env.render()
print(rew)
if d:
break
env.close()

View File

@ -1,7 +1,6 @@
from alr_envs.utils.wrapper.detpmp_wrapper import DetPMPWrapper from alr_envs.utils.wrapper.detpmp_wrapper import DetPMPWrapper
from alr_envs.utils.wrapper.dmp_wrapper import DmpWrapper from alr_envs.utils.wrapper.dmp_wrapper import DmpWrapper
from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup import ALRBallInACupEnv from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup import ALRBallInACupEnv
from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_simple import ALRBallInACupEnv as ALRBallInACupEnvSimple
def make_contextual_env(rank, seed=0): def make_contextual_env(rank, seed=0):
@ -16,7 +15,7 @@ def make_contextual_env(rank, seed=0):
""" """
def _init(): def _init():
env = ALRBallInACupEnv() env = ALRBallInACupEnv(reward_type="contextual_goal")
env = DetPMPWrapper(env, env = DetPMPWrapper(env,
num_dof=7, num_dof=7,
@ -50,7 +49,7 @@ def make_env(rank, seed=0):
""" """
def _init(): def _init():
env = ALRBallInACupEnvSimple() env = ALRBallInACupEnv(reward_type="simple")
env = DetPMPWrapper(env, env = DetPMPWrapper(env,
num_dof=7, num_dof=7,
@ -84,7 +83,7 @@ def make_simple_env(rank, seed=0):
""" """
def _init(): def _init():
env = ALRBallInACupEnvSimple() env = ALRBallInACupEnv(reward_type="simple")
env = DetPMPWrapper(env, env = DetPMPWrapper(env,
num_dof=3, num_dof=3,
@ -119,7 +118,7 @@ def make_simple_dmp_env(rank, seed=0):
""" """
def _init(): def _init():
_env = ALRBallInACupEnvSimple() _env = ALRBallInACupEnv(reward_type="simple")
_env = DmpWrapper(_env, _env = DmpWrapper(_env,
num_dof=3, num_dof=3,

View File

@ -0,0 +1,82 @@
import gym
from gym.vector.async_vector_env import AsyncVectorEnv
import numpy as np
from _collections import defaultdict
def make_env(env_id, rank, seed=0):
env = gym.make(env_id)
env.seed(seed + rank)
return lambda: env
def split_array(ary, size):
n_samples = len(ary)
if n_samples < size:
tmp = np.zeros((size, ary.shape[1]))
tmp[0:n_samples] = ary
return [tmp]
elif n_samples == size:
return [ary]
else:
repeat = int(np.ceil(n_samples / size))
split = [k * size for k in range(1, repeat)]
sub_arys = np.split(ary, split)
if n_samples % repeat != 0:
tmp = np.zeros_like(sub_arys[0])
last = sub_arys[-1]
tmp[0: len(last)] = last
sub_arys[-1] = tmp
return sub_arys
def _flatten_list(l):
assert isinstance(l, (list, tuple))
assert len(l) > 0
assert all([len(l_) > 0 for l_ in l])
return [l__ for l_ in l for l__ in l_]
class AlrMpEnvSampler:
"""
An asynchronous sampler for MPWrapper environments. A sampler object can be called with a set of parameters and
returns the corresponding final obs, rewards, dones and info dicts.
"""
def __init__(self, env_id, num_envs, seed=0):
self.num_envs = num_envs
self.env = AsyncVectorEnv([make_env(env_id, seed, i) for i in range(num_envs)])
def __call__(self, params):
params = np.atleast_2d(params)
n_samples = params.shape[0]
split_params = split_array(params, self.num_envs)
vals = defaultdict(list)
for p in split_params:
obs, reward, done, info = self.env.step(p)
vals['obs'].append(obs)
vals['reward'].append(reward)
vals['done'].append(done)
vals['info'].append(info)
# do not return values above threshold
return np.vstack(vals['obs'])[:n_samples], np.hstack(vals['reward'])[:n_samples],\
_flatten_list(vals['done'])[:n_samples], _flatten_list(vals['info'])[:n_samples]
if __name__ == "__main__":
env_name = "alr_envs:HoleReacherDMP-v0"
n_cpu = 8
dim = 30
n_samples = 20
sampler = AlrMpEnvSampler(env_name, num_envs=n_cpu)
thetas = np.random.randn(n_samples, dim) # usually form a search distribution
_, rewards, __, ___ = sampler(thetas)
print(rewards)

View File

@ -11,8 +11,9 @@ class DmpWrapper(MPWrapper):
def __init__(self, env: gym.Env, num_dof: int, num_basis: int, start_pos: np.ndarray = None, def __init__(self, env: gym.Env, num_dof: int, num_basis: int, start_pos: np.ndarray = None,
final_pos: np.ndarray = None, duration: int = 1, alpha_phase: float = 2., dt: float = None, final_pos: np.ndarray = None, duration: int = 1, alpha_phase: float = 2., dt: float = None,
learn_goal: bool = False, post_traj_time: float = 0., policy_type: str = None, learn_goal: bool = False, return_to_start: bool = False, post_traj_time: float = 0.,
weights_scale: float = 1., goal_scale: float = 1., bandwidth_factor: float = 3.): weights_scale: float = 1., goal_scale: float = 1., bandwidth_factor: float = 3.,
policy_type: str = None):
""" """
This Wrapper generates a trajectory based on a DMP and will only return episodic performances. This Wrapper generates a trajectory based on a DMP and will only return episodic performances.
@ -34,8 +35,13 @@ class DmpWrapper(MPWrapper):
self.learn_goal = learn_goal self.learn_goal = learn_goal
dt = env.dt if hasattr(env, "dt") else dt dt = env.dt if hasattr(env, "dt") else dt
assert dt is not None assert dt is not None
start_pos = env.start_pos if hasattr(env, "start_pos") else start_pos start_pos = start_pos if start_pos is not None else env.start_pos if hasattr(env, "start_pos") else None
assert start_pos is not None assert start_pos is not None
if learn_goal:
final_pos = np.zeros_like(start_pos) # arbitrary, will be learned
else:
final_pos = final_pos if final_pos is not None else start_pos if return_to_start else None
assert final_pos is not None
self.t = np.linspace(0, duration, int(duration / dt)) self.t = np.linspace(0, duration, int(duration / dt))
self.goal_scale = goal_scale self.goal_scale = goal_scale

View File

@ -46,8 +46,9 @@ class MPWrapper(gym.Wrapper, ABC):
rewards = [] rewards = []
dones = [] dones = []
infos = [] infos = []
for p, c in zip(params, contexts): # for p, c in zip(params, contexts):
self.configure(c) for p in params:
# self.configure(c)
ob, reward, done, info = self.step(p) ob, reward, done, info = self.step(p)
obs.append(ob) obs.append(ob)
rewards.append(reward) rewards.append(reward)