updates

This commit is contained in:
parent e482fc09f0
commit db001c411f
@@ -71,6 +71,24 @@ register(
     }
 )
 
+register(
+    id='ALRBallInACupSimple-v0',
+    entry_point='alr_envs.mujoco:ALRBallInACupEnv',
+    max_episode_steps=4000,
+    kwargs={
+        "reward_type": "simple"
+    }
+)
+
+register(
+    id='ALRBallInACupGoal-v0',
+    entry_point='alr_envs.mujoco:ALRBallInACupEnv',
+    max_episode_steps=4000,
+    kwargs={
+        "reward_type": "contextual_goal"
+    }
+)
+
 # Classic control
 
 register(
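
The two new registrations above differ only in the reward_type passed through kwargs. For orientation, a minimal usage sketch (assuming the alr_envs package is installed and gym can resolve the entry point):

import gym

env = gym.make("alr_envs:ALRBallInACupSimple-v0")   # reward_type="simple" comes from the registered kwargs
obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())
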
@@ -180,18 +198,36 @@ register(
     }
 )
 
+# register(
+#     id='HoleReacherDetPMP-v0',
+#     entry_point='alr_envs.classic_control.hole_reacher:holereacher_detpmp',
+#     # max_episode_steps=1,
+#     # TODO: add mp kwargs
+# )
+
 register(
-    id='HoleReacherDetPMP-v0',
-    entry_point='alr_envs.classic_control.hole_reacher:holereacher_detpmp',
-    # max_episode_steps=1,
-    # TODO: add mp kwargs
+    id='ALRBallInACupSimpleDMP-v0',
+    entry_point='alr_envs.utils.make_env_helpers:make_dmp_env',
+    kwargs={
+        "name": "alr_envs:ALRBallInACupSimple-v0",
+        "num_dof": 3,
+        "num_basis": 5,
+        "duration": 3.5,
+        "post_traj_time": 4.5,
+        "learn_goal": False,
+        "alpha_phase": 3,
+        "bandwidth_factor": 2.5,
+        "policy_type": "motor",
+        "weights_scale": 100,
+        "return_to_start": True
+    }
 )
 
 register(
-    id='BiacSimpleDMP-v0',
+    id='ALRBallInACupGoalDMP-v0',
     entry_point='alr_envs.utils.make_env_helpers:make_dmp_env',
     kwargs={
-        "name": "alr_envs:HoleReacher-v0",
+        "name": "alr_envs:ALRBallInACupGoal-v0",
         "num_dof": 5,
         "num_basis": 5,
         "duration": 2,
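
The DMP registrations route through make_dmp_env, so a single step of the wrapped env consumes a full parameter vector and returns an episodic result. A hedged sketch (the 15-dimensional vector assumes num_dof=3 times num_basis=5 with learn_goal=False, as configured above):

import gym
import numpy as np

env = gym.make("alr_envs:ALRBallInACupSimpleDMP-v0")
weights = np.random.randn(15)                        # 3 DoF x 5 basis functions
obs, episodic_reward, done, info = env.step(weights)
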
@@ -1,2 +1,3 @@
 from alr_envs.mujoco.reacher.alr_reacher import ALRReacherEnv
 from alr_envs.mujoco.balancing import BalancingEnv
+from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup import ALRBallInACupEnv
@@ -67,6 +67,9 @@ class AlrMujocoEnv(gym.Env):
         self.init_qpos = self.sim.data.qpos.ravel().copy()
         self.init_qvel = self.sim.data.qvel.ravel().copy()
 
+        self._start_pos = None
+        self._start_vel = None
+
         self._set_action_space()
 
         # action = self.action_space.sample()
@@ -93,6 +96,20 @@ class AlrMujocoEnv(gym.Env):
         """
         return self.sim.data.qvel
 
+    @property
+    def start_pos(self):
+        """
+        Start position of the agent, for example joint angles of a Panda robot. Necessary for MP wrapped envs.
+        """
+        return self._start_pos
+
+    @property
+    def start_vel(self):
+        """
+        Start velocity of the agent. Necessary for MP wrapped envs.
+        """
+        return self._start_vel
+
     def extend_des_pos(self, des_pos):
         """
         In a simplified environment, the actions may only control a subset of all the joints.
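
These properties are what the MP wrappers later query (see the DmpWrapper hunk further down, which falls back to env.start_pos). A minimal, hypothetical subclass sketch showing how a concrete task is expected to fill them in:

import numpy as np
from alr_envs.mujoco import alr_mujoco_env

class MyReachingEnv(alr_mujoco_env.AlrMujocoEnv):      # hypothetical example, not part of the commit
    def __init__(self, xml_path):
        super().__init__(xml_path, apply_gravity_comp=True, n_substeps=4)
        self._start_pos = np.zeros(7)                  # joint configuration the MP trajectory starts from
        self._start_vel = np.zeros(7)
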
@@ -1 +1 @@
-from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_simple import ALRBallInACupEnv
@@ -5,15 +5,12 @@ from alr_envs.mujoco import alr_mujoco_env
 
 
 class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
-    def __init__(self, n_substeps=4, apply_gravity_comp=True, reward_function=None):
+    def __init__(self, n_substeps=4, apply_gravity_comp=True, reward_type: str = None, context: np.ndarray = None):
         self._steps = 0
 
         self.xml_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets",
                                      "biac_base" + ".xml")
 
-        self.start_pos = np.array([0.0, 0.58760536, 0.0, 1.36004913, 0.0, -0.32072943, -1.57])
-        self.start_vel = np.zeros(7)
-
         self._q_pos = []
         self._q_vel = []
         # self.weight_matrix_scale = 50
@@ -31,13 +28,21 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
                                              self.xml_path,
                                              apply_gravity_comp=apply_gravity_comp,
                                              n_substeps=n_substeps)
+        self._start_pos = np.array([0.0, 0.58760536, 0.0, 1.36004913, 0.0, -0.32072943, -1.57])
+        self._start_vel = np.zeros(7)
 
         self.sim_time = 8 # seconds
         self.sim_steps = int(self.sim_time / self.dt)
-        if reward_function is None:
+        if reward_type == "simple":
             from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward_simple import BallInACupReward
             reward_function = BallInACupReward
+        elif reward_type == "contextual_goal":
+            from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward import BallInACupReward
+            reward_function = BallInACupReward
+        else:
+            raise ValueError("Unknown reward type")
         self.reward_function = reward_function(self.sim_steps)
+        self.configure(context)
 
     @property
     def current_pos(self):
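
With this change the reward implementation is selected by name instead of being injected. A short sketch of direct construction mirroring the two registered variants (the context vector is a hypothetical 3-D goal position, inferred from how the contextual reward uses it):

import numpy as np
from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup import ALRBallInACupEnv

env_simple = ALRBallInACupEnv(reward_type="simple")
env_goal = ALRBallInACupEnv(reward_type="contextual_goal",
                            context=np.array([-0.2, -0.1, 0.1]))   # hypothetical goal position
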
@@ -97,6 +102,7 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
     def check_traj_in_joint_limits(self):
         return any(self.current_pos > self.j_max) or any(self.current_pos < self.j_min)
 
+    # TODO: extend observation space
     def _get_obs(self):
         theta = self.sim.data.qpos.flat[:7]
         return np.concatenate([
@@ -106,6 +112,20 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
             [self._steps],
         ])
 
+    # These functions are for the task with 3 joint actuations
+    def extend_des_pos(self, des_pos):
+        des_pos_full = self.start_pos.copy()
+        des_pos_full[1] = des_pos[0]
+        des_pos_full[3] = des_pos[1]
+        des_pos_full[5] = des_pos[2]
+        return des_pos_full
+
+    def extend_des_vel(self, des_vel):
+        des_vel_full = self.start_vel.copy()
+        des_vel_full[1] = des_vel[0]
+        des_vel_full[3] = des_vel[1]
+        des_vel_full[5] = des_vel[2]
+        return des_vel_full
 
 
 if __name__ == "__main__":
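
The two helpers above expand the reduced 3-DoF action onto the 7-DoF arm: joints 1, 3 and 5 are driven, the rest stay at the start configuration. The same mapping in isolation (the desired values are arbitrary illustration numbers):

import numpy as np

start_pos = np.array([0.0, 0.58760536, 0.0, 1.36004913, 0.0, -0.32072943, -1.57])
des_pos = np.array([0.6, 1.3, -0.3])      # reduced 3-DoF command
des_pos_full = start_pos.copy()
des_pos_full[[1, 3, 5]] = des_pos         # equivalent to extend_des_pos(des_pos)
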
@@ -40,9 +40,12 @@ class BallInACupReward(alr_reward_fct.AlrReward):
         self.ball_in_cup = False
         self.ball_above_threshold = False
         self.dist_ctxt = 3
+        self.action_costs = []
+        self.cup_angles = []
 
     def compute_reward(self, action, sim, step):
         action_cost = np.sum(np.square(action))
+        self.action_costs.append(action_cost)
 
         stop_sim = False
         success = False
@@ -54,7 +57,7 @@ class BallInACupReward(alr_reward_fct.AlrReward):
             self.collision_ids = [sim.model._geom_name2id[name] for name in self.collision_objects]
 
         if self.check_collision(sim):
-            reward = - 1e-4 * action_cost - 1000
+            reward = - 1e-3 * action_cost - 1000
             stop_sim = True
             return reward, success, stop_sim
 
@@ -65,8 +68,11 @@ class BallInACupReward(alr_reward_fct.AlrReward):
         self.dists.append(np.linalg.norm(goal_pos - ball_pos))
         self.dists_final.append(np.linalg.norm(goal_final_pos - ball_pos))
         self.dists_ctxt.append(np.linalg.norm(ball_pos - self.context))
-        self.ball_traj[step, :] = ball_pos
-        self.cup_traj[step, :] = goal_pos
+        self.ball_traj[step, :] = np.copy(ball_pos)
+        self.cup_traj[step, :] = np.copy(goal_pos) # ?
+        cup_quat = np.copy(sim.data.body_xquat[sim.model._body_name2id["cup"]])
+        self.cup_angles.append(np.arctan2(2 * (cup_quat[0] * cup_quat[1] + cup_quat[2] * cup_quat[3]),
+                                          1 - 2 * (cup_quat[1] ** 2 + cup_quat[2] ** 2)))
 
         # Determine the first time when ball is in cup
         if not self.ball_in_cup:
@@ -77,25 +83,29 @@ class BallInACupReward(alr_reward_fct.AlrReward):
                 self.dist_ctxt = dist_to_ctxt
 
         if step == self.sim_time - 1:
+            t_min_dist = np.argmin(self.dists)
+            angle_min_dist = self.cup_angles[t_min_dist]
+            cost_angle = (angle_min_dist - np.pi / 2) ** 2
+
             min_dist = np.min(self.dists)
             dist_final = self.dists_final[-1]
             # dist_ctxt = self.dists_ctxt[-1]
 
-            # max distance between ball and cup and cup height at that time
-            ball_to_cup_diff = self.ball_traj[:, 2] - self.cup_traj[:, 2]
-            t_max_diff = np.argmax(ball_to_cup_diff)
-            t_max_ball_height = np.argmax(self.ball_traj[:, 2])
-            max_ball_height = np.max(self.ball_traj[:, 2])
+            # # max distance between ball and cup and cup height at that time
+            # ball_to_cup_diff = self.ball_traj[:, 2] - self.cup_traj[:, 2]
+            # t_max_diff = np.argmax(ball_to_cup_diff)
+            # t_max_ball_height = np.argmax(self.ball_traj[:, 2])
+            # max_ball_height = np.max(self.ball_traj[:, 2])
 
             # cost = self._get_stage_wise_cost(ball_in_cup, min_dist, dist_final, dist_ctxt)
-            cost = 0.3 * min_dist + 0.3 * dist_final + 0.3 * np.minimum(self.dist_ctxt, 3)
-            reward = np.exp(-1 * cost) - 1e-4 * action_cost
-            if max_ball_height < self.context[2] or ball_to_cup_diff[t_max_ball_height] < 0:
-                reward -= 1
+            cost = 0.5 * min_dist + 0.5 * dist_final + 0.3 * np.minimum(self.dist_ctxt, 3) + 0.01 * cost_angle
+            reward = np.exp(-2 * cost) - 1e-3 * action_cost
+            # if max_ball_height < self.context[2] or ball_to_cup_diff[t_max_ball_height] < 0:
+            #     reward -= 1
 
             success = dist_final < 0.05 and self.dist_ctxt < 0.05
         else:
-            reward = - 1e-4 * action_cost
+            reward = - 1e-3 * action_cost
             success = False
 
         return reward, success, stop_sim
@@ -1,151 +0,0 @@
-from alr_envs.mujoco import alr_mujoco_env
-from gym import utils
-import os
-import numpy as np
-
-
-class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
-    def __init__(self, n_substeps=4, apply_gravity_comp=True, reward_function=None):
-        self._steps = 0
-
-        self.xml_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets",
-                                     "biac_base" + ".xml")
-
-        self.start_pos = np.array([0.0, 0.58760536, 0.0, 1.36004913, 0.0, -0.32072943, -1.57])
-        self.start_vel = np.zeros(7)
-
-        self._q_pos = []
-        self._q_vel = []
-        # self.weight_matrix_scale = 50
-        self.max_ctrl = np.array([150., 125., 40., 60., 5., 5., 2.])
-        self.p_gains = 1 / self.max_ctrl * np.array([200, 300, 100, 100, 10, 10, 2.5])
-        self.d_gains = 1 / self.max_ctrl * np.array([7, 15, 5, 2.5, 0.3, 0.3, 0.05])
-
-        self.j_min = np.array([-2.6, -1.985, -2.8, -0.9, -4.55, -1.5707, -2.7])
-        self.j_max = np.array([2.6, 1.985, 2.8, 3.14159, 1.25, 1.5707, 2.7])
-
-        self.context = None
-
-        utils.EzPickle.__init__(self)
-        alr_mujoco_env.AlrMujocoEnv.__init__(self,
-                                             self.xml_path,
-                                             apply_gravity_comp=apply_gravity_comp,
-                                             n_substeps=n_substeps)
-
-        self.sim_time = 8 # seconds
-        self.sim_steps = int(self.sim_time / self.dt)
-        if reward_function is None:
-            from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward_simple import BallInACupReward
-            reward_function = BallInACupReward
-        self.reward_function = reward_function(self.sim_steps)
-
-    @property
-    def current_pos(self):
-        return self.sim.data.qpos[0:7].copy()
-
-    @property
-    def current_vel(self):
-        return self.sim.data.qvel[0:7].copy()
-
-    def configure(self, context):
-        self.context = context
-        self.reward_function.reset(context)
-
-    def reset_model(self):
-        init_pos_all = self.init_qpos.copy()
-        init_pos_robot = self.start_pos
-        init_vel = np.zeros_like(init_pos_all)
-        ball_id = self.sim.model._body_name2id["ball"]
-        goal_final_id = self.sim.model._site_name2id["cup_goal_final"]
-
-        self._steps = 0
-        self._q_pos = []
-        self._q_vel = []
-
-        start_pos = init_pos_all
-        start_pos[0:7] = init_pos_robot
-
-        self.set_state(start_pos, init_vel)
-
-        return self._get_obs()
-
-    def step(self, a):
-        reward_dist = 0.0
-        angular_vel = 0.0
-        reward_ctrl = - np.square(a).sum()
-
-        crash = self.do_simulation(a)
-        joint_cons_viol = self.check_traj_in_joint_limits()
-
-        self._q_pos.append(self.sim.data.qpos[0:7].ravel().copy())
-        self._q_vel.append(self.sim.data.qvel[0:7].ravel().copy())
-
-        ob = self._get_obs()
-
-        if not crash and not joint_cons_viol:
-            reward, success, collision = self.reward_function.compute_reward(a, self.sim, self._steps)
-            done = success or self._steps == self.sim_steps - 1 or collision
-            self._steps += 1
-        else:
-            reward = -1000
-            success = False
-            done = True
-        return ob, reward, done, dict(reward_dist=reward_dist,
-                                      reward_ctrl=reward_ctrl,
-                                      velocity=angular_vel,
-                                      traj=self._q_pos, is_success=success,
-                                      is_collided=crash or joint_cons_viol)
-
-    def check_traj_in_joint_limits(self):
-        return any(self.current_pos > self.j_max) or any(self.current_pos < self.j_min)
-
-    # TODO: extend observation space
-    def _get_obs(self):
-        theta = self.sim.data.qpos.flat[:7]
-        return np.concatenate([
-            np.cos(theta),
-            np.sin(theta),
-            # self.get_body_com("target"), # only return target to make problem harder
-            [self._steps],
-        ])
-
-    def extend_des_pos(self, des_pos):
-        des_pos_full = self.start_pos.copy()
-        des_pos_full[1] = des_pos[0]
-        des_pos_full[3] = des_pos[1]
-        des_pos_full[5] = des_pos[2]
-        return des_pos_full
-
-    def extend_des_vel(self, des_vel):
-        des_vel_full = self.start_vel.copy()
-        des_vel_full[1] = des_vel[0]
-        des_vel_full[3] = des_vel[1]
-        des_vel_full[5] = des_vel[2]
-        return des_vel_full
-
-
-if __name__ == "__main__":
-    from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_reward_simple import BallInACupReward
-
-    env = ALRBallInACupEnv(reward_function=BallInACupReward)
-    env.configure(None)
-    env.reset()
-    env.render()
-    for i in range(4000):
-        # objective.load_result("/tmp/cma")
-        # test with random actions
-        # ac = 0.1 * env.action_space.sample()
-        # ac = -np.array([i, i, i]) / 10000 + np.array([env.start_pos[1], env.start_pos[3], env.start_pos[5]])
-        # ac = np.array([0., -10, 0, -1, 0, 1, 0])
-        ac = np.array([0.,0., 0, 0, 0, 0, 0])
-        # ac[0] += np.pi/2
-        obs, rew, d, info = env.step(ac)
-        env.render()
-
-        print(rew)
-
-        if d:
-            break
-
-    env.close()
-
@@ -1,7 +1,6 @@
 from alr_envs.utils.wrapper.detpmp_wrapper import DetPMPWrapper
 from alr_envs.utils.wrapper.dmp_wrapper import DmpWrapper
 from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup import ALRBallInACupEnv
-from alr_envs.mujoco.ball_in_a_cup.ball_in_a_cup_simple import ALRBallInACupEnv as ALRBallInACupEnvSimple
 
 
 def make_contextual_env(rank, seed=0):
@@ -16,7 +15,7 @@ def make_contextual_env(rank, seed=0):
     """
 
     def _init():
-        env = ALRBallInACupEnv()
+        env = ALRBallInACupEnv(reward_type="contextual_goal")
 
         env = DetPMPWrapper(env,
                             num_dof=7,
@@ -50,7 +49,7 @@ def make_env(rank, seed=0):
     """
 
    def _init():
-        env = ALRBallInACupEnvSimple()
+        env = ALRBallInACupEnv(reward_type="simple")
 
         env = DetPMPWrapper(env,
                             num_dof=7,
@@ -84,7 +83,7 @@ def make_simple_env(rank, seed=0):
     """
 
     def _init():
-        env = ALRBallInACupEnvSimple()
+        env = ALRBallInACupEnv(reward_type="simple")
 
         env = DetPMPWrapper(env,
                             num_dof=3,
@@ -119,7 +118,7 @@ def make_simple_dmp_env(rank, seed=0):
     """
 
     def _init():
-        _env = ALRBallInACupEnvSimple()
+        _env = ALRBallInACupEnv(reward_type="simple")
 
         _env = DmpWrapper(_env,
                           num_dof=3,
alr_envs/utils/mp_env_async_sampler.py  (new file, +82 lines)
@@ -0,0 +1,82 @@
+import gym
+from gym.vector.async_vector_env import AsyncVectorEnv
+import numpy as np
+from _collections import defaultdict
+
+
+def make_env(env_id, rank, seed=0):
+    env = gym.make(env_id)
+    env.seed(seed + rank)
+    return lambda: env
+
+
+def split_array(ary, size):
+    n_samples = len(ary)
+    if n_samples < size:
+        tmp = np.zeros((size, ary.shape[1]))
+        tmp[0:n_samples] = ary
+        return [tmp]
+    elif n_samples == size:
+        return [ary]
+    else:
+        repeat = int(np.ceil(n_samples / size))
+        split = [k * size for k in range(1, repeat)]
+        sub_arys = np.split(ary, split)
+
+        if n_samples % repeat != 0:
+            tmp = np.zeros_like(sub_arys[0])
+            last = sub_arys[-1]
+            tmp[0: len(last)] = last
+            sub_arys[-1] = tmp
+
+        return sub_arys
+
+
+def _flatten_list(l):
+    assert isinstance(l, (list, tuple))
+    assert len(l) > 0
+    assert all([len(l_) > 0 for l_ in l])
+
+    return [l__ for l_ in l for l__ in l_]
+
+
+class AlrMpEnvSampler:
+    """
+    An asynchronous sampler for MPWrapper environments. A sampler object can be called with a set of parameters and
+    returns the corresponding final obs, rewards, dones and info dicts.
+    """
+    def __init__(self, env_id, num_envs, seed=0):
+        self.num_envs = num_envs
+        self.env = AsyncVectorEnv([make_env(env_id, seed, i) for i in range(num_envs)])
+
+    def __call__(self, params):
+        params = np.atleast_2d(params)
+        n_samples = params.shape[0]
+        split_params = split_array(params, self.num_envs)
+
+        vals = defaultdict(list)
+        for p in split_params:
+            obs, reward, done, info = self.env.step(p)
+            vals['obs'].append(obs)
+            vals['reward'].append(reward)
+            vals['done'].append(done)
+            vals['info'].append(info)
+
+        # do not return values above threshold
+        return np.vstack(vals['obs'])[:n_samples], np.hstack(vals['reward'])[:n_samples],\
+               _flatten_list(vals['done'])[:n_samples], _flatten_list(vals['info'])[:n_samples]
+
+
+if __name__ == "__main__":
+    env_name = "alr_envs:HoleReacherDMP-v0"
+    n_cpu = 8
+    dim = 30
+    n_samples = 20
+
+    sampler = AlrMpEnvSampler(env_name, num_envs=n_cpu)
+
+    thetas = np.random.randn(n_samples, dim) # usually form a search distribution
+
+    _, rewards, __, ___ = sampler(thetas)
+
+    print(rewards)
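
The sampler plugs directly into the episodic envs registered in the first hunk. A hedged usage sketch (worker and sample counts are arbitrary; the 15-dimensional parameters assume the 3 DoF x 5 basis DMP configuration of ALRBallInACupSimpleDMP-v0):

import numpy as np
from alr_envs.utils.mp_env_async_sampler import AlrMpEnvSampler

sampler = AlrMpEnvSampler("alr_envs:ALRBallInACupSimpleDMP-v0", num_envs=4)
thetas = np.random.randn(16, 15)               # 16 candidate DMP weight vectors
obs, rewards, dones, infos = sampler(thetas)   # uneven batches are zero-padded by split_array
print(rewards.shape)                           # (16,)
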
@@ -11,8 +11,9 @@ class DmpWrapper(MPWrapper):
 
     def __init__(self, env: gym.Env, num_dof: int, num_basis: int, start_pos: np.ndarray = None,
                  final_pos: np.ndarray = None, duration: int = 1, alpha_phase: float = 2., dt: float = None,
-                 learn_goal: bool = False, post_traj_time: float = 0., policy_type: str = None,
-                 weights_scale: float = 1., goal_scale: float = 1., bandwidth_factor: float = 3.):
+                 learn_goal: bool = False, return_to_start: bool = False, post_traj_time: float = 0.,
+                 weights_scale: float = 1., goal_scale: float = 1., bandwidth_factor: float = 3.,
+                 policy_type: str = None):
 
         """
         This Wrapper generates a trajectory based on a DMP and will only return episodic performances.
@@ -34,8 +35,13 @@ class DmpWrapper(MPWrapper):
         self.learn_goal = learn_goal
         dt = env.dt if hasattr(env, "dt") else dt
         assert dt is not None
-        start_pos = env.start_pos if hasattr(env, "start_pos") else start_pos
+        start_pos = start_pos if start_pos is not None else env.start_pos if hasattr(env, "start_pos") else None
         assert start_pos is not None
+        if learn_goal:
+            final_pos = np.zeros_like(start_pos)  # arbitrary, will be learned
+        else:
+            final_pos = final_pos if final_pos is not None else start_pos if return_to_start else None
+        assert final_pos is not None
         self.t = np.linspace(0, duration, int(duration / dt))
         self.goal_scale = goal_scale
 
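
The new goal handling resolves final_pos in a fixed order: a learned-goal placeholder first, then an explicit final_pos, then the start position when return_to_start is set, otherwise the assert fires. Condensed into a standalone sketch for clarity:

import numpy as np

def resolve_final_pos(start_pos, final_pos=None, learn_goal=False, return_to_start=False):
    if learn_goal:
        return np.zeros_like(start_pos)                 # placeholder, the goal is learned
    if final_pos is not None:
        return final_pos
    return start_pos if return_to_start else None       # None trips the assert in the wrapper
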
@@ -46,8 +46,9 @@ class MPWrapper(gym.Wrapper, ABC):
         rewards = []
         dones = []
         infos = []
-        for p, c in zip(params, contexts):
-            self.configure(c)
+        # for p, c in zip(params, contexts):
+        for p in params:
+            # self.configure(c)
             ob, reward, done, info = self.step(p)
             obs.append(ob)
             rewards.append(reward)