From f1a96c055b7787831f494cbd41933a221dd46c97 Mon Sep 17 00:00:00 2001
From: Onur
Date: Sun, 29 May 2022 11:58:01 +0200
Subject: [PATCH] safety
---
alr_envs/alr/__init__.py | 90 ++++++++-
alr_envs/alr/mujoco/__init__.py | 2 +-
alr_envs/alr/mujoco/ant_jump/__init__.py | 1 +
.../alr/mujoco/ant_jump/new_mp_wrapper.py | 21 +++
.../assets/beerpong_wo_cup_big_table.xml | 2 +-
alr_envs/alr/mujoco/beerpong/beerpong.py | 29 ++-
.../mujoco/beerpong/beerpong_reward_staged.py | 154 ++++-----------
.../alr/mujoco/beerpong/new_mp_wrapper.py | 6 +
.../assets/{hopper.xml => hopper_jump.xml} | 4 +
.../alr/mujoco/hopper_jump/hopper_jump.py | 178 +++++++++++++++---
.../alr/mujoco/hopper_jump/new_mp_wrapper.py | 12 +-
alr_envs/mp/episodic_wrapper.py | 2 +-
12 files changed, 333 insertions(+), 168 deletions(-)
create mode 100644 alr_envs/alr/mujoco/ant_jump/new_mp_wrapper.py
rename alr_envs/alr/mujoco/hopper_jump/assets/{hopper.xml => hopper_jump.xml} (92%)
diff --git a/alr_envs/alr/__init__.py b/alr_envs/alr/__init__.py
index 1101020..9dc31ae 100644
--- a/alr_envs/alr/__init__.py
+++ b/alr_envs/alr/__init__.py
@@ -252,7 +252,8 @@ register(
max_episode_steps=MAX_EPISODE_STEPS_HOPPERJUMP,
kwargs={
"max_episode_steps": MAX_EPISODE_STEPS_HOPPERJUMP,
- "context": False
+ "context": False,
+ "healthy_rewasrd": 1.0
}
)
register(
@@ -273,6 +274,17 @@ register(
"max_episode_steps": MAX_EPISODE_STEPS_HOPPERJUMP
}
)
+
+register(
+ id='ALRHopperJump-v3',
+ entry_point='alr_envs.alr.mujoco:ALRHopperXYJumpEnv',
+ max_episode_steps=MAX_EPISODE_STEPS_HOPPERJUMP,
+ kwargs={
+ "max_episode_steps": MAX_EPISODE_STEPS_HOPPERJUMP,
+ "context": True,
+ "healthy_reward": 1.0
+ }
+)
# CtxtFree are v0, Contextual are v1
register(
id='ALRHopperJumpOnBox-v0',
@@ -723,6 +735,46 @@ for _v in _versions:
)
ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append(_env_id)
+## AntJump
+_versions = ["v0", "v1"]
+for _v in _versions:
+ _env_id = f'ALRAntJumpProMP-{_v}'
+ register(
+ id= _env_id,
+ entry_point='alr_envs.utils.make_env_helpers:make_mp_env_helper',
+ kwargs={
+ "name": f"alr_envs:ALRAntJump-{_v}",
+ "wrappers": [mujoco.ant_jump.NewMPWrapper],
+ "ep_wrapper_kwargs": {
+ "weight_scale": 1
+ },
+ "movement_primitives_kwargs": {
+ 'movement_primitives_type': 'promp',
+ 'action_dim': 8
+ },
+ "phase_generator_kwargs": {
+ 'phase_generator_type': 'linear',
+ 'delay': 0,
+ 'tau': 10, # initial value
+ 'learn_tau': False,
+ 'learn_delay': False
+ },
+ "controller_kwargs": {
+ 'controller_type': 'motor',
+ "p_gains": np.ones(8),
+ "d_gains": 0.1*np.ones(8),
+ },
+ "basis_generator_kwargs": {
+ 'basis_generator_type': 'zero_rbf',
+ 'num_basis': 5,
+ 'num_basis_zero_start': 2
+ }
+ }
+ )
+ ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append(_env_id)
+
+
+
## HalfCheetahJump
_versions = ["v0", "v1"]
for _v in _versions:
@@ -877,6 +929,42 @@ register(
)
ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append("ALRHopperJumpProMP-v2")
+
+## HopperJump
+register(
+ id= "ALRHopperJumpProMP-v3",
+ entry_point='alr_envs.utils.make_env_helpers:make_mp_env_helper',
+ kwargs={
+ "name": f"alr_envs:ALRHopperJump-v3",
+ "wrappers": [mujoco.hopper_jump.NewMPWrapper],
+ "ep_wrapper_kwargs": {
+ "weight_scale": 1
+ },
+ "movement_primitives_kwargs": {
+ 'movement_primitives_type': 'promp',
+ 'action_dim': 3
+ },
+ "phase_generator_kwargs": {
+ 'phase_generator_type': 'linear',
+ 'delay': 0,
+ 'tau': 2, # initial value
+ 'learn_tau': False,
+ 'learn_delay': False
+ },
+ "controller_kwargs": {
+ 'controller_type': 'motor',
+ "p_gains": np.ones(3),
+ "d_gains": 0.1*np.ones(3),
+ },
+ "basis_generator_kwargs": {
+ 'basis_generator_type': 'zero_rbf',
+ 'num_basis': 5,
+ 'num_basis_zero_start': 2
+ }
+ }
+)
+ALL_ALR_MOTION_PRIMITIVE_ENVIRONMENTS["ProMP"].append("ALRHopperJumpProMP-v3")
+
## HopperJumpOnBox
_versions = ["v0", "v1"]
for _v in _versions:
diff --git a/alr_envs/alr/mujoco/__init__.py b/alr_envs/alr/mujoco/__init__.py
index fb86186..5c04b03 100644
--- a/alr_envs/alr/mujoco/__init__.py
+++ b/alr_envs/alr/mujoco/__init__.py
@@ -5,7 +5,7 @@ from .table_tennis.tt_gym import TTEnvGym
from .beerpong.beerpong import ALRBeerBongEnv
from .ant_jump.ant_jump import ALRAntJumpEnv
from .half_cheetah_jump.half_cheetah_jump import ALRHalfCheetahJumpEnv
-from .hopper_jump.hopper_jump import ALRHopperJumpEnv, ALRHopperJumpRndmPosEnv
+from .hopper_jump.hopper_jump import ALRHopperJumpEnv, ALRHopperJumpRndmPosEnv, ALRHopperXYJumpEnv
from .hopper_jump.hopper_jump_on_box import ALRHopperJumpOnBoxEnv
from .hopper_throw.hopper_throw import ALRHopperThrowEnv
from .hopper_throw.hopper_throw_in_basket import ALRHopperThrowInBasketEnv
diff --git a/alr_envs/alr/mujoco/ant_jump/__init__.py b/alr_envs/alr/mujoco/ant_jump/__init__.py
index c5e6d2f..5d15867 100644
--- a/alr_envs/alr/mujoco/ant_jump/__init__.py
+++ b/alr_envs/alr/mujoco/ant_jump/__init__.py
@@ -1 +1,2 @@
from .mp_wrapper import MPWrapper
+from .new_mp_wrapper import NewMPWrapper
diff --git a/alr_envs/alr/mujoco/ant_jump/new_mp_wrapper.py b/alr_envs/alr/mujoco/ant_jump/new_mp_wrapper.py
new file mode 100644
index 0000000..c33048f
--- /dev/null
+++ b/alr_envs/alr/mujoco/ant_jump/new_mp_wrapper.py
@@ -0,0 +1,21 @@
+from alr_envs.mp.episodic_wrapper import EpisodicWrapper
+from typing import Union, Tuple
+import numpy as np
+
+
+
+class NewMPWrapper(EpisodicWrapper):
+
+ def set_active_obs(self):
+ return np.hstack([
+ [False] * 111, # ant has 111 dimensional observation space !!
+ [True] # goal height
+ ])
+
+ @property
+ def current_pos(self) -> Union[float, int, np.ndarray]:
+ return self.env.sim.data.qpos[7:15].copy()
+
+ @property
+ def current_vel(self) -> Union[float, int, np.ndarray, Tuple]:
+ return self.env.sim.data.qvel[6:14].copy()
diff --git a/alr_envs/alr/mujoco/beerpong/assets/beerpong_wo_cup_big_table.xml b/alr_envs/alr/mujoco/beerpong/assets/beerpong_wo_cup_big_table.xml
index 436b36c..78e2c32 100644
--- a/alr_envs/alr/mujoco/beerpong/assets/beerpong_wo_cup_big_table.xml
+++ b/alr_envs/alr/mujoco/beerpong/assets/beerpong_wo_cup_big_table.xml
@@ -144,7 +144,7 @@
solref="-10000 -100"/>
-
+
diff --git a/alr_envs/alr/mujoco/beerpong/beerpong.py b/alr_envs/alr/mujoco/beerpong/beerpong.py
index 36998b9..017bfcd 100644
--- a/alr_envs/alr/mujoco/beerpong/beerpong.py
+++ b/alr_envs/alr/mujoco/beerpong/beerpong.py
@@ -7,8 +7,12 @@ from gym.envs.mujoco import MujocoEnv
from alr_envs.alr.mujoco.beerpong.beerpong_reward_staged import BeerPongReward
-CUP_POS_MIN = np.array([-0.32, -2.2])
-CUP_POS_MAX = np.array([0.32, -1.2])
+CUP_POS_MIN = np.array([-1.42, -4.05])
+CUP_POS_MAX = np.array([1.42, -1.25])
+
+
+# CUP_POS_MIN = np.array([-0.32, -2.2])
+# CUP_POS_MAX = np.array([0.32, -1.2])
# smaller context space -> Easier task
# CUP_POS_MIN = np.array([-0.16, -2.2])
@@ -24,8 +28,10 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
self.cup_goal_pos = np.array(cup_goal_pos)
self._steps = 0
+ # self.xml_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets",
+ # "beerpong_wo_cup" + ".xml")
self.xml_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets",
- "beerpong_wo_cup" + ".xml")
+ "beerpong_wo_cup_big_table" + ".xml")
self.j_min = np.array([-2.6, -1.985, -2.8, -0.9, -4.55, -1.5707, -2.7])
self.j_max = np.array([2.6, 1.985, 2.8, 3.14159, 1.25, 1.5707, 2.7])
@@ -100,10 +106,6 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
return self._get_obs()
def step(self, a):
- # if a.shape[0] == 8: # we learn also when to release the ball
- # self._release_step = a[-1]
- # self._release_step = np.clip(self._release_step, 50, 250)
- # self.release_step = 0.5/self.dt
reward_dist = 0.0
angular_vel = 0.0
applied_action = a
@@ -170,16 +172,11 @@ class ALRBeerBongEnv(MujocoEnv, utils.EzPickle):
[self._steps],
])
- # TODO
- @property
- def active_obs(self):
- return np.hstack([
- [False] * 7, # cos
- [False] * 7, # sin
- [True] * 2, # xy position of cup
- [False] # env steps
- ])
+class ALRBeerPongStepEnv(ALRBeerBongEnv):
+ def __init__(self, frame_skip=1, apply_gravity_comp=True, noisy=False,
+ rndm_goal=False, cup_goal_pos=None):
+ super(ALRBeerPongStepEnv, self).__init__(frame_skip, apply_gravity_comp, noisy, rndm_goal, cup_goal_pos)
if __name__ == "__main__":
env = ALRBeerBongEnv(rndm_goal=True)
diff --git a/alr_envs/alr/mujoco/beerpong/beerpong_reward_staged.py b/alr_envs/alr/mujoco/beerpong/beerpong_reward_staged.py
index bb5dd3f..cb75127 100644
--- a/alr_envs/alr/mujoco/beerpong/beerpong_reward_staged.py
+++ b/alr_envs/alr/mujoco/beerpong/beerpong_reward_staged.py
@@ -40,6 +40,7 @@ class BeerPongReward:
self.cup_z_axes = None
self.collision_penalty = 500
self.reset(None)
+ self.is_initialized = False
def reset(self, noisy):
self.ball_traj = []
@@ -55,113 +56,60 @@ class BeerPongReward:
self.ball_wall_contact = False
self.ball_cup_contact = False
self.ball_in_cup = False
+ self.dist_ground_cup = -1 # distance floor to cup if first floor contact
self.noisy_bp = noisy
self._t_min_final_dist = -1
def compute_reward(self, env, action):
- self.ball_id = env.sim.model._body_name2id["ball"]
- self.ball_collision_id = env.sim.model._geom_name2id["ball_geom"]
- self.goal_id = env.sim.model._site_name2id["cup_goal_table"]
- self.goal_final_id = env.sim.model._site_name2id["cup_goal_final_table"]
- self.cup_collision_ids = [env.sim.model._geom_name2id[name] for name in self.cup_collision_objects]
- self.cup_table_id = env.sim.model._body_name2id["cup_table"]
- self.table_collision_id = env.sim.model._geom_name2id["table_contact_geom"]
- self.wall_collision_id = env.sim.model._geom_name2id["wall"]
- self.cup_table_collision_id = env.sim.model._geom_name2id["cup_base_table_contact"]
- self.init_ball_pos_site_id = env.sim.model._site_name2id["init_ball_pos_site"]
- self.ground_collision_id = env.sim.model._geom_name2id["ground"]
- self.robot_collision_ids = [env.sim.model._geom_name2id[name] for name in self.robot_collision_objects]
+
+ if not self.is_initialized:
+ self.is_initialized = True
+ self.ball_id = env.sim.model._body_name2id["ball"]
+ self.ball_collision_id = env.sim.model._geom_name2id["ball_geom"]
+ self.goal_id = env.sim.model._site_name2id["cup_goal_table"]
+ self.goal_final_id = env.sim.model._site_name2id["cup_goal_final_table"]
+ self.cup_collision_ids = [env.sim.model._geom_name2id[name] for name in self.cup_collision_objects]
+ self.cup_table_id = env.sim.model._body_name2id["cup_table"]
+ self.table_collision_id = env.sim.model._geom_name2id["table_contact_geom"]
+ self.wall_collision_id = env.sim.model._geom_name2id["wall"]
+ self.cup_table_collision_id = env.sim.model._geom_name2id["cup_base_table_contact"]
+ self.init_ball_pos_site_id = env.sim.model._site_name2id["init_ball_pos_site"]
+ self.ground_collision_id = env.sim.model._geom_name2id["ground"]
+ self.robot_collision_ids = [env.sim.model._geom_name2id[name] for name in self.robot_collision_objects]
goal_pos = env.sim.data.site_xpos[self.goal_id]
ball_pos = env.sim.data.body_xpos[self.ball_id]
ball_vel = env.sim.data.body_xvelp[self.ball_id]
goal_final_pos = env.sim.data.site_xpos[self.goal_final_id]
+
+ self._check_contacts(env.sim)
self.dists.append(np.linalg.norm(goal_pos - ball_pos))
self.dists_final.append(np.linalg.norm(goal_final_pos - ball_pos))
-
+ self.dist_ground_cup = np.linalg.norm(ball_pos-goal_pos) \
+ if self.ball_ground_contact_first and self.dist_ground_cup == -1 else self.dist_ground_cup
action_cost = np.sum(np.square(action))
self.action_costs.append(action_cost)
- # ##################### Reward function which forces to bounce once on the table (tanh) ########################
- # if not self.ball_table_contact:
- # self.ball_table_contact = self._check_collision_single_objects(env.sim, self.ball_collision_id,
- # self.table_collision_id)
- #
- # self._is_collided = self._check_collision_with_itself(env.sim, self.robot_collision_ids)
- # if env._steps == env.ep_length - 1 or self._is_collided:
- # min_dist = np.min(self.dists)
- # final_dist = self.dists_final[-1]
- #
- # ball_in_cup = self._check_collision_single_objects(env.sim, self.ball_collision_id,
- # self.cup_table_collision_id)
- #
- # # encourage bounce before falling into cup
- # if not ball_in_cup:
- # if not self.ball_table_contact:
- # reward = 0.2 * (1 - np.tanh(0.5*min_dist)) + 0.1 * (1 - np.tanh(0.5*final_dist))
- # else:
- # reward = (1 - np.tanh(0.5*min_dist)) + 0.5 * (1 - np.tanh(0.5*final_dist))
- # else:
- # if not self.ball_table_contact:
- # reward = 2 * (1 - np.tanh(0.5*final_dist)) + 1 * (1 - np.tanh(0.5*min_dist)) + 1
- # else:
- # reward = 2 * (1 - np.tanh(0.5*final_dist)) + 1 * (1 - np.tanh(0.5*min_dist)) + 3
- #
- # # reward = - 1 * cost - self.collision_penalty * int(self._is_collided)
- # success = ball_in_cup
- # crash = self._is_collided
- # else:
- # reward = - 1e-2 * action_cost
- # success = False
- # crash = False
- # ################################################################################################################
-
- ##################### Reward function which does not force to bounce once on the table (tanh) ################
- # self._check_contacts(env.sim)
- # self._is_collided = self._check_collision_with_itself(env.sim, self.robot_collision_ids)
- # if env._steps == env.ep_length - 1 or self._is_collided:
- # min_dist = np.min(self.dists)
- # final_dist = self.dists_final[-1]
- #
- # # encourage bounce before falling into cup
- # if not self.ball_in_cup:
- # if not self.ball_table_contact and not self.ball_cup_contact and not self.ball_wall_contact:
- # min_dist_coeff, final_dist_coeff, rew_offset = 0.2, 0.1, 0
- # # reward = 0.2 * (1 - np.tanh(0.5*min_dist)) + 0.1 * (1 - np.tanh(0.5*final_dist))
- # else:
- # min_dist_coeff, final_dist_coeff, rew_offset = 1, 0.5, 0
- # # reward = (1 - np.tanh(0.5*min_dist)) + 0.5 * (1 - np.tanh(0.5*final_dist))
- # else:
- # min_dist_coeff, final_dist_coeff, rew_offset = 1, 2, 3
- # # reward = 2 * (1 - np.tanh(0.5*final_dist)) + 1 * (1 - np.tanh(0.5*min_dist)) + 3
- #
- # reward = final_dist_coeff * (1 - np.tanh(0.5 * final_dist)) + min_dist_coeff * (1 - np.tanh(0.5 * min_dist)) \
- # + rew_offset
- # success = self.ball_in_cup
- # crash = self._is_collided
- # else:
- # reward = - 1e-2 * action_cost
- # success = False
- # crash = False
- ################################################################################################################
-
# # ##################### Reward function which does not force to bounce once on the table (quad dist) ############
- self._check_contacts(env.sim)
+
self._is_collided = self._check_collision_with_itself(env.sim, self.robot_collision_ids)
+
if env._steps == env.ep_length - 1 or self._is_collided:
min_dist = np.min(self.dists)
final_dist = self.dists_final[-1]
- # if self.ball_ground_contact_first:
- # min_dist_coeff, final_dist_coeff, rew_offset = 1, 0.5, -6
- # else:
- if not self.ball_in_cup:
- if not self.ball_table_contact and not self.ball_cup_contact and not self.ball_wall_contact:
- min_dist_coeff, final_dist_coeff, rew_offset = 1, 0.5, -4
- else:
- min_dist_coeff, final_dist_coeff, rew_offset = 1, 0.5, -2
+ if self.ball_ground_contact_first:
+ min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 1, 0.5, 2, -4 # relative rew offset when first bounding on ground
+ # min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 1, 0.5, 0, -6 # absolute rew offset when first bouncing on ground
else:
- min_dist_coeff, final_dist_coeff, rew_offset = 0, 1, 0
+ if not self.ball_in_cup:
+ if not self.ball_table_contact and not self.ball_cup_contact and not self.ball_wall_contact:
+ min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 1, 0.5, 0, -4
+ else:
+ min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 1, 0.5, 0, -2
+ else:
+ min_dist_coeff, final_dist_coeff, ground_contact_dist_coeff, rew_offset = 0, 1, 0 ,0
+ # dist_ground_cup = 1 * self.dist_ground_cup
reward = rew_offset - min_dist_coeff * min_dist ** 2 - final_dist_coeff * final_dist ** 2 - \
- 1e-4 * np.mean(action_cost)
+ 1e-4 * np.mean(action_cost) - ground_contact_dist_coeff*self.dist_ground_cup ** 2
# 1e-7*np.mean(action_cost)
# release step punishment
min_time_bound = 0.1
@@ -172,43 +120,13 @@ class BeerPongReward:
reward += release_time_rew
success = self.ball_in_cup
# print('release time :', release_time)
+ # print('dist_ground_cup :', dist_ground_cup)
else:
reward = - 1e-2 * action_cost
# reward = - 1e-4 * action_cost
# reward = 0
success = False
# ################################################################################################################
-
- # # # ##################### Reward function which does not force to bounce once on the table (quad dist) ############
- # self._check_contacts(env.sim)
- # self._is_collided = self._check_collision_with_itself(env.sim, self.robot_collision_ids)
- # if env._steps == env.ep_length - 1 or self._is_collided:
- # min_dist = np.min(self.dists)
- # final_dist = self.dists_final[-1]
- #
- # if not self.ball_in_cup:
- # if not self.ball_table_contact and not self.ball_cup_contact and not self.ball_wall_contact:
- # min_dist_coeff, final_dist_coeff, rew_offset = 1, 0.5, -6
- # else:
- # if self.ball_ground_contact_first:
- # min_dist_coeff, final_dist_coeff, rew_offset = 1, 0.5, -4
- # else:
- # min_dist_coeff, final_dist_coeff, rew_offset = 1, 0.5, -2
- # else:
- # if self.ball_ground_contact_first:
- # min_dist_coeff, final_dist_coeff, rew_offset = 0, 1, -1
- # else:
- # min_dist_coeff, final_dist_coeff, rew_offset = 0, 1, 0
- # reward = rew_offset - min_dist_coeff * min_dist ** 2 - final_dist_coeff * final_dist ** 2 - \
- # 1e-7 * np.mean(action_cost)
- # # 1e-4*np.mean(action_cost)
- # success = self.ball_in_cup
- # else:
- # # reward = - 1e-2 * action_cost
- # # reward = - 1e-4 * action_cost
- # reward = 0
- # success = False
- # ################################################################################################################
infos = {}
infos["success"] = success
infos["is_collided"] = self._is_collided
diff --git a/alr_envs/alr/mujoco/beerpong/new_mp_wrapper.py b/alr_envs/alr/mujoco/beerpong/new_mp_wrapper.py
index 7c4ee9d..9683687 100644
--- a/alr_envs/alr/mujoco/beerpong/new_mp_wrapper.py
+++ b/alr_envs/alr/mujoco/beerpong/new_mp_wrapper.py
@@ -44,6 +44,12 @@ class NewMPWrapper(EpisodicWrapper):
else:
return action, None
+ def set_context(self, context):
+ xyz = np.zeros(3)
+ xyz[:2] = context
+ xyz[-1] = 0.840
+ self.env.env.model.body_pos[self.env.env.cup_table_id] = xyz
+ return self.get_observation_from_step(self.env.env._get_obs())
# def set_action_space(self):
# if self.mp.learn_tau:
# min_action_bounds, max_action_bounds = self.mp.get_param_bounds()
diff --git a/alr_envs/alr/mujoco/hopper_jump/assets/hopper.xml b/alr_envs/alr/mujoco/hopper_jump/assets/hopper_jump.xml
similarity index 92%
rename from alr_envs/alr/mujoco/hopper_jump/assets/hopper.xml
rename to alr_envs/alr/mujoco/hopper_jump/assets/hopper_jump.xml
index f18bc46..3348bab 100644
--- a/alr_envs/alr/mujoco/hopper_jump/assets/hopper.xml
+++ b/alr_envs/alr/mujoco/hopper_jump/assets/hopper_jump.xml
@@ -25,12 +25,16 @@
+
+
+