Add INNER_REWARD_SCALING and More flexible reward for ant

commit 91249105b8 (parent d5cc345080)
@@ -5,11 +5,14 @@ from mujoco_maze.maze_task import TaskRegistry
 for maze_id in TaskRegistry.keys():
     for i, task_cls in enumerate(TaskRegistry.tasks(maze_id)):
-        scaling = task_cls.SCALING.ant
         gym.envs.register(
             id=f"Ant{maze_id}-v{i}",
             entry_point="mujoco_maze.ant_maze_env:AntMazeEnv",
-            kwargs=dict(maze_task=task_cls, maze_size_scaling=scaling),
+            kwargs=dict(
+                maze_task=task_cls,
+                maze_size_scaling=task_cls.MAZE_SIZE_SCALING.ant,
+                inner_reward_scaling=task_cls.INNER_REWARD_SCALING,
+            ),
             max_episode_steps=1000,
             reward_threshold=task_cls.REWARD_THRESHOLD,
         )
@@ -19,7 +22,11 @@ for maze_id in TaskRegistry.keys():
         gym.envs.register(
             id=f"Point{maze_id}-v{i}",
             entry_point="mujoco_maze.point_maze_env:PointMazeEnv",
-            kwargs=dict(maze_task=task_cls, maze_size_scaling=scaling),
+            kwargs=dict(
+                maze_task=task_cls,
+                maze_size_scaling=task_cls.MAZE_SIZE_SCALING.point,
+                inner_reward_scaling=task_cls.INNER_REWARD_SCALING,
+            ),
             max_episode_steps=1000,
             reward_threshold=task_cls.REWARD_THRESHOLD,
         )
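Usage of the registered ids is unchanged. A minimal sketch, assuming this registration block lives in the package __init__ so that importing mujoco_maze is enough to register the environments (MuJoCo and mujoco-py are required):

import gym
import mujoco_maze  # noqa: F401  # assumption: importing the package runs the registrations above

env = gym.make("AntUMaze-v0")  # v0 is the dense variant, v1 the sparse one, per TaskRegistry ordering
obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())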
@@ -16,13 +16,24 @@
 """Wrapper for creating the ant environment in gym_mujoco."""

 import math
-from typing import Optional, Tuple
+from typing import Callable, Optional, Tuple

 import numpy as np

 from mujoco_maze.agent_model import AgentModel

+ForwardRewardFn = Callable[[float, float], float]
+
+
+def forward_reward_vabs(xy_velocity: float) -> float:
+    return np.sum(np.abs(xy_velocity))
+
+
+def forward_reward_vnorm(xy_velocity: float) -> float:
+    return np.linalg.norm(xy_velocity)
+
+
 def q_inv(a):
     return [a[0], -a[1], -a[2], -a[3]]

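The two forward-reward helpers differ only in the norm applied to the planar velocity. A quick numeric sketch (the velocity value is made up for illustration):

import numpy as np

xy_velocity = np.array([0.3, -0.4])   # hypothetical planar velocity
np.sum(np.abs(xy_velocity))           # vabs: L1 norm  -> 0.7
np.linalg.norm(xy_velocity)           # vnorm: L2 norm -> 0.5 (the new default)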
@@ -36,31 +47,37 @@ def q_mult(a, b):  # multiply two quaternion


 class AntEnv(AgentModel):
-    FILE = "ant.xml"
-    ORI_IND = 3
+    FILE: str = "ant.xml"
+    ORI_IND: int = 3

-    def __init__(self, file_path: Optional[str] = None) -> None:
+    def __init__(
+        self,
+        file_path: Optional[str] = None,
+        ctrl_cost_weight: float = 0.5,
+        forward_reward_fn: ForwardRewardFn = forward_reward_vnorm,
+    ) -> None:
+        self._ctrl_cost_weight = ctrl_cost_weight
+        self._forward_reward_fn = forward_reward_fn
         super().__init__(file_path, 5)

+    def _forward_reward(self, xy_pos_before: np.ndarray) -> float:
+        xy_pos_after = self.sim.data.qpos[:2].copy()
+        xy_velocity = (xy_pos_after - xy_pos_before) / self.dt
+        return self._forward_reward_fn(xy_velocity)
+
     def step(self, action: np.ndarray) -> Tuple[np.ndarray, float, bool, dict]:
-        xposbefore = self.get_body_com("torso")[0]
+        xy_pos_before = self.sim.data.qpos[:2].copy()
         self.do_simulation(action, self.frame_skip)
-        xposafter = self.get_body_com("torso")[0]
-        forward_reward = (xposafter - xposbefore) / self.dt
-        ctrl_cost = 0.5 * np.square(action).sum()
-        survive_reward = 1.0
-        reward = forward_reward - ctrl_cost + survive_reward
-        _ = self.state_vector()
+        forward_reward = self._forward_reward(xy_pos_before)
+        ctrl_cost = self._ctrl_cost_weight * np.square(action).sum()
         ob = self._get_obs()
         return (
             ob,
-            reward,
+            forward_reward - ctrl_cost,
             False,
-            dict(
-                reward_forward=forward_reward,
-                reward_ctrl=-ctrl_cost,
-                reward_survive=survive_reward,
-            ),
+            dict(reward_forward=forward_reward, reward_ctrl=-ctrl_cost,),
         )

     def _get_obs(self):
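Any callable matching ForwardRewardFn can be supplied, so the ant's locomotion reward is no longer hard-wired. A sketch of a hypothetical alternative (the function name and weights are invented; the maze wrappers would normally construct AntEnv and pass these kwargs through):

import numpy as np

def forward_reward_vmax(xy_velocity: np.ndarray) -> float:
    # Hypothetical variant: reward only the faster of the two planar axes.
    return float(np.max(np.abs(xy_velocity)))

# env = AntEnv(ctrl_cost_weight=0.25, forward_reward_fn=forward_reward_vmax)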
@@ -84,9 +101,6 @@ class AntEnv(AgentModel):
         self.set_state(qpos, qvel)
         return self._get_obs()

-    def viewer_setup(self):
-        self.viewer.cam.distance = self.model.stat.extent * 0.5
-
     def get_ori(self):
         ori = [0, 1, 0, 0]
         ori_ind = self.ORI_IND
@@ -96,7 +110,7 @@ class AntEnv(AgentModel):
         return ori

     def set_xy(self, xy):
-        qpos = np.copy(self.sim.data.qpos)
+        qpos = self.sim.data.qpos.copy()
         qpos[0] = xy[0]
         qpos[1] = xy[1]
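The set_xy change is purely stylistic: for a NumPy array, np.copy(a) and a.copy() both return an independent copy, so behaviour is unchanged.

import numpy as np

a = np.arange(3.0)
b, c = np.copy(a), a.copy()
a[0] = 99.0
assert b[0] == 0.0 and c[0] == 0.0  # neither copy aliases the original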
@@ -44,6 +44,7 @@ class MazeEnv(gym.Env):
         top_down_view: float = False,
         maze_height: float = 0.5,
         maze_size_scaling: float = 4.0,
+        inner_reward_scaling: float = 1.0,
         *args,
         **kwargs,
     ) -> None:
@@ -55,6 +56,7 @@ class MazeEnv(gym.Env):

         self._maze_height = height = maze_height
         self._maze_size_scaling = size_scaling = maze_size_scaling
+        self._inner_reward_scaling = inner_reward_scaling
         self.t = 0  # time steps
         self._n_bins = n_bins
         self._sensor_range = sensor_range * size_scaling
@@ -495,9 +497,10 @@
     def reset(self):
         self.t = 0
         self.wrapped_env.reset()
-        # Sample a new goal
+        # Samples a new goal
         if self._task.sample_goals():
             self.set_marker()
+        # Samples a new start position
         if len(self._init_positions) > 1:
             xy = np.random.choice(self._init_positions)
             self.wrapped_env.set_xy(xy)
@@ -547,7 +550,7 @@
         else:
             inner_next_obs, inner_reward, _, info = self.wrapped_env.step(action)
             next_obs = self._get_obs()
-        inner_reward = self._task.scale_inner_reward(inner_reward)
+        inner_reward = self._inner_reward_scaling * inner_reward
        outer_reward = self._task.reward(next_obs)
         done = self._task.termination(next_obs)
         return next_obs, inner_reward + outer_reward, done, info
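Restating the step logic above: inner_reward is the locomotion reward from the wrapped AntEnv/PointEnv, and its scale now comes from the constructor (filled from the task class during registration) instead of a per-task scale_inner_reward hook, so the reward returned to the agent each step is

    total_reward = self._inner_reward_scaling * inner_reward + self._task.reward(next_obs)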
@@ -47,7 +47,8 @@ class Scaling(NamedTuple):

 class MazeTask(ABC):
     REWARD_THRESHOLD: float
-    SCALING: Scaling = Scaling(8.0, 4.0)
+    MAZE_SIZE_SCALING: Scaling = Scaling(8.0, 4.0)
+    INNER_REWARD_SCALING: float = 1e-4
     OBSERVE_BLOCKS: bool = False
     PUT_SPIN_NEAR_AGENT: bool = False

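Because the registration loops read these class attributes, a task can tune both scalings declaratively. A minimal sketch of a hypothetical subclass (name and values invented; it would still need an entry in TaskRegistry to be registered):

from mujoco_maze.maze_task import Scaling, SingleGoalDenseUMaze  # assumed import path


class SmallDenseUMaze(SingleGoalDenseUMaze):
    # Assuming the Scaling fields are (ant, point), as the 8.0/4.0 defaults suggest.
    MAZE_SIZE_SCALING: Scaling = Scaling(4.0, 4.0)
    INNER_REWARD_SCALING: float = 1e-3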
@@ -64,9 +65,6 @@
             return True
         return False

-    def scale_inner_reward(self, inner_reward: float) -> float:
-        return inner_reward
-
     @abstractmethod
     def reward(self, obs: np.ndarray) -> float:
         pass
@@ -102,6 +100,7 @@ class SingleGoalSparseUMaze(MazeTask):
 class SingleGoalDenseUMaze(SingleGoalSparseUMaze):
     REWARD_THRESHOLD: float = 1000.0

     def reward(self, obs: np.ndarray) -> float:
         return -self.goals[0].euc_dist(obs)

@@ -126,6 +125,7 @@ class SingleGoalSparsePush(SingleGoalSparseUMaze):
 class SingleGoalDensePush(SingleGoalSparsePush):
     REWARD_THRESHOLD: float = 1000.0

     def reward(self, obs: np.ndarray) -> float:
         return -self.goals[0].euc_dist(obs)

@@ -151,12 +151,14 @@ class SingleGoalSparseFall(SingleGoalSparseUMaze):
 class SingleGoalDenseFall(SingleGoalSparseFall):
     REWARD_THRESHOLD: float = 1000.0

     def reward(self, obs: np.ndarray) -> float:
         return -self.goals[0].euc_dist(obs)


 class SingleGoalSparse2Rooms(MazeTask):
     REWARD_THRESHOLD: float = 0.9
+    SCALING: Scaling = Scaling(4.0, 4.0)

     def __init__(self, scale: float) -> None:
         super().__init__(scale)
@@ -185,6 +187,7 @@ class SingleGoalSparse2Rooms(MazeTask):
 class SingleGoalDense2Rooms(SingleGoalSparse2Rooms):
     REWARD_THRESHOLD: float = 1000.0

     def reward(self, obs: np.ndarray) -> float:
         return -self.goals[0].euc_dist(obs)

@@ -197,6 +200,7 @@ class SubGoalSparse2Rooms(SingleGoalSparse2Rooms):

 class SingleGoalSparse4Rooms(MazeTask):
     REWARD_THRESHOLD: float = 0.9
+    MAZE_SIZE_SCALING: Scaling = Scaling(4.0, 4.0)

     def __init__(self, scale: float) -> None:
         super().__init__(scale)
@@ -227,6 +231,7 @@ class SingleGoalSparse4Rooms(MazeTask):
 class SingleGoalDense4Rooms(SingleGoalSparse4Rooms):
     REWARD_THRESHOLD: float = 1000.0

     def reward(self, obs: np.ndarray) -> float:
         return -self.goals[0].euc_dist(obs)

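All of the *Dense* variants above share the same shaping: the reward is the negative Euclidean distance to the goal, so it rises toward zero as the agent closes in. A numeric sketch with a made-up goal, assuming euc_dist measures the distance between the observation's (x, y) and the goal position:

import numpy as np

goal_xy = np.array([2.0, 2.0])   # hypothetical goal position
obs_xy = np.array([2.0, 5.0])    # hypothetical agent position
dense_reward = -np.linalg.norm(obs_xy - goal_xy)  # -> -3.0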
@@ -245,11 +250,7 @@ class TaskRegistry:
         "UMaze": [SingleGoalDenseUMaze, SingleGoalSparseUMaze],
         "Push": [SingleGoalDensePush, SingleGoalSparsePush],
         "Fall": [SingleGoalDenseFall, SingleGoalSparseFall],
-        "2Rooms": [
-            SingleGoalDense2Rooms,
-            SingleGoalSparse2Rooms,
-            SubGoalSparse2Rooms,
-        ],
+        "2Rooms": [SingleGoalDense2Rooms, SingleGoalSparse2Rooms, SubGoalSparse2Rooms,],
         "4Rooms": [SingleGoalSparse4Rooms, SingleGoalDense4Rooms, SubGoalSparse4Rooms],
     }

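Both registration loops key off this registry, so enumerating everything that will be registered, and with which inner-reward scale, is straightforward:

from mujoco_maze.maze_task import TaskRegistry

for maze_id in TaskRegistry.keys():
    for i, task_cls in enumerate(TaskRegistry.tasks(maze_id)):
        print(f"Ant{maze_id}-v{i} / Point{maze_id}-v{i}",
              task_cls.__name__, task_cls.INNER_REWARD_SCALING)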
@@ -25,9 +25,10 @@ from mujoco_maze.agent_model import AgentModel


 class PointEnv(AgentModel):
+    FILE: str = "point.xml"
+    ORI_IND: int = 2
+
     VELOCITY_LIMITS: float = 10.0
-    FILE = "point.xml"
-    ORI_IND = 2

     def __init__(self, file_path: Optional[str] = None):
         super().__init__(file_path, 1)