Add INNER_REWARD_SCALING and more flexible reward for ant
parent d5cc345080
commit 91249105b8
@@ -5,11 +5,14 @@ from mujoco_maze.maze_task import TaskRegistry
 for maze_id in TaskRegistry.keys():
     for i, task_cls in enumerate(TaskRegistry.tasks(maze_id)):
-        scaling = task_cls.SCALING.ant
         gym.envs.register(
             id=f"Ant{maze_id}-v{i}",
             entry_point="mujoco_maze.ant_maze_env:AntMazeEnv",
-            kwargs=dict(maze_task=task_cls, maze_size_scaling=scaling),
+            kwargs=dict(
+                maze_task=task_cls,
+                maze_size_scaling=task_cls.MAZE_SIZE_SCALING.ant,
+                inner_reward_scaling=task_cls.INNER_REWARD_SCALING,
+            ),
             max_episode_steps=1000,
             reward_threshold=task_cls.REWARD_THRESHOLD,
         )
@@ -19,7 +22,11 @@ for maze_id in TaskRegistry.keys():
         gym.envs.register(
             id=f"Point{maze_id}-v{i}",
             entry_point="mujoco_maze.point_maze_env:PointMazeEnv",
-            kwargs=dict(maze_task=task_cls, maze_size_scaling=scaling),
+            kwargs=dict(
+                maze_task=task_cls,
+                maze_size_scaling=task_cls.MAZE_SIZE_SCALING.point,
+                inner_reward_scaling=task_cls.INNER_REWARD_SCALING,
+            ),
             max_episode_steps=1000,
             reward_threshold=task_cls.REWARD_THRESHOLD,
         )
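With this change each registered id forwards the task's MAZE_SIZE_SCALING and INNER_REWARD_SCALING into the wrapped environment. A minimal usage sketch; the id "AntUMaze-v0" assumes the "UMaze" registry key and task index 0, and any other key/index from TaskRegistry works the same way:

import gym
import mujoco_maze  # noqa: F401 -- importing the package runs the gym.envs.register() calls above

env = gym.make("AntUMaze-v0")  # f"Ant{maze_id}-v{i}" with maze_id="UMaze", i=0 (assumed)
obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())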
@@ -16,13 +16,24 @@
 """Wrapper for creating the ant environment in gym_mujoco."""

 import math
-from typing import Optional, Tuple
+from typing import Callable, Optional, Tuple

 import numpy as np

 from mujoco_maze.agent_model import AgentModel

+ForwardRewardFn = Callable[[float, float], float]
+
+
+def forward_reward_vabs(xy_velocity: float) -> float:
+    return np.sum(np.abs(xy_velocity))
+
+
+def forward_reward_vnorm(xy_velocity: float) -> float:
+    return np.linalg.norm(xy_velocity)
+
+
 def q_inv(a):
     return [a[0], -a[1], -a[2], -a[3]]
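The two new forward-reward helpers differ only in the norm applied to the planar velocity (despite the float annotation, the argument is a 2-element array): forward_reward_vabs uses the L1 norm, forward_reward_vnorm, the default, uses the L2 norm. A toy comparison:

import numpy as np

xy_velocity = np.array([3.0, 4.0])  # invented planar velocity, for illustration only
np.sum(np.abs(xy_velocity))         # 7.0 -- what forward_reward_vabs returns (L1 norm)
np.linalg.norm(xy_velocity)         # 5.0 -- what forward_reward_vnorm returns (L2 norm)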
@@ -36,31 +47,37 @@ def q_mult(a, b):  # multiply two quaternion

 class AntEnv(AgentModel):
-    FILE = "ant.xml"
-    ORI_IND = 3
+    FILE: str = "ant.xml"
+    ORI_IND: int = 3

-    def __init__(self, file_path: Optional[str] = None) -> None:
+    def __init__(
+        self,
+        file_path: Optional[str] = None,
+        ctrl_cost_weight: float = 0.5,
+        forward_reward_fn: ForwardRewardFn = forward_reward_vnorm,
+    ) -> None:
+        self._ctrl_cost_weight = ctrl_cost_weight
+        self._forward_reward_fn = forward_reward_fn
         super().__init__(file_path, 5)

+    def _forward_reward(self, xy_pos_before: np.ndarray) -> float:
+        xy_pos_after = self.sim.data.qpos[:2].copy()
+        xy_velocity = (xy_pos_after - xy_pos_before) / self.dt
+        return self._forward_reward_fn(xy_velocity)
+
     def step(self, action: np.ndarray) -> Tuple[np.ndarray, float, bool, dict]:
-        xposbefore = self.get_body_com("torso")[0]
+        xy_pos_before = self.sim.data.qpos[:2].copy()
         self.do_simulation(action, self.frame_skip)
-        xposafter = self.get_body_com("torso")[0]
-        forward_reward = (xposafter - xposbefore) / self.dt
-        ctrl_cost = 0.5 * np.square(action).sum()
-        survive_reward = 1.0
-        reward = forward_reward - ctrl_cost + survive_reward
-        _ = self.state_vector()
+        forward_reward = self._forward_reward(xy_pos_before)
+        ctrl_cost = self._ctrl_cost_weight * np.square(action).sum()
         ob = self._get_obs()
         return (
             ob,
-            reward,
+            forward_reward - ctrl_cost,
             False,
-            dict(
-                reward_forward=forward_reward,
-                reward_ctrl=-ctrl_cost,
-                reward_survive=survive_reward,
-            ),
+            dict(reward_forward=forward_reward, reward_ctrl=-ctrl_cost,),
         )

     def _get_obs(self):
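Note that the inner reward returned by step() is now forward_reward - ctrl_cost: the survive bonus is dropped and the forward term is planar speed rather than progress along x. A toy calculation with the default ctrl_cost_weight of 0.5 (numbers invented for illustration):

forward_reward = 1.2                       # self._forward_reward_fn(xy_velocity), e.g. ||v||
ctrl_cost = 0.5 * 0.08                     # ctrl_cost_weight * np.square(action).sum()
inner_reward = forward_reward - ctrl_cost  # 1.16; no survive_reward term any more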
@@ -84,9 +101,6 @@ class AntEnv(AgentModel):
         self.set_state(qpos, qvel)
         return self._get_obs()

-    def viewer_setup(self):
-        self.viewer.cam.distance = self.model.stat.extent * 0.5
-
     def get_ori(self):
         ori = [0, 1, 0, 0]
         ori_ind = self.ORI_IND
@@ -96,7 +110,7 @@ class AntEnv(AgentModel):
         return ori

     def set_xy(self, xy):
-        qpos = np.copy(self.sim.data.qpos)
+        qpos = self.sim.data.qpos.copy()
         qpos[0] = xy[0]
         qpos[1] = xy[1]
@@ -44,6 +44,7 @@ class MazeEnv(gym.Env):
         top_down_view: float = False,
         maze_height: float = 0.5,
         maze_size_scaling: float = 4.0,
+        inner_reward_scaling: float = 1.0,
         *args,
         **kwargs,
     ) -> None:
@@ -55,6 +56,7 @@ class MazeEnv(gym.Env):
         self._maze_height = height = maze_height
         self._maze_size_scaling = size_scaling = maze_size_scaling
+        self._inner_reward_scaling = inner_reward_scaling
         self.t = 0  # time steps
         self._n_bins = n_bins
         self._sensor_range = sensor_range * size_scaling
@@ -495,9 +497,10 @@ class MazeEnv(gym.Env):
     def reset(self):
         self.t = 0
         self.wrapped_env.reset()
-        # Sample a new goal
+        # Samples a new goal
         if self._task.sample_goals():
             self.set_marker()
+        # Samples a new start position
         if len(self._init_positions) > 1:
             xy = np.random.choice(self._init_positions)
             self.wrapped_env.set_xy(xy)
@@ -547,7 +550,7 @@ class MazeEnv(gym.Env):
         else:
             inner_next_obs, inner_reward, _, info = self.wrapped_env.step(action)
         next_obs = self._get_obs()
-        inner_reward = self._task.scale_inner_reward(inner_reward)
+        inner_reward = self._inner_reward_scaling * inner_reward
         outer_reward = self._task.reward(next_obs)
         done = self._task.termination(next_obs)
         return next_obs, inner_reward + outer_reward, done, info
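The wrapper now applies the scaling itself instead of calling the removed task scale_inner_reward() hook, so the reward it returns is inner_reward_scaling * inner_reward + task reward. A toy example using the MazeTask default of 1e-4 (values invented for illustration):

inner_reward = 0.9                            # from self.wrapped_env.step(action)
outer_reward = 0.0                            # sparse task reward before the goal is reached
total = 1e-4 * inner_reward + outer_reward    # 9e-05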
@@ -47,7 +47,8 @@ class Scaling(NamedTuple):

 class MazeTask(ABC):
     REWARD_THRESHOLD: float
-    SCALING: Scaling = Scaling(8.0, 4.0)
+    MAZE_SIZE_SCALING: Scaling = Scaling(8.0, 4.0)
+    INNER_REWARD_SCALING: float = 1e-4
     OBSERVE_BLOCKS: bool = False
     PUT_SPIN_NEAR_AGENT: bool = False
@@ -64,9 +65,6 @@ class MazeTask(ABC):
             return True
         return False

-    def scale_inner_reward(self, inner_reward: float) -> float:
-        return inner_reward
-
     @abstractmethod
     def reward(self, obs: np.ndarray) -> float:
         pass
@@ -102,6 +100,7 @@ class SingleGoalSparseUMaze(MazeTask):

 class SingleGoalDenseUMaze(SingleGoalSparseUMaze):
     REWARD_THRESHOLD: float = 1000.0

     def reward(self, obs: np.ndarray) -> float:
         return -self.goals[0].euc_dist(obs)
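Since scale_inner_reward() is gone, a task tunes the locomotion reward purely through the INNER_REWARD_SCALING class attribute that the registration code picks up. A hypothetical override, for illustration only:

class MyDenseUMaze(SingleGoalDenseUMaze):  # hypothetical task, not part of this commit
    INNER_REWARD_SCALING: float = 1e-2     # weight the agent's own reward more heavily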
@@ -126,6 +125,7 @@ class SingleGoalSparsePush(SingleGoalSparseUMaze):

 class SingleGoalDensePush(SingleGoalSparsePush):
     REWARD_THRESHOLD: float = 1000.0

     def reward(self, obs: np.ndarray) -> float:
         return -self.goals[0].euc_dist(obs)
@@ -151,12 +151,14 @@ class SingleGoalSparseFall(SingleGoalSparseUMaze):

 class SingleGoalDenseFall(SingleGoalSparseFall):
     REWARD_THRESHOLD: float = 1000.0

     def reward(self, obs: np.ndarray) -> float:
         return -self.goals[0].euc_dist(obs)


 class SingleGoalSparse2Rooms(MazeTask):
     REWARD_THRESHOLD: float = 0.9
-    SCALING: Scaling = Scaling(4.0, 4.0)

     def __init__(self, scale: float) -> None:
         super().__init__(scale)
@@ -185,6 +187,7 @@ class SingleGoalSparse2Rooms(MazeTask):

 class SingleGoalDense2Rooms(SingleGoalSparse2Rooms):
     REWARD_THRESHOLD: float = 1000.0

     def reward(self, obs: np.ndarray) -> float:
         return -self.goals[0].euc_dist(obs)
@@ -197,6 +200,7 @@ class SubGoalSparse2Rooms(SingleGoalSparse2Rooms):

 class SingleGoalSparse4Rooms(MazeTask):
     REWARD_THRESHOLD: float = 0.9
+    MAZE_SIZE_SCALING: Scaling = Scaling(4.0, 4.0)

     def __init__(self, scale: float) -> None:
         super().__init__(scale)
@@ -227,6 +231,7 @@ class SingleGoalSparse4Rooms(MazeTask):

 class SingleGoalDense4Rooms(SingleGoalSparse4Rooms):
     REWARD_THRESHOLD: float = 1000.0

     def reward(self, obs: np.ndarray) -> float:
         return -self.goals[0].euc_dist(obs)
@@ -245,11 +250,7 @@ class TaskRegistry:
         "UMaze": [SingleGoalDenseUMaze, SingleGoalSparseUMaze],
         "Push": [SingleGoalDensePush, SingleGoalSparsePush],
         "Fall": [SingleGoalDenseFall, SingleGoalSparseFall],
-        "2Rooms": [
-            SingleGoalDense2Rooms,
-            SingleGoalSparse2Rooms,
-            SubGoalSparse2Rooms,
-        ],
+        "2Rooms": [SingleGoalDense2Rooms, SingleGoalSparse2Rooms, SubGoalSparse2Rooms,],
         "4Rooms": [SingleGoalSparse4Rooms, SingleGoalDense4Rooms, SubGoalSparse4Rooms],
     }
@@ -25,9 +25,10 @@ from mujoco_maze.agent_model import AgentModel


 class PointEnv(AgentModel):
-    FILE = "point.xml"
-    ORI_IND = 2
+    FILE: str = "point.xml"
+    ORI_IND: int = 2
+
+    VELOCITY_LIMITS: float = 10.0

     def __init__(self, file_path: Optional[str] = None):
         super().__init__(file_path, 1)