diff --git a/mujoco_maze/__init__.py b/mujoco_maze/__init__.py
index 525fc83..f92403f 100644
--- a/mujoco_maze/__init__.py
+++ b/mujoco_maze/__init__.py
@@ -5,11 +5,14 @@ from mujoco_maze.maze_task import TaskRegistry
 
 for maze_id in TaskRegistry.keys():
     for i, task_cls in enumerate(TaskRegistry.tasks(maze_id)):
-        scaling = task_cls.SCALING.ant
         gym.envs.register(
             id=f"Ant{maze_id}-v{i}",
             entry_point="mujoco_maze.ant_maze_env:AntMazeEnv",
-            kwargs=dict(maze_task=task_cls, maze_size_scaling=scaling),
+            kwargs=dict(
+                maze_task=task_cls,
+                maze_size_scaling=task_cls.MAZE_SIZE_SCALING.ant,
+                inner_reward_scaling=task_cls.INNER_REWARD_SCALING,
+            ),
             max_episode_steps=1000,
             reward_threshold=task_cls.REWARD_THRESHOLD,
         )
@@ -19,7 +22,11 @@ for maze_id in TaskRegistry.keys():
         gym.envs.register(
             id=f"Point{maze_id}-v{i}",
             entry_point="mujoco_maze.point_maze_env:PointMazeEnv",
-            kwargs=dict(maze_task=task_cls, maze_size_scaling=scaling),
+            kwargs=dict(
+                maze_task=task_cls,
+                maze_size_scaling=task_cls.MAZE_SIZE_SCALING.point,
+                inner_reward_scaling=task_cls.INNER_REWARD_SCALING,
+            ),
             max_episode_steps=1000,
             reward_threshold=task_cls.REWARD_THRESHOLD,
         )
diff --git a/mujoco_maze/ant.py b/mujoco_maze/ant.py
index 5754209..96afb9b 100644
--- a/mujoco_maze/ant.py
+++ b/mujoco_maze/ant.py
@@ -16,13 +16,24 @@
 """Wrapper for creating the ant environment in gym_mujoco."""
 
 import math
-from typing import Optional, Tuple
+from typing import Callable, Optional, Tuple
 
 import numpy as np
 
 from mujoco_maze.agent_model import AgentModel
 
+ForwardRewardFn = Callable[[np.ndarray], float]
+
+
+def forward_reward_vabs(xy_velocity: np.ndarray) -> float:
+    return np.sum(np.abs(xy_velocity))
+
+
+def forward_reward_vnorm(xy_velocity: np.ndarray) -> float:
+    return np.linalg.norm(xy_velocity)
+
+
 def q_inv(a):
     return [a[0], -a[1], -a[2], -a[3]]
 
 
@@ -36,31 +47,37 @@ def q_mult(a, b):  # multiply two quaternion
 
 
 class AntEnv(AgentModel):
-    FILE = "ant.xml"
-    ORI_IND = 3
+    FILE: str = "ant.xml"
+    ORI_IND: int = 3
 
-    def __init__(self, file_path: Optional[str] = None) -> None:
+    def __init__(
+        self,
+        file_path: Optional[str] = None,
+        ctrl_cost_weight: float = 0.5,
+        forward_reward_fn: ForwardRewardFn = forward_reward_vnorm,
+    ) -> None:
+        self._ctrl_cost_weight = ctrl_cost_weight
+        self._forward_reward_fn = forward_reward_fn
         super().__init__(file_path, 5)
 
+    def _forward_reward(self, xy_pos_before: np.ndarray) -> float:
+        xy_pos_after = self.sim.data.qpos[:2].copy()
+        xy_velocity = (xy_pos_after - xy_pos_before) / self.dt
+        return self._forward_reward_fn(xy_velocity)
+
     def step(self, action: np.ndarray) -> Tuple[np.ndarray, float, bool, dict]:
-        xposbefore = self.get_body_com("torso")[0]
+        xy_pos_before = self.sim.data.qpos[:2].copy()
         self.do_simulation(action, self.frame_skip)
-        xposafter = self.get_body_com("torso")[0]
-        forward_reward = (xposafter - xposbefore) / self.dt
-        ctrl_cost = 0.5 * np.square(action).sum()
-        survive_reward = 1.0
-        reward = forward_reward - ctrl_cost + survive_reward
-        _ = self.state_vector()
+
+        forward_reward = self._forward_reward(xy_pos_before)
+        ctrl_cost = self._ctrl_cost_weight * np.square(action).sum()
+
         ob = self._get_obs()
         return (
             ob,
-            reward,
+            forward_reward - ctrl_cost,
             False,
-            dict(
-                reward_forward=forward_reward,
-                reward_ctrl=-ctrl_cost,
-                reward_survive=survive_reward,
-            ),
+            dict(reward_forward=forward_reward, reward_ctrl=-ctrl_cost),
         )
 
     def _get_obs(self):
@@ -84,9 +101,6 @@ class AntEnv(AgentModel):
         self.set_state(qpos, qvel)
         return self._get_obs()
 
-    def viewer_setup(self):
-        self.viewer.cam.distance = self.model.stat.extent * 0.5
-
     def get_ori(self):
         ori = [0, 1, 0, 0]
         ori_ind = self.ORI_IND
@@ -96,7 +110,7 @@ class AntEnv(AgentModel):
         return ori
 
     def set_xy(self, xy):
-        qpos = np.copy(self.sim.data.qpos)
+        qpos = self.sim.data.qpos.copy()
         qpos[0] = xy[0]
         qpos[1] = xy[1]
 
diff --git a/mujoco_maze/maze_env.py b/mujoco_maze/maze_env.py
index 007773f..edc39bb 100644
--- a/mujoco_maze/maze_env.py
+++ b/mujoco_maze/maze_env.py
@@ -44,6 +44,7 @@ class MazeEnv(gym.Env):
         top_down_view: float = False,
         maze_height: float = 0.5,
         maze_size_scaling: float = 4.0,
+        inner_reward_scaling: float = 1.0,
         *args,
         **kwargs,
     ) -> None:
@@ -55,6 +56,7 @@ class MazeEnv(gym.Env):
 
         self._maze_height = height = maze_height
         self._maze_size_scaling = size_scaling = maze_size_scaling
+        self._inner_reward_scaling = inner_reward_scaling
         self.t = 0  # time steps
         self._n_bins = n_bins
         self._sensor_range = sensor_range * size_scaling
@@ -495,9 +497,10 @@ class MazeEnv(gym.Env):
     def reset(self):
         self.t = 0
         self.wrapped_env.reset()
-        # Sample a new goal
+        # Samples a new goal
         if self._task.sample_goals():
             self.set_marker()
+        # Samples a new start position
         if len(self._init_positions) > 1:
             xy = np.random.choice(self._init_positions)
             self.wrapped_env.set_xy(xy)
@@ -547,7 +550,7 @@ class MazeEnv(gym.Env):
         else:
             inner_next_obs, inner_reward, _, info = self.wrapped_env.step(action)
         next_obs = self._get_obs()
-        inner_reward = self._task.scale_inner_reward(inner_reward)
+        inner_reward = self._inner_reward_scaling * inner_reward
         outer_reward = self._task.reward(next_obs)
         done = self._task.termination(next_obs)
         return next_obs, inner_reward + outer_reward, done, info
diff --git a/mujoco_maze/maze_task.py b/mujoco_maze/maze_task.py
index a053ecd..b208a10 100644
--- a/mujoco_maze/maze_task.py
+++ b/mujoco_maze/maze_task.py
@@ -47,7 +47,8 @@ class Scaling(NamedTuple):
 
 class MazeTask(ABC):
     REWARD_THRESHOLD: float
-    SCALING: Scaling = Scaling(8.0, 4.0)
+    MAZE_SIZE_SCALING: Scaling = Scaling(8.0, 4.0)
+    INNER_REWARD_SCALING: float = 1e-4
     OBSERVE_BLOCKS: bool = False
     PUT_SPIN_NEAR_AGENT: bool = False
 
@@ -64,9 +65,6 @@ class MazeTask(ABC):
             return True
         return False
 
-    def scale_inner_reward(self, inner_reward: float) -> float:
-        return inner_reward
-
     @abstractmethod
     def reward(self, obs: np.ndarray) -> float:
         pass
@@ -102,6 +100,7 @@ class SingleGoalSparseUMaze(MazeTask):
 
 class SingleGoalDenseUMaze(SingleGoalSparseUMaze):
     REWARD_THRESHOLD: float = 1000.0
+
     def reward(self, obs: np.ndarray) -> float:
         return -self.goals[0].euc_dist(obs)
 
@@ -126,6 +125,7 @@ class SingleGoalSparsePush(SingleGoalSparseUMaze):
 
 class SingleGoalDensePush(SingleGoalSparsePush):
     REWARD_THRESHOLD: float = 1000.0
+
     def reward(self, obs: np.ndarray) -> float:
         return -self.goals[0].euc_dist(obs)
 
@@ -151,12 +151,14 @@ class SingleGoalSparseFall(SingleGoalSparseUMaze):
 
 class SingleGoalDenseFall(SingleGoalSparseFall):
     REWARD_THRESHOLD: float = 1000.0
+
     def reward(self, obs: np.ndarray) -> float:
         return -self.goals[0].euc_dist(obs)
 
 
 class SingleGoalSparse2Rooms(MazeTask):
     REWARD_THRESHOLD: float = 0.9
+    MAZE_SIZE_SCALING: Scaling = Scaling(4.0, 4.0)
 
     def __init__(self, scale: float) -> None:
         super().__init__(scale)
@@ -185,6 +187,7 @@ class SingleGoalSparse2Rooms(MazeTask):
 
 class SingleGoalDense2Rooms(SingleGoalSparse2Rooms):
     REWARD_THRESHOLD: float = 1000.0
+
     def reward(self, obs: np.ndarray) -> float:
         return -self.goals[0].euc_dist(obs)
 
@@ -197,6 +200,7 @@ class SubGoalSparse2Rooms(SingleGoalSparse2Rooms):
 
 class SingleGoalSparse4Rooms(MazeTask):
     REWARD_THRESHOLD: float = 0.9
+    MAZE_SIZE_SCALING: Scaling = Scaling(4.0, 4.0)
 
     def __init__(self, scale: float) -> None:
         super().__init__(scale)
@@ -227,6 +231,7 @@ class SingleGoalSparse4Rooms(MazeTask):
 
 class SingleGoalDense4Rooms(SingleGoalSparse4Rooms):
     REWARD_THRESHOLD: float = 1000.0
+
     def reward(self, obs: np.ndarray) -> float:
         return -self.goals[0].euc_dist(obs)
 
@@ -245,11 +250,7 @@ class TaskRegistry:
         "UMaze": [SingleGoalDenseUMaze, SingleGoalSparseUMaze],
         "Push": [SingleGoalDensePush, SingleGoalSparsePush],
         "Fall": [SingleGoalDenseFall, SingleGoalSparseFall],
-        "2Rooms": [
-            SingleGoalDense2Rooms,
-            SingleGoalSparse2Rooms,
-            SubGoalSparse2Rooms,
-        ],
+        "2Rooms": [SingleGoalDense2Rooms, SingleGoalSparse2Rooms, SubGoalSparse2Rooms],
         "4Rooms": [SingleGoalSparse4Rooms, SingleGoalDense4Rooms, SubGoalSparse4Rooms],
     }
 
diff --git a/mujoco_maze/point.py b/mujoco_maze/point.py
index a92d2d7..86351f3 100644
--- a/mujoco_maze/point.py
+++ b/mujoco_maze/point.py
@@ -25,9 +25,10 @@ from mujoco_maze.agent_model import AgentModel
 
 
 class PointEnv(AgentModel):
+    FILE: str = "point.xml"
+    ORI_IND: int = 2
+    VELOCITY_LIMITS: float = 10.0
 
-    FILE = "point.xml"
-    ORI_IND = 2
     def __init__(self, file_path: Optional[str] = None):
         super().__init__(file_path, 1)
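
A minimal usage sketch (not part of the diff) that exercises the registration changes and the two new forward-reward helpers. It assumes gym and a working MuJoCo / mujoco-py setup are installed, and that importing mujoco_maze runs the gym.envs.register() loop from __init__.py above; the env id used here is hypothetical but follows the f"Point{maze_id}-v{i}" pattern, where v0 of "UMaze" is the dense-reward task per TaskRegistry.

    import gym
    import numpy as np

    import mujoco_maze  # noqa: F401  -- registers the Ant*/Point* env ids on import

    # "PointUMaze-v0": point agent, dense-reward UMaze task (index 0 in TaskRegistry).
    env = gym.make("PointUMaze-v0")
    obs = env.reset()
    obs, reward, done, info = env.step(env.action_space.sample())

    # The two helpers added to ant.py differ only in the norm applied to the
    # xy velocity, e.g. for a velocity of (3, 4):
    from mujoco_maze.ant import forward_reward_vabs, forward_reward_vnorm

    v = np.array([3.0, 4.0])
    assert forward_reward_vabs(v) == 7.0   # L1 norm: |3| + |4|
    assert forward_reward_vnorm(v) == 5.0  # L2 norm: sqrt(3^2 + 4^2)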