diff --git a/README.md b/README.md
index 5d921e5..21d2de9 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,5 @@
 # mujoco-maze
+[![Black](https://img.shields.io/badge/code%20style-black-000.svg)](https://github.com/psf/black)

 Some maze environments for reinforcement learning(RL) using [mujoco-py] and
 [openai gym][gym].
@@ -22,19 +23,26 @@ Thankfully, this project is based on the code from [rllab] and [tensorflow/mode
 - PointPush/AntPush

-  ![PointPush](./screenshots/PointPush.png)
+  ![PointPush](./screenshots/AntPush.png)
   - PointPush-v0/AntPush-v0 (Distance-based Reward)
   - PointPush-v1/AntPush-v1 (Goal-based Reward)

 - PointFall/AntFall

-  ![PointFall](./screenshots/PointFall.png)
+  ![PointFall](./screenshots/AntFall.png)
   - PointFall-v0/AntFall-v0 (Distance-based Reward)
   - PointFall-v1/AntFall-v1 (Goal-based Reward)

+- PointBilliard
+
+  ![PointBilliard](./screenshots/PointBilliard.png)
+  - PointBilliard-v0 (Distance-based Reward)
+  - PointBilliard-v1 (Goal-based Reward)
+  - PointBilliard-v2 (Multiple Goals: 0.5 pt or 1.0 pt)
+
 ## Warning

-This project has some other environments (e.g., billiard, reacher, and
-swimmer) but if they are not on README, they are work in progress and
+This project has some other environments (e.g., reacher and swimmer),
+but if they are not in this README, they are works in progress and
 not tested well.

 ## License
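The README section above lists the environment IDs; they become available as soon as `mujoco_maze` is imported, via the `gym.envs.register` calls in the `__init__.py` diff below. A minimal usage sketch (assuming `gym` and `mujoco-py` are installed; `PointBilliard-v2` is the new multi-goal variant):

```python
import gym
import mujoco_maze  # noqa: F401 -- importing the package registers all environments

env = gym.make("PointBilliard-v2")
obs = env.reset()
done = False
while not done:
    # Random policy; the TimeLimit wrapper ends the episode after
    # max_episode_steps=1000 if no goal is reached first.
    obs, reward, done, info = env.step(env.action_space.sample())
```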
diff --git a/mujoco_maze/__init__.py b/mujoco_maze/__init__.py
index b468a1c..65f18c5 100644
--- a/mujoco_maze/__init__.py
+++ b/mujoco_maze/__init__.py
@@ -16,69 +16,66 @@ from mujoco_maze.swimmer import SwimmerEnv

 for maze_id in TaskRegistry.keys():
     for i, task_cls in enumerate(TaskRegistry.tasks(maze_id)):
-        # Point
-        gym.envs.register(
-            id=f"Point{maze_id}-v{i}",
-            entry_point="mujoco_maze.maze_env:MazeEnv",
-            kwargs=dict(
-                model_cls=PointEnv,
-                maze_task=task_cls,
-                maze_size_scaling=task_cls.MAZE_SIZE_SCALING.point,
-                inner_reward_scaling=task_cls.INNER_REWARD_SCALING,
-            ),
-            max_episode_steps=1000,
-            reward_threshold=task_cls.REWARD_THRESHOLD,
-        )
-        if "Billiard" in maze_id:
-            continue
-        # Ant
-        gym.envs.register(
-            id=f"Ant{maze_id}-v{i}",
-            entry_point="mujoco_maze.maze_env:MazeEnv",
-            kwargs=dict(
-                model_cls=AntEnv,
-                maze_task=task_cls,
-                maze_size_scaling=task_cls.MAZE_SIZE_SCALING.ant,
-                inner_reward_scaling=task_cls.INNER_REWARD_SCALING,
-            ),
-            max_episode_steps=1000,
-            reward_threshold=task_cls.REWARD_THRESHOLD,
-        )
-        skip_swimmer = False
-        for inhibited in ["Fall", "Push", "Block"]:
-            if inhibited in maze_id:
-                skip_swimmer = True
+        point_scale = task_cls.MAZE_SIZE_SCALING.point
+        if point_scale is not None:
+            # Point
+            gym.envs.register(
+                id=f"Point{maze_id}-v{i}",
+                entry_point="mujoco_maze.maze_env:MazeEnv",
+                kwargs=dict(
+                    model_cls=PointEnv,
+                    maze_task=task_cls,
+                    maze_size_scaling=point_scale,
+                    inner_reward_scaling=task_cls.INNER_REWARD_SCALING,
+                ),
+                max_episode_steps=1000,
+                reward_threshold=task_cls.REWARD_THRESHOLD,
+            )

-        if skip_swimmer:
-            continue
+        ant_scale = task_cls.MAZE_SIZE_SCALING.ant
+        if ant_scale is not None:
+            # Ant
+            gym.envs.register(
+                id=f"Ant{maze_id}-v{i}",
+                entry_point="mujoco_maze.maze_env:MazeEnv",
+                kwargs=dict(
+                    model_cls=AntEnv,
+                    maze_task=task_cls,
+                    maze_size_scaling=ant_scale,
+                    inner_reward_scaling=task_cls.INNER_REWARD_SCALING,
+                ),
+                max_episode_steps=1000,
+                reward_threshold=task_cls.REWARD_THRESHOLD,
+            )

-        # Reacher
-        gym.envs.register(
-            id=f"Reacher{maze_id}-v{i}",
-            entry_point="mujoco_maze.maze_env:MazeEnv",
-            kwargs=dict(
-                model_cls=ReacherEnv,
-                maze_task=task_cls,
-                maze_size_scaling=task_cls.MAZE_SIZE_SCALING.swimmer,
-                inner_reward_scaling=task_cls.INNER_REWARD_SCALING,
-            ),
-            max_episode_steps=1000,
-            reward_threshold=task_cls.REWARD_THRESHOLD,
-        )
-
-        # Swimmer
-        gym.envs.register(
-            id=f"Swimmer{maze_id}-v{i}",
-            entry_point="mujoco_maze.maze_env:MazeEnv",
-            kwargs=dict(
-                model_cls=SwimmerEnv,
-                maze_task=task_cls,
-                maze_size_scaling=task_cls.MAZE_SIZE_SCALING.swimmer,
-                inner_reward_scaling=task_cls.INNER_REWARD_SCALING,
-            ),
-            max_episode_steps=1000,
-            reward_threshold=task_cls.REWARD_THRESHOLD,
-        )
+        swimmer_scale = task_cls.MAZE_SIZE_SCALING.swimmer
+        if swimmer_scale is not None:
+            # Reacher
+            gym.envs.register(
+                id=f"Reacher{maze_id}-v{i}",
+                entry_point="mujoco_maze.maze_env:MazeEnv",
+                kwargs=dict(
+                    model_cls=ReacherEnv,
+                    maze_task=task_cls,
+                    maze_size_scaling=task_cls.MAZE_SIZE_SCALING.swimmer,
+                    inner_reward_scaling=task_cls.INNER_REWARD_SCALING,
+                ),
+                max_episode_steps=1000,
+                reward_threshold=task_cls.REWARD_THRESHOLD,
+            )
+            # Swimmer
+            gym.envs.register(
+                id=f"Swimmer{maze_id}-v{i}",
+                entry_point="mujoco_maze.maze_env:MazeEnv",
+                kwargs=dict(
+                    model_cls=SwimmerEnv,
+                    maze_task=task_cls,
+                    maze_size_scaling=task_cls.MAZE_SIZE_SCALING.swimmer,
+                    inner_reward_scaling=task_cls.INNER_REWARD_SCALING,
+                ),
+                max_episode_steps=1000,
+                reward_threshold=task_cls.REWARD_THRESHOLD,
+            )

 __version__ = "0.1.0"
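With the hard-coded `"Billiard"`/`"Fall"`/`"Push"`/`"Block"` string checks gone, which robots a maze supports is now data on the task itself: a `None` entry in `MAZE_SIZE_SCALING` simply skips that robot's registration. A hypothetical inspection snippet (not part of the package) to see the effect, using only the `TaskRegistry` API that appears in this diff:

```python
from mujoco_maze.maze_task import TaskRegistry

# Scaling is a NamedTuple of (ant, point, swimmer); a None entry
# means no environment is registered for that robot.
for maze_id in TaskRegistry.keys():
    for i, task_cls in enumerate(TaskRegistry.tasks(maze_id)):
        scaling = task_cls.MAZE_SIZE_SCALING
        robots = [
            name
            for name, scale in zip(("Ant", "Point", "Swimmer"), scaling)
            if scale is not None
        ]
        print(f"{maze_id}-v{i}: {robots}")
```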
diff --git a/mujoco_maze/assets/ant.xml b/mujoco_maze/assets/ant.xml
index ac555ef..ffe156b 100755
--- a/mujoco_maze/assets/ant.xml
+++ b/mujoco_maze/assets/ant.xml
@@ -11,13 +11,13 @@
-
+
-
+
diff --git a/mujoco_maze/assets/point.xml b/mujoco_maze/assets/point.xml
index 578fce2..4c06cb1 100755
--- a/mujoco_maze/assets/point.xml
+++ b/mujoco_maze/assets/point.xml
@@ -8,7 +8,7 @@
-
+
@@ -18,8 +18,8 @@
-
-
+
+
diff --git a/mujoco_maze/assets/reacher.xml b/mujoco_maze/assets/reacher.xml
index f9acaeb..0d238c8 100644
--- a/mujoco_maze/assets/reacher.xml
+++ b/mujoco_maze/assets/reacher.xml
@@ -8,7 +8,7 @@
-
+
diff --git a/mujoco_maze/assets/swimmer.xml b/mujoco_maze/assets/swimmer.xml
index b743d85..3c6c21a 100644
--- a/mujoco_maze/assets/swimmer.xml
+++ b/mujoco_maze/assets/swimmer.xml
@@ -8,7 +8,7 @@
-
+
diff --git a/mujoco_maze/maze_env.py b/mujoco_maze/maze_env.py
index 01586a3..7619c35 100644
--- a/mujoco_maze/maze_env.py
+++ b/mujoco_maze/maze_env.py
@@ -72,8 +72,9 @@ class MazeEnv(gym.Env):
             self._collision = maze_env_utils.CollisionDetector(
                 structure, size_scaling, torso_x, torso_y, model_cls.RADIUS,
             )
+            # Object balls share the task's OBJECT_BALL_SIZE radius
             self._objball_collision = maze_env_utils.CollisionDetector(
-                structure, size_scaling, torso_x, torso_y, 0.8,
+                structure, size_scaling, torso_x, torso_y, self._task.OBJECT_BALL_SIZE,
             )
         else:
             self._collision = None
@@ -145,7 +146,7 @@ class MazeEnv(gym.Env):
             elif struct.is_object_ball():  # Movable Ball
                 self.object_balls.append(f"objball_{i}_{j}")
-                _add_object_ball(worldbody, i, j, x, y)
+                _add_object_ball(worldbody, i, j, x, y, self._task.OBJECT_BALL_SIZE)

         torso = tree.find(".//body[@name='torso']")
         geoms = torso.findall(".//geom")
@@ -166,7 +167,7 @@ class MazeEnv(gym.Env):
                 name=f"goal_site{i}",
                 pos=f"{goal.pos[0]} {goal.pos[1]} {z}",
                 size=f"{maze_size_scaling * 0.1}",
-                rgba=goal.rbga_str(),
+                rgba=goal.rgb.rgba_str(),
             )

         _, file_path = tempfile.mkstemp(text=True, suffix=".xml")
@@ -385,6 +386,7 @@ class MazeEnv(gym.Env):
                 self.wrapped_env.set_xy(old_pos)
             else:
                 self.wrapped_env.set_xy(pos)
+            # Do the same collision check for the object balls
             for name, old, new in zip(self.object_balls, old_objballs, new_objballs):
                 collision = self._objball_collision.detect(old, new)
                 if collision is not None:
@@ -406,20 +408,23 @@ class MazeEnv(gym.Env):
         self.wrapped_env.close()


-def _add_object_ball(worldbody: ET.Element, i: str, j: str, x: float, y: float) -> None:
-    body = ET.SubElement(worldbody, "body", name=f"objball_{i}_{j}", pos=f"{x} {y} 0",)
+def _add_object_ball(
+    worldbody: ET.Element, i: str, j: str, x: float, y: float, size: float
+) -> None:
+    body = ET.SubElement(worldbody, "body", name=f"objball_{i}_{j}", pos=f"{x} {y} 0")
+    mass = 0.0001 * (size ** 3)
     ET.SubElement(
         body,
         "geom",
         type="sphere",
         name=f"objball_{i}_{j}_geom",
-        size="1.0",  # Radius
-        pos="0.0 0.0 1.0",  # Z = 1.0 so that this ball can move!!
-        rgba="0.1 0.1 0.7 1",
+        size=f"{size}",  # Radius
+        pos=f"0.0 0.0 {size}",  # Z = size so that the ball can move
+        rgba=maze_task.BLUE.rgba_str(),
         contype="1",
         conaffinity="1",
         solimp="0.9 0.99 0.001",
-        mass="0.0001",
+        mass=f"{mass}",
     )
     ET.SubElement(
         body,
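A detail worth calling out in `_add_object_ball` above: the ball's mass now scales with the cube of its radius, so density stays constant whatever `OBJECT_BALL_SIZE` a task picks, and `size=1.0` reproduces the previous hard-coded mass. A quick sanity check of that formula:

```python
def objball_mass(size: float) -> float:
    # Mass grows with volume (r**3), keeping density constant across sizes
    return 0.0001 * (size ** 3)

assert objball_mass(1.0) == 0.0001    # matches the old hard-coded mass
assert objball_mass(0.5) == 1.25e-05  # a half-size ball is 8x lighter
```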
diff --git a/mujoco_maze/maze_task.py b/mujoco_maze/maze_task.py
index eecd3b2..e953cb1 100644
--- a/mujoco_maze/maze_task.py
+++ b/mujoco_maze/maze_task.py
@@ -14,6 +14,9 @@ class Rgb(NamedTuple):
     green: float
     blue: float

+    def rgba_str(self) -> str:
+        return f"{self.red} {self.green} {self.blue} 1"
+

 RED = Rgb(0.7, 0.1, 0.1)
 GREEN = Rgb(0.1, 0.7, 0.1)
@@ -37,10 +40,6 @@ class MazeGoal:
         self.threshold = threshold
         self.custom_size = custom_size

-    def rbga_str(self) -> str:
-        r, g, b = self.rgb
-        return f"{r} {g} {b} 1"
-
     def neighbor(self, obs: np.ndarray) -> float:
         return np.linalg.norm(obs[: self.dim] - self.pos) <= self.threshold

@@ -49,9 +48,9 @@


 class Scaling(NamedTuple):
-    ant: float
-    point: float
-    swimmer: float
+    ant: Optional[float]
+    point: Optional[float]
+    swimmer: Optional[float]


 class MazeTask(ABC):
@@ -59,10 +58,14 @@ class MazeTask(ABC):
     PENALTY: Optional[float] = None
     MAZE_SIZE_SCALING: Scaling = Scaling(8.0, 4.0, 4.0)
     INNER_REWARD_SCALING: float = 0.01
-    TOP_DOWN_VIEW: bool = False
+    # For Fall/Push/BlockMaze
     OBSERVE_BLOCKS: bool = False
+    # For Billiard
     OBSERVE_BALLS: bool = False
+    OBJECT_BALL_SIZE: float = 1.0
+    # Unused now
     PUT_SPIN_NEAR_AGENT: bool = False
+    TOP_DOWN_VIEW: bool = False

     def __init__(self, scale: float) -> None:
         self.goals = []
@@ -143,7 +146,7 @@ class DistRewardSimpleRoom(GoalRewardSimpleRoom, DistRewardMixIn):


 class GoalRewardPush(GoalRewardUMaze):
-    TOP_DOWN_VIEW = True
+    OBSERVE_BLOCKS: bool = True

     def __init__(self, scale: float) -> None:
         super().__init__(scale)
@@ -166,7 +169,7 @@ class DistRewardPush(GoalRewardPush, DistRewardMixIn):


 class GoalRewardFall(GoalRewardUMaze):
-    TOP_DOWN_VIEW = True
+    OBSERVE_BLOCKS: bool = True

     def __init__(self, scale: float) -> None:
         super().__init__(scale)
@@ -195,9 +198,9 @@ class GoalReward2Rooms(MazeTask):
     PENALTY: float = -0.0001
     MAZE_SIZE_SCALING: Scaling = Scaling(4.0, 4.0, 4.0)

-    def __init__(self, scale: float) -> None:
+    def __init__(self, scale: float, goal: Tuple[float, float] = (4.0, -2.0)) -> None:
         super().__init__(scale)
-        self.goals = [MazeGoal(np.array([0.0, 4.0 * scale]))]
+        self.goals = [MazeGoal(np.array(goal) * scale)]

     def reward(self, obs: np.ndarray) -> float:
         for goal in self.goals:
@@ -210,10 +213,10 @@ class GoalReward2Rooms(MazeTask):
         E, B, R = MazeCell.EMPTY, MazeCell.BLOCK, MazeCell.ROBOT
         return [
             [B, B, B, B, B, B, B, B],
-            [B, R, E, E, E, E, E, B],
-            [B, E, E, E, E, E, E, B],
-            [B, B, B, B, B, E, B, B],
-            [B, E, E, E, E, E, E, B],
+            [B, E, E, E, B, E, E, B],
+            [B, E, E, E, B, E, E, B],
+            [B, E, R, E, B, E, E, B],
+            [B, E, E, E, B, E, E, B],
             [B, E, E, E, E, E, E, B],
             [B, B, B, B, B, B, B, B],
         ]
@@ -224,9 +227,17 @@ class DistReward2Rooms(GoalReward2Rooms, DistRewardMixIn):


 class SubGoal2Rooms(GoalReward2Rooms):
-    def __init__(self, scale: float) -> None:
-        super().__init__(scale)
-        self.goals.append(MazeGoal(np.array([5.0 * scale, 0.0 * scale]), 0.5, GREEN))
+    def __init__(
+        self,
+        scale: float,
+        primary_goal: Tuple[float, float] = (4.0, -2.0),
+        subgoals: List[Tuple[float, float]] = [(1.0, -2.0), (-1.0, 2.0)],
+    ) -> None:
+        super().__init__(scale, primary_goal)
+        for subgoal in subgoals:
+            self.goals.append(
+                MazeGoal(np.array(subgoal) * scale, reward_scale=0.5, rgb=GREEN)
+            )


 class GoalReward4Rooms(MazeTask):
@@ -305,7 +316,21 @@ class DistRewardTRoom(GoalRewardTRoom, DistRewardMixIn):
     pass


+class SubGoalTRoom(GoalRewardTRoom):
+    def __init__(
+        self,
+        scale: float,
+        primary_goal: Tuple[float, float] = (2.0, -3.0),
+        subgoal: Tuple[float, float] = (-2.0, -3.0),
+    ) -> None:
+        super().__init__(scale, primary_goal)
+        self.goals.append(
+            MazeGoal(np.array(subgoal) * scale, reward_scale=0.5, rgb=GREEN)
+        )
+
+
 class GoalRewardBlockMaze(GoalRewardUMaze):
+    MAZE_SIZE_SCALING: Scaling = Scaling(8.0, 4.0, None)
     OBSERVE_BLOCKS: bool = True

     def __init__(self, scale: float) -> None:
@@ -333,30 +358,46 @@ class DistRewardBlockMaze(GoalRewardBlockMaze, DistRewardMixIn):
 class GoalRewardBilliard(MazeTask):
     REWARD_THRESHOLD: float = 0.9
     PENALTY: float = -0.0001
-    MAZE_SIZE_SCALING: Scaling = Scaling(4.0, 3.0, 3.0)
+    MAZE_SIZE_SCALING: Scaling = Scaling(None, 3.0, None)
     OBSERVE_BALLS: bool = True
+    GOAL_SIZE: float = 0.3

-    def __init__(self, scale: float, goal: Tuple[float, float] = (1.0, -2.0)) -> None:
+    def __init__(self, scale: float, goal: Tuple[float, float] = (2.0, -3.0)) -> None:
         super().__init__(scale)
         goal = np.array(goal) * scale
-        self.goals = [MazeGoal(goal, threshold=1.25, custom_size=0.25)]
+        self.goals.append(
+            MazeGoal(goal, threshold=self._threshold(), custom_size=self.GOAL_SIZE)
+        )
+
+    def _threshold(self) -> float:
+        return self.OBJECT_BALL_SIZE + self.GOAL_SIZE

     def reward(self, obs: np.ndarray) -> float:
-        return 1.0 if self.termination(obs) else self.PENALTY
+        object_pos = obs[3:6]
+        for goal in self.goals:
+            if goal.neighbor(object_pos):
+                return goal.reward_scale
+        return self.PENALTY

     def termination(self, obs: np.ndarray) -> bool:
-        return super().termination(obs[3:6])
+        object_pos = obs[3:6]
+        for goal in self.goals:
+            if goal.neighbor(object_pos):
+                return True
+        return False

     @staticmethod
     def create_maze() -> List[List[MazeCell]]:
         E, B = MazeCell.EMPTY, MazeCell.BLOCK
         R, M = MazeCell.ROBOT, MazeCell.OBJECT_BALL
         return [
-            [B, B, B, B, B],
-            [B, E, E, E, B],
-            [B, E, M, E, B],
-            [B, E, R, E, B],
-            [B, B, B, B, B],
+            [B, B, B, B, B, B, B],
+            [B, E, E, E, E, E, B],
+            [B, E, E, E, E, E, B],
+            [B, E, E, M, E, E, B],
+            [B, E, E, R, E, E, B],
+            [B, E, E, E, E, E, B],
+            [B, B, B, B, B, B, B],
         ]
@@ -365,6 +406,50 @@ class DistRewardBilliard(GoalRewardBilliard):
         return -self.goals[0].euc_dist(obs[3:6]) / self.scale


+class SubGoalBilliard(GoalRewardBilliard):
+    def __init__(
+        self,
+        scale: float,
+        primary_goal: Tuple[float, float] = (2.0, -3.0),
+        subgoals: List[Tuple[float, float]] = [(-2.0, -3.0), (-2.0, 1.0), (2.0, 1.0)],
+    ) -> None:
+        super().__init__(scale, primary_goal)
+        for subgoal in subgoals:
+            self.goals.append(
+                MazeGoal(
+                    np.array(subgoal) * scale,
+                    reward_scale=0.5,
+                    rgb=GREEN,
+                    threshold=self._threshold(),
+                    custom_size=self.GOAL_SIZE,
+                )
+            )
+
+
+class BanditBilliard(SubGoalBilliard):
+    def __init__(
+        self,
+        scale: float,
+        primary_goal: Tuple[float, float] = (4.0, -2.0),
+        subgoals: List[Tuple[float, float]] = [(4.0, 2.0)],
+    ) -> None:
+        super().__init__(scale, primary_goal, subgoals)
+
+    @staticmethod
+    def create_maze() -> List[List[MazeCell]]:
+        E, B = MazeCell.EMPTY, MazeCell.BLOCK
+        R, M = MazeCell.ROBOT, MazeCell.OBJECT_BALL
+        return [
+            [B, B, B, B, B, B, B],
+            [B, E, E, B, B, E, B],
+            [B, E, E, E, E, E, B],
+            [B, R, M, E, B, B, B],
+            [B, E, E, E, E, E, B],
+            [B, E, E, E, E, E, B],
+            [B, B, B, B, B, B, B],
+        ]
+
+
 class TaskRegistry:
     REGISTRY: Dict[str, List[Type[MazeTask]]] = {
         "SimpleRoom": [DistRewardSimpleRoom, GoalRewardSimpleRoom],
@@ -373,9 +458,14 @@ class TaskRegistry:
         "Fall": [DistRewardFall, GoalRewardFall],
         "2Rooms": [DistReward2Rooms, GoalReward2Rooms, SubGoal2Rooms],
         "4Rooms": [DistReward4Rooms, GoalReward4Rooms, SubGoal4Rooms],
-        "TRoom": [DistRewardTRoom, GoalRewardTRoom],
+        "TRoom": [DistRewardTRoom, GoalRewardTRoom, SubGoalTRoom],
         "BlockMaze": [DistRewardBlockMaze, GoalRewardBlockMaze],
-        "Billiard": [DistRewardBilliard, GoalRewardBilliard],
+        "Billiard": [
+            DistRewardBilliard,
+            GoalRewardBilliard,
+            SubGoalBilliard,
+            BanditBilliard,
+        ],
     }

     @staticmethod
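The billiard tasks above judge success by the object ball, not the robot: the ball's position is read from `obs[3:6]`, and a goal counts as reached once the center-to-center distance falls below `OBJECT_BALL_SIZE + GOAL_SIZE`, i.e. as soon as the two spheres touch. A standalone sketch of that logic (a hypothetical helper mirroring `GoalRewardBilliard.reward`, assuming `goals` carry the `pos`, `dim`, `threshold`, and `reward_scale` attributes of `MazeGoal`):

```python
import numpy as np

PENALTY = -0.0001

def billiard_reward(obs: np.ndarray, goals) -> float:
    object_pos = obs[3:6]  # ball coordinates appended after the robot's own state
    for goal in goals:
        # goal.threshold == OBJECT_BALL_SIZE + GOAL_SIZE, so the first
        # goal whose marker the ball touches determines the reward
        if np.linalg.norm(object_pos[: goal.dim] - goal.pos) <= goal.threshold:
            return goal.reward_scale  # 1.0 for the primary goal, 0.5 for a subgoal
    return PENALTY
```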
diff --git a/screenshots/AntFall.png b/screenshots/AntFall.png
new file mode 100644
index 0000000..344ddcb
Binary files /dev/null and b/screenshots/AntFall.png differ
diff --git a/screenshots/AntPush.png b/screenshots/AntPush.png
new file mode 100644
index 0000000..3687bb4
Binary files /dev/null and b/screenshots/AntPush.png differ
diff --git a/screenshots/Point4Rooms.png b/screenshots/Point4Rooms.png
index 41fa577..9f0c163 100644
Binary files a/screenshots/Point4Rooms.png and b/screenshots/Point4Rooms.png differ
diff --git a/screenshots/PointBilliard.png b/screenshots/PointBilliard.png
new file mode 100644
index 0000000..ec23282
Binary files /dev/null and b/screenshots/PointBilliard.png differ
diff --git a/screenshots/PointFall.png b/screenshots/PointFall.png
deleted file mode 100644
index f9ce24a..0000000
Binary files a/screenshots/PointFall.png and /dev/null differ
diff --git a/screenshots/PointPush.png b/screenshots/PointPush.png
deleted file mode 100644
index 003dc7a..0000000
Binary files a/screenshots/PointPush.png and /dev/null differ
diff --git a/screenshots/PointUMaze.png b/screenshots/PointUMaze.png
index e4cf4e0..40aa637 100644
Binary files a/screenshots/PointUMaze.png and b/screenshots/PointUMaze.png differ
diff --git a/tests/test_envs.py b/tests/test_envs.py
index d769fd8..92dc9e8 100644
--- a/tests/test_envs.py
+++ b/tests/test_envs.py
@@ -36,6 +36,20 @@ def test_point_maze(maze_id):
         assert r < 0.0


+@pytest.mark.parametrize("maze_id", ["2Rooms", "4Rooms", "Billiard"])
+def test_subgoal_envs(maze_id):
+    env = gym.make(f"Point{maze_id}-v2")
+    s0 = env.reset()
+    s, r, _, _ = env.step(env.action_space.sample())
+    if not env.unwrapped.has_extended_obs:
+        assert s0.shape == (7,)
+        assert s.shape == (7,)
+    elif env.unwrapped._observe_balls:
+        assert s0.shape == (10,)
+        assert s.shape == (10,)
+    assert len(env.unwrapped._task.goals) > 1
+
+
 @pytest.mark.parametrize("maze_id", mujoco_maze.TaskRegistry.keys())
 def test_reacher_maze(maze_id):
     for inhibited in ["Fall", "Push", "Block", "Billiard"]:
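The new test also pins down the observation layout: a plain point observation has 7 entries, and tasks with `OBSERVE_BALLS` append the object ball's x, y, z for a total of 10. A quick interactive check, assuming the same attributes the test relies on (`has_extended_obs`, `_task`):

```python
import gym
import mujoco_maze  # noqa: F401

env = gym.make("PointBilliard-v2")
obs = env.reset()
assert obs.shape == (10,)                  # 7 robot entries + 3 for the object ball
assert len(env.unwrapped._task.goals) > 1  # primary goal plus subgoals
```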