From d2c661d55cb753c53d362ad4660ea9bcad335d48 Mon Sep 17 00:00:00 2001
From: kngwyu
Date: Thu, 24 Sep 2020 23:40:33 +0900
Subject: [PATCH] Swimmer

---
 mujoco_maze/__init__.py        | 23 +++++++++--
 mujoco_maze/agent_model.py     | 11 ++---
 mujoco_maze/ant.py             | 27 ++++++-------
 mujoco_maze/assets/point.xml   |  1 +
 mujoco_maze/assets/swimmer.xml | 39 ++++++++++++++++++
 mujoco_maze/maze_env.py        | 14 ++++---
 mujoco_maze/maze_env_utils.py  |  1 +
 mujoco_maze/maze_task.py       | 40 +++++++++++++++----
 mujoco_maze/swimmer.py         | 73 ++++++++++++++++++++++++++++++++++
 tests/test_envs.py             | 13 ++++++
 10 files changed, 205 insertions(+), 37 deletions(-)
 create mode 100644 mujoco_maze/assets/swimmer.xml
 create mode 100644 mujoco_maze/swimmer.py

diff --git a/mujoco_maze/__init__.py b/mujoco_maze/__init__.py
index 9acacab..652f7ee 100644
--- a/mujoco_maze/__init__.py
+++ b/mujoco_maze/__init__.py
@@ -11,9 +11,11 @@ import gym
 from mujoco_maze.ant import AntEnv
 from mujoco_maze.maze_task import TaskRegistry
 from mujoco_maze.point import PointEnv
+from mujoco_maze.swimmer import SwimmerEnv
 
 for maze_id in TaskRegistry.keys():
     for i, task_cls in enumerate(TaskRegistry.tasks(maze_id)):
+        # Ant
         gym.envs.register(
             id=f"Ant{maze_id}-v{i}",
             entry_point="mujoco_maze.maze_env:MazeEnv",
@@ -26,9 +28,7 @@ for maze_id in TaskRegistry.keys():
             max_episode_steps=1000,
             reward_threshold=task_cls.REWARD_THRESHOLD,
         )
-
-for maze_id in TaskRegistry.keys():
-    for i, task_cls in enumerate(TaskRegistry.tasks(maze_id)):
+        # Point
         gym.envs.register(
             id=f"Point{maze_id}-v{i}",
             entry_point="mujoco_maze.maze_env:MazeEnv",
@@ -42,5 +42,22 @@ for maze_id in TaskRegistry.keys():
             reward_threshold=task_cls.REWARD_THRESHOLD,
         )
 
+        if "Push" in maze_id or "Fall" in maze_id:
+            continue
+
+        # Swimmer
+        gym.envs.register(
+            id=f"Swimmer{maze_id}-v{i}",
+            entry_point="mujoco_maze.maze_env:MazeEnv",
+            kwargs=dict(
+                model_cls=SwimmerEnv,
+                maze_task=task_cls,
+                maze_size_scaling=task_cls.MAZE_SIZE_SCALING.swimmer,
+                inner_reward_scaling=task_cls.INNER_REWARD_SCALING,
+            ),
+            max_episode_steps=1000,
+            reward_threshold=task_cls.REWARD_THRESHOLD,
+        )
+
 
 __version__ = "0.1.0"
diff --git a/mujoco_maze/agent_model.py b/mujoco_maze/agent_model.py
index cefaa49..d209d78 100644
--- a/mujoco_maze/agent_model.py
+++ b/mujoco_maze/agent_model.py
@@ -1,6 +1,7 @@
 """Common APIs for defining mujoco robot.
 """
 from abc import ABC, abstractmethod
+from typing import Optional
 
 import numpy as np
 from gym.envs.mujoco.mujoco_env import MujocoEnv
@@ -9,9 +10,9 @@ from gym.utils import EzPickle
 
 class AgentModel(ABC, MujocoEnv, EzPickle):
     FILE: str
-    ORI_IND: int
     MANUAL_COLLISION: bool
-    RADIUS: float
+    ORI_IND: int
+    RADIUS: Optional[float] = None
 
     def __init__(self, file_path: str, frame_skip: int) -> None:
         MujocoEnv.__init__(self, file_path, frame_skip)
@@ -30,18 +31,12 @@ class AgentModel(ABC, MujocoEnv, EzPickle):
         """
         pass
 
-    @abstractmethod
     def get_xy(self) -> np.ndarray:
         """Returns the coordinate of the agent.
         """
         pass
 
-    @abstractmethod
     def set_xy(self, xy: np.ndarray) -> None:
         """Set the coordinate of the agent.
         """
         pass
-
-    @abstractmethod
-    def get_ori(self) -> float:
-        pass
diff --git a/mujoco_maze/ant.py b/mujoco_maze/ant.py
index cc7c441..34356e1 100644
--- a/mujoco_maze/ant.py
+++ b/mujoco_maze/ant.py
@@ -6,7 +6,7 @@ Based on `models`_ and `gym`_ (both ant and ant-v3).
 .. _gym: https://github.com/openai/gym
 """
 
-from typing import Callable, Optional, Tuple
+from typing import Callable, Tuple
 
 import numpy as np
 
@@ -39,14 +39,15 @@ class AntEnv(AgentModel):
     FILE: str = "ant.xml"
     ORI_IND: int = 3
     MANUAL_COLLISION: bool = False
-    RADIUS: float = 0.2
 
     def __init__(
         self,
-        file_path: Optional[str] = None,
-        ctrl_cost_weight: float = 0.0001,
+        file_path: str,
+        forward_reward_weight: float = 1.0,
+        ctrl_cost_weight: float = 1e-4,
         forward_reward_fn: ForwardRewardFn = forward_reward_vnorm,
     ) -> None:
+        self._forward_reward_weight = forward_reward_weight
         self._ctrl_cost_weight = ctrl_cost_weight
         self._forward_reward_fn = forward_reward_fn
         super().__init__(file_path, 5)
@@ -63,12 +64,11 @@
         forward_reward = self._forward_reward(xy_pos_before)
         ctrl_cost = self._ctrl_cost_weight * np.square(action).sum()
 
-        ob = self._get_obs()
         return (
-            ob,
-            forward_reward - ctrl_cost,
+            self._get_obs(),
+            self._forward_reward_weight * forward_reward - ctrl_cost,
             False,
-            dict(reward_forward=forward_reward, reward_ctrl=-ctrl_cost,),
+            dict(reward_forward=forward_reward, reward_ctrl=-ctrl_cost),
         )
 
     def _get_obs(self):
@@ -82,7 +82,7 @@
 
     def reset_model(self):
         qpos = self.init_qpos + self.np_random.uniform(
-            size=self.model.nq, low=-0.1, high=0.1
+            size=self.model.nq, low=-0.1, high=0.1,
         )
         qvel = self.init_qvel + self.np_random.randn(self.model.nv) * 0.1
 
@@ -92,18 +92,17 @@
         self.set_state(qpos, qvel)
         return self._get_obs()
 
-    def get_ori(self):
+    def get_ori(self) -> np.ndarray:
         ori = [0, 1, 0, 0]
-        ori_ind = self.ORI_IND
-        rot = self.sim.data.qpos[ori_ind : ori_ind + 4]  # take the quaternion
+        rot = self.sim.data.qpos[self.ORI_IND : self.ORI_IND + 4]  # take the quaternion
         ori = q_mult(q_mult(rot, ori), q_inv(rot))[1:3]  # project onto x-y plane
         ori = np.arctan2(ori[1], ori[0])
         return ori
 
-    def set_xy(self, xy):
+    def set_xy(self, xy: np.ndarray) -> None:
         qpos = self.sim.data.qpos.copy()
         qpos[:2] = xy
         self.set_state(qpos, self.sim.data.qvel)
 
-    def get_xy(self):
+    def get_xy(self) -> np.ndarray:
         return np.copy(self.sim.data.qpos[:2])
diff --git a/mujoco_maze/assets/point.xml b/mujoco_maze/assets/point.xml
index 2d90f75..e0d3302 100755
--- a/mujoco_maze/assets/point.xml
+++ b/mujoco_maze/assets/point.xml
@@ -15,6 +15,7 @@
+
diff --git a/mujoco_maze/assets/swimmer.xml b/mujoco_maze/assets/swimmer.xml
new file mode 100644
index 0000000..1a50dfc
--- /dev/null
+++ b/mujoco_maze/assets/swimmer.xml
@@ -0,0 +1,39 @@
+
+
+
diff --git a/mujoco_maze/maze_env.py b/mujoco_maze/maze_env.py
index 31918f3..8c0febd 100644
--- a/mujoco_maze/maze_env.py
+++ b/mujoco_maze/maze_env.py
@@ -64,9 +64,14 @@
             (x - torso_x, y - torso_y) for x, y in self._find_all_robots()
         ]
 
-        self._collision = maze_env_utils.CollisionDetector(
-            structure, size_scaling, torso_x, torso_y, model_cls.RADIUS,
-        )
+        if model_cls.MANUAL_COLLISION:
+            if model_cls.RADIUS is None:
+                raise ValueError("Manual collision needs radius of the model")
+            self._collision = maze_env_utils.CollisionDetector(
+                structure, size_scaling, torso_x, torso_y, model_cls.RADIUS,
+            )
+        else:
+            self._collision = None
 
         self._xy_to_rowcol = lambda x, y: (
             2 + (y + size_scaling / 2) / size_scaling,
@@ -226,7 +231,7 @@
         geoms = torso.findall(".//geom")
         for geom in geoms:
             if "name" not in geom.attrib:
-                raise Exception("Every geom of the torso must have a name " "defined")
+                raise Exception("Every geom of the torso must have a name")
 
         # Set goals
asset = tree.find(".//asset") @@ -344,7 +349,6 @@ class MazeEnv(gym.Env): robot_x, robot_y = self.wrapped_env.get_body_com("torso")[:2] self._robot_x = robot_x self._robot_y = robot_y - self._robot_ori = self.get_ori() structure = self._maze_structure size_scaling = self._maze_size_scaling diff --git a/mujoco_maze/maze_env_utils.py b/mujoco_maze/maze_env_utils.py index a44830a..98280ef 100644 --- a/mujoco_maze/maze_env_utils.py +++ b/mujoco_maze/maze_env_utils.py @@ -138,6 +138,7 @@ class Collision: class CollisionDetector: """For manual collision detection. """ + EPS: float = 0.05 NEIGHBORS: List[Tuple[int, int]] = [[0, -1], [-1, 0], [0, 1], [1, 0]] diff --git a/mujoco_maze/maze_task.py b/mujoco_maze/maze_task.py index 9d36a1e..b2be5dc 100644 --- a/mujoco_maze/maze_task.py +++ b/mujoco_maze/maze_task.py @@ -46,11 +46,12 @@ class MazeGoal: class Scaling(NamedTuple): ant: float point: float + swimmer: float class MazeTask(ABC): REWARD_THRESHOLD: float - MAZE_SIZE_SCALING: Scaling = Scaling(8.0, 4.0) + MAZE_SIZE_SCALING: Scaling = Scaling(8.0, 4.0, 4.0) INNER_REWARD_SCALING: float = 0.01 TOP_DOWN_VIEW: bool = False OBSERVE_BLOCKS: bool = False @@ -88,6 +89,32 @@ class DistRewardMixIn: return -self.goals[0].euc_dist(obs) / self.scale +class GoalRewardSimpleRoom(MazeTask): + """ Very easy task. For testing. + """ + REWARD_THRESHOLD: float = 0.9 + + def __init__(self, scale: float) -> None: + super().__init__(scale) + self.goals = [MazeGoal(np.array([2.0 * scale, 0.0]))] + + def reward(self, obs: np.ndarray) -> float: + return 1.0 if self.termination(obs) else -0.0001 + + @staticmethod + def create_maze() -> List[List[MazeCell]]: + E, B, R = MazeCell.EMPTY, MazeCell.BLOCK, MazeCell.ROBOT + return [ + [B, B, B, B, B], + [B, R, E, E, B], + [B, B, B, B, B], + ] + + +class DistRewardSimpleRoom(GoalRewardSimpleRoom, DistRewardMixIn): + pass + + class GoalRewardUMaze(MazeTask): REWARD_THRESHOLD: float = 0.9 @@ -163,7 +190,7 @@ class DistRewardFall(GoalRewardFall, DistRewardMixIn): class GoalReward2Rooms(MazeTask): REWARD_THRESHOLD: float = 0.9 - MAZE_SIZE_SCALING: Scaling = Scaling(4.0, 4.0) + MAZE_SIZE_SCALING: Scaling = Scaling(4.0, 4.0, 4.0) def __init__(self, scale: float) -> None: super().__init__(scale) @@ -201,7 +228,7 @@ class SubGoal2Rooms(GoalReward2Rooms): class GoalReward4Rooms(MazeTask): REWARD_THRESHOLD: float = 0.9 - MAZE_SIZE_SCALING: Scaling = Scaling(4.0, 4.0) + MAZE_SIZE_SCALING: Scaling = Scaling(4.0, 4.0, 4.0) def __init__(self, scale: float) -> None: super().__init__(scale) @@ -244,12 +271,10 @@ class SubGoal4Rooms(GoalReward4Rooms): class GoalRewardTRoom(MazeTask): REWARD_THRESHOLD: float = 0.9 - MAZE_SIZE_SCALING: Scaling = Scaling(4.0, 4.0) + MAZE_SIZE_SCALING: Scaling = Scaling(4.0, 4.0, 4.0) def __init__( - self, - scale: float, - goals: List[Tuple[float, float]] = [(2.0, -3.0)], + self, scale: float, goals: List[Tuple[float, float]] = [(2.0, -3.0)], ) -> None: super().__init__(scale) self.goals = [] @@ -281,6 +306,7 @@ class DistRewardTRoom(GoalRewardTRoom, DistRewardMixIn): class TaskRegistry: REGISTRY: Dict[str, List[Type[MazeTask]]] = { + "SimpleRoom": [DistRewardSimpleRoom, GoalRewardSimpleRoom], "UMaze": [DistRewardUMaze, GoalRewardUMaze], "Push": [DistRewardPush, GoalRewardPush], "Fall": [DistRewardFall, GoalRewardFall], diff --git a/mujoco_maze/swimmer.py b/mujoco_maze/swimmer.py new file mode 100644 index 0000000..ef56e89 --- /dev/null +++ b/mujoco_maze/swimmer.py @@ -0,0 +1,73 @@ +""" +Swimmer robot as an explorer in the maze. +Based on `gym`_ (swimmer-v3). + +.. 
+.. _gym: https://github.com/openai/gym
+"""
+
+from typing import Tuple
+
+import numpy as np
+
+from mujoco_maze.agent_model import AgentModel
+from mujoco_maze.ant import ForwardRewardFn, forward_reward_vnorm
+
+
+class SwimmerEnv(AgentModel):
+    FILE: str = "swimmer.xml"
+    MANUAL_COLLISION: bool = False
+
+    def __init__(
+        self,
+        file_path: str = None,
+        forward_reward_weight: float = 1.0,
+        ctrl_cost_weight: float = 1e-4,
+        forward_reward_fn: ForwardRewardFn = forward_reward_vnorm,
+    ) -> None:
+        self._forward_reward_weight = forward_reward_weight
+        self._ctrl_cost_weight = ctrl_cost_weight
+        self._forward_reward_fn = forward_reward_fn
+        super().__init__(file_path, 4)
+
+    def _forward_reward(self, xy_pos_before: np.ndarray) -> Tuple[float, np.ndarray]:
+        xy_pos_after = self.sim.data.qpos[:2].copy()
+        xy_velocity = (xy_pos_after - xy_pos_before) / self.dt
+        return self._forward_reward_fn(xy_velocity)
+
+    def step(self, action: np.ndarray) -> Tuple[np.ndarray, float, bool, dict]:
+        xy_pos_before = self.sim.data.qpos[:2].copy()
+        self.do_simulation(action, self.frame_skip)
+
+        forward_reward = self._forward_reward(xy_pos_before)
+        ctrl_cost = self._ctrl_cost_weight * np.sum(np.square(action))
+        return (
+            self._get_obs(),
+            self._forward_reward_weight * forward_reward - ctrl_cost,
+            False,
+            dict(reward_forward=forward_reward, reward_ctrl=-ctrl_cost),
+        )
+
+    def _get_obs(self) -> np.ndarray:
+        position = self.sim.data.qpos.flat.copy()
+        velocity = self.sim.data.qvel.flat.copy()
+        observation = np.concatenate([position, velocity]).ravel()
+        return observation
+
+    def reset_model(self) -> np.ndarray:
+        qpos = self.init_qpos + self.np_random.uniform(
+            low=-0.1, high=0.1, size=self.model.nq,
+        )
+        qvel = self.init_qvel + self.np_random.uniform(
+            low=-0.1, high=0.1, size=self.model.nv,
+        )
+
+        self.set_state(qpos, qvel)
+        return self._get_obs()
+
+    def set_xy(self, xy: np.ndarray) -> None:
+        qpos = self.sim.data.qpos.copy()
+        qpos[:2] = xy
+        self.set_state(qpos, self.sim.data.qvel)
+
+    def get_xy(self) -> np.ndarray:
+        return np.copy(self.sim.data.qpos[:2])
diff --git a/tests/test_envs.py b/tests/test_envs.py
index 6e925a1..1cf331a 100644
--- a/tests/test_envs.py
+++ b/tests/test_envs.py
@@ -26,6 +26,19 @@ def test_point_maze(maze_id):
         assert s.shape == (7,)
 
 
+@pytest.mark.parametrize("maze_id", mujoco_maze.TaskRegistry.keys())
+def test_swimmer_maze(maze_id):
+    if "Fall" in maze_id or "Push" in maze_id:
+        return
+    for i in range(2):
+        env = gym.make(f"Swimmer{maze_id}-v{i}")
+        s0 = env.reset()
+        s, _, _, _ = env.step(env.action_space.sample())
+        if not env.unwrapped._top_down_view:
+            assert s0.shape == (11,)
+        assert s.shape == (11,)
+
+
 @pytest.mark.parametrize("v", [0, 1])
 def test_maze_args(v):
     env = gym.make(f"PointTRoom-v{v}", task_kwargs={"goals": [(-2.0, -3.0)]})
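
Note (not part of the patch): with the registration loop added in mujoco_maze/__init__.py, the new Swimmer variants are created through gym.make exactly like the existing Ant and Point environments; Push and Fall mazes are skipped for the Swimmer. A minimal usage sketch, assuming mujoco_maze and a working MuJoCo/mujoco-py setup are installed:

    # Smoke test for the Swimmer maze environments registered by this patch.
    # "SwimmerUMaze-v0" is the DistReward variant (index 0 of TaskRegistry["UMaze"]).
    import gym
    import mujoco_maze  # noqa: F401  -- importing the package runs gym.envs.register

    env = gym.make("SwimmerUMaze-v0")
    obs = env.reset()
    print(obs.shape)  # (11,) per tests/test_envs.py when the top-down view is off

    for _ in range(100):
        obs, reward, done, info = env.step(env.action_space.sample())
        if done:  # reaching the goal (or the 1000-step limit) ends the episode
            break
    env.close()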