From 083d6f8dc804dd56e592f74840a5a5b2447497ff Mon Sep 17 00:00:00 2001
From: kngwyu
Date: Wed, 24 Jun 2020 18:44:47 +0900
Subject: [PATCH] More precise obs space for point mazes

---
 mujoco_maze/__init__.py       |  1 -
 mujoco_maze/agent_model.py    | 13 ++-----
 mujoco_maze/ant.py            | 64 ++++++++---------------------------
 mujoco_maze/ant_maze_env.py   |  2 +-
 mujoco_maze/maze_env.py       | 55 ++++++++++++++++++++----------
 mujoco_maze/maze_env_utils.py |  5 +--
 mujoco_maze/maze_task.py      |  4 +++
 mujoco_maze/point.py          | 57 +++++++++++++++----------------
 tests/test_envs.py            |  3 +-
 9 files changed, 91 insertions(+), 113 deletions(-)

diff --git a/mujoco_maze/__init__.py b/mujoco_maze/__init__.py
index 2275a06..d390455 100644
--- a/mujoco_maze/__init__.py
+++ b/mujoco_maze/__init__.py
@@ -2,7 +2,6 @@
 import gym
 
 from mujoco_maze.maze_task import TaskRegistry
-
 
 MAZE_IDS = ["Maze", "Push", "Fall", "4Rooms"]  # TODO: Block, BlockMaze
 
diff --git a/mujoco_maze/agent_model.py b/mujoco_maze/agent_model.py
index 5436c96..a6b95e5 100644
--- a/mujoco_maze/agent_model.py
+++ b/mujoco_maze/agent_model.py
@@ -1,10 +1,10 @@
 """Common API definition for Ant and Point.
 """
 from abc import ABC, abstractmethod
+
+import numpy as np
 from gym.envs.mujoco.mujoco_env import MujocoEnv
 from gym.utils import EzPickle
-from mujoco_py import MjSimState
-import numpy as np
 
 
 class AgentModel(ABC, MujocoEnv, EzPickle):
@@ -15,15 +15,6 @@ class AgentModel(ABC, MujocoEnv, EzPickle):
         MujocoEnv.__init__(self, file_path, frame_skip)
         EzPickle.__init__(self)
 
-    def set_state_without_forward(self, qpos, qvel):
-        assert qpos.shape == (self.model.nq,) and qvel.shape == (self.model.nv,)
-        old_state = self.sim.get_state()
-        new_state = MjSimState(
-            old_state.time, qpos, qvel, old_state.act, old_state.udd_state
-        )
-        self.sim.set_state(new_state)
-        self.sim.forward()
-
     @abstractmethod
     def _get_obs(self) -> np.ndarray:
         """Returns the observation from the model.
diff --git a/mujoco_maze/ant.py b/mujoco_maze/ant.py
index 1dcabc5..de58e29 100644
--- a/mujoco_maze/ant.py
+++ b/mujoco_maze/ant.py
@@ -16,6 +16,8 @@
 """Wrapper for creating the ant environment in gym_mujoco."""
 
 import math
+from typing import Optional, Tuple
+
 import numpy as np
 
 from mujoco_maze.agent_model import AgentModel
@@ -37,39 +39,23 @@ class AntEnv(AgentModel):
     FILE = "ant.xml"
     ORI_IND = 3
 
-    def __init__(
-        self,
-        file_path=None,
-        expose_all_qpos=True,
-        expose_body_coms=None,
-        expose_body_comvels=None,
-    ):
-        self._expose_all_qpos = expose_all_qpos
-        self._expose_body_coms = expose_body_coms
-        self._expose_body_comvels = expose_body_comvels
-        self._body_com_indices = {}
-        self._body_comvel_indices = {}
-
+    def __init__(self, file_path: Optional[str] = None) -> None:
         super().__init__(file_path, 5)
 
-    def _step(self, a):
-        return self.step(a)
-
-    def step(self, a):
+    def step(self, action: np.ndarray) -> Tuple[np.ndarray, float, bool, dict]:
         xposbefore = self.get_body_com("torso")[0]
-        self.do_simulation(a, self.frame_skip)
+        self.do_simulation(action, self.frame_skip)
         xposafter = self.get_body_com("torso")[0]
         forward_reward = (xposafter - xposbefore) / self.dt
-        ctrl_cost = 0.5 * np.square(a).sum()
+        ctrl_cost = 0.5 * np.square(action).sum()
         survive_reward = 1.0
         reward = forward_reward - ctrl_cost + survive_reward
         _ = self.state_vector()
-        done = False
         ob = self._get_obs()
         return (
             ob,
             reward,
-            done,
+            False,
             dict(
                 reward_forward=forward_reward,
                 reward_ctrl=-ctrl_cost,
@@ -79,34 +65,12 @@ class AntEnv(AgentModel):
 
     def _get_obs(self):
         # No cfrc observation
-        if self._expose_all_qpos:
-            obs = np.concatenate(
-                [
-                    self.sim.data.qpos.flat[:15],  # Ensures only ant obs.
-                    self.sim.data.qvel.flat[:14],
-                ]
-            )
-        else:
-            obs = np.concatenate(
-                [self.sim.data.qpos.flat[2:15], self.sim.data.qvel.flat[:14],]
-            )
-
-        if self._expose_body_coms is not None:
-            for name in self._expose_body_coms:
-                com = self.get_body_com(name)
-                if name not in self._body_com_indices:
-                    indices = range(len(obs), len(obs) + len(com))
-                    self._body_com_indices[name] = indices
-                obs = np.concatenate([obs, com])
-
-        if self._expose_body_comvels is not None:
-            for name in self._expose_body_comvels:
-                comvel = self.get_body_comvel(name)
-                if name not in self._body_comvel_indices:
-                    indices = range(len(obs), len(obs) + len(comvel))
-                    self._body_comvel_indices[name] = indices
-                obs = np.concatenate([obs, comvel])
-        return obs
+        return np.concatenate(
+            [
+                self.sim.data.qpos.flat[:15],  # Ensures only ant obs.
+                self.sim.data.qvel.flat[:14],
+            ]
+        )
 
     def reset_model(self):
         qpos = self.init_qpos + self.np_random.uniform(
@@ -137,7 +101,7 @@ class AntEnv(AgentModel):
         qpos[1] = xy[1]
 
         qvel = self.sim.data.qvel
-        self.set_state_without_forwarding(qpos, qvel)
+        self.set_state(qpos, qvel)
 
     def get_xy(self):
         return np.copy(self.sim.data.qpos[:2])
diff --git a/mujoco_maze/ant_maze_env.py b/mujoco_maze/ant_maze_env.py
index 829a842..065414d 100644
--- a/mujoco_maze/ant_maze_env.py
+++ b/mujoco_maze/ant_maze_env.py
@@ -13,8 +13,8 @@
 # limitations under the License.
 # ==============================================================================
 
-from mujoco_maze.maze_env import MazeEnv
 from mujoco_maze.ant import AntEnv
+from mujoco_maze.maze_env import MazeEnv
 
 
 class AntMazeEnv(MazeEnv):
diff --git a/mujoco_maze/maze_env.py b/mujoco_maze/maze_env.py
index df3a00f..ad9497b 100644
--- a/mujoco_maze/maze_env.py
+++ b/mujoco_maze/maze_env.py
@@ -16,17 +16,16 @@
 """Adapted from rllab maze_env.py."""
 
 import itertools as it
-import numpy as np
-import gym
 import os
 import tempfile
 import xml.etree.ElementTree as ET
+from typing import Tuple, Type
 
-from typing import Type
+import gym
+import numpy as np
 
+from mujoco_maze import maze_env_utils, maze_task
 from mujoco_maze.agent_model import AgentModel
-from mujoco_maze import maze_env_utils
-from mujoco_maze import maze_task
 
 # Directory that contains mujoco xml files.
 MODEL_DIR = os.path.dirname(os.path.abspath(__file__)) + "/assets"
@@ -34,9 +33,7 @@ MODEL_DIR = os.path.dirname(os.path.abspath(__file__)) + "/assets"
 
 class MazeEnv(gym.Env):
     MODEL_CLASS: Type[AgentModel] = AgentModel
-
     MANUAL_COLLISION: bool = False
-    BLOCK_EPS: float = 0.0001
 
     def __init__(
         self,
@@ -116,7 +113,7 @@ class MazeEnv(gym.Env):
                 x = j * size_scaling - torso_x
                 y = i * size_scaling - torso_y
                 h = height / 2 * size_scaling
-                size = 0.5 * size_scaling + self.BLOCK_EPS
+                size = 0.5 * size_scaling
                 ET.SubElement(
                     worldbody,
                     "geom",
@@ -135,7 +132,7 @@ class MazeEnv(gym.Env):
                 x = j * size_scaling - torso_x
                 y = i * size_scaling - torso_y
                 h = height / 2 * size_scaling
-                size = 0.5 * size_scaling + self.BLOCK_EPS
+                size = 0.5 * size_scaling
                 ET.SubElement(
                     worldbody,
                     "geom",
@@ -165,7 +162,7 @@ class MazeEnv(gym.Env):
                 )
                 y = i * size_scaling - torso_y
                 h = height / 2 * size_scaling * height_shrink
-                size = 0.5 * size_scaling * shrink + self.BLOCK_EPS
+                size = 0.5 * size_scaling * shrink
                 movable_body = ET.SubElement(
                     worldbody,
                     "body",
@@ -264,10 +261,38 @@ class MazeEnv(gym.Env):
         tree.write(file_path)
         self.world_tree = tree
         self.wrapped_env = self.MODEL_CLASS(*args, file_path=file_path, **kwargs)
+        self.observation_space = self._get_obs_space()
 
-    def get_ori(self):
+    def get_ori(self) -> float:
         return self.wrapped_env.get_ori()
 
+    def _get_obs_space(self) -> gym.spaces.Box:
+        shape = self._get_obs().shape
+        high = np.inf * np.ones(shape)
+        low = -high
+        # Set velocity limits
+        wrapped_obs_space = self.wrapped_env.observation_space
+        high[: wrapped_obs_space.shape[0]] = wrapped_obs_space.high
+        low[: wrapped_obs_space.shape[0]] = wrapped_obs_space.low
+        # Set coordinate limits
+        low[0], high[0], low[1], high[1] = self._xy_limits()
+        # Set orientation limits
+        return gym.spaces.Box(low, high)
+
+    def _xy_limits(self) -> Tuple[float, float, float, float]:
+        xmin, ymin, xmax, ymax = 100, 100, -100, -100
+        structure = self._maze_structure
+        for i, j in it.product(range(len(structure)), range(len(structure[0]))):
+            if structure[i][j].is_block():
+                continue
+            xmin, xmax = min(xmin, j), max(xmax, j)
+            ymin, ymax = min(ymin, i), max(ymax, i)
+        x0, y0 = self._init_torso_x, self._init_torso_y
+        scaling = self._maze_size_scaling
+        xmin, xmax = (xmin - 0.5) * scaling - x0, (xmax + 0.5) * scaling - x0
+        ymin, ymax = (ymin - 0.5) * scaling - y0, (ymax + 0.5) * scaling - y0
+        return xmin, xmax, ymin, ymax
+
     def get_top_down_view(self):
         self._view = np.zeros_like(self._view)
 
@@ -492,13 +517,6 @@ class MazeEnv(gym.Env):
     def render(self, *args, **kwargs):
         return self.wrapped_env.render(*args, **kwargs)
 
-    @property
-    def observation_space(self):
-        shape = self._get_obs().shape
-        high = np.inf * np.ones(shape)
-        low = -high
-        return gym.spaces.Box(low, high)
-
     @property
     def action_space(self):
         return self.wrapped_env.action_space
@@ -531,6 +549,7 @@ class MazeEnv(gym.Env):
         else:
             inner_next_obs, inner_reward, _, info = self.wrapped_env.step(action)
         next_obs = self._get_obs()
+        inner_reward = self._task.scale_inner_reward(inner_reward)
         outer_reward = self._task.reward(next_obs)
         done = self._task.termination(next_obs)
         return next_obs, inner_reward + outer_reward, done, info
diff --git a/mujoco_maze/maze_env_utils.py b/mujoco_maze/maze_env_utils.py
index 9e11511..d3c45a7 100644
--- a/mujoco_maze/maze_env_utils.py
+++ b/mujoco_maze/maze_env_utils.py
@@ -14,9 +14,10 @@
 # ==============================================================================
 """Adapted from rllab maze_env_utils.py."""
 
-from enum import Enum
 import itertools as it
 import math
+from enum import Enum
+
 import numpy as np
 
 
@@ -112,7 +113,7 @@ class Collision:
             max_x = x_base + size_scaling * offset(pos, 3)
             self.objects.append((min_y, max_y, min_x, max_x))
 
-    def is_in(self, old_pos, new_pos) -> bool:
+    def is_in(self, old_pos: np.ndarray, new_pos: np.ndarray) -> bool:
         # Heuristics to prevent the agent from going through the wall
         for x, y in ((old_pos + new_pos) / 2, new_pos):
             for min_y, max_y, min_x, max_x in self.objects:
diff --git a/mujoco_maze/maze_task.py b/mujoco_maze/maze_task.py
index cd1dc22..b7e4326 100644
--- a/mujoco_maze/maze_task.py
+++ b/mujoco_maze/maze_task.py
@@ -9,6 +9,7 @@ Rgb = Tuple[float, float, float]
 
 RED = (0.7, 0.1, 0.1)
 GREEN = (0.1, 0.7, 0.1)
+BLUE = (0.1, 0.1, 0.7)
 
 
 class MazeGoal:
@@ -50,6 +51,9 @@ class MazeTask(ABC):
                 return True
         return False
 
+    def scale_inner_reward(self, inner_reward: float) -> float:
+        return inner_reward
+
     @abstractmethod
     def reward(self, obs: np.ndarray) -> float:
         pass
diff --git a/mujoco_maze/point.py b/mujoco_maze/point.py
index 96cf53e..8721ca4 100644
--- a/mujoco_maze/point.py
+++ b/mujoco_maze/point.py
@@ -16,54 +16,53 @@
 """Wrapper for creating the ant environment in gym_mujoco."""
 
 import math
+from typing import Optional, Tuple
+
+import gym
 import numpy as np
 
 from mujoco_maze.agent_model import AgentModel
 
 
 class PointEnv(AgentModel):
+    VELOCITY_LIMITS: float = 100.0
     FILE = "point.xml"
     ORI_IND = 2
 
-    def __init__(self, file_path=None, expose_all_qpos=True):
-        self._expose_all_qpos = expose_all_qpos
+    def __init__(self, file_path: Optional[str] = None):
         super().__init__(file_path, 1)
+        high = np.inf * np.ones(6)
+        high[3:] = self.VELOCITY_LIMITS
+        high[self.ORI_IND] = np.pi
+        low = -high
+        self.observation_space = gym.spaces.Box(low, high)
 
-    def _step(self, a):
-        return self.step(a)
-
-    def step(self, action):
+    def step(self, action: np.ndarray) -> Tuple[np.ndarray, float, bool, dict]:
         qpos = np.copy(self.sim.data.qpos)
         qpos[2] += action[1]
+        # Wrap the orientation into [-pi, pi]
+        if qpos[2] < -np.pi:
+            qpos[2] += np.pi * 2
+        elif np.pi < qpos[2]:
+            qpos[2] -= np.pi * 2
         ori = qpos[2]
-        # compute increment in each direction
-        dx = math.cos(ori) * action[0]
-        dy = math.sin(ori) * action[0]
-        # ensure that the robot is within reasonable range
-        qpos[0] = np.clip(qpos[0] + dx, -100, 100)
-        qpos[1] = np.clip(qpos[1] + dy, -100, 100)
-        qvel = self.sim.data.qvel
+        # Compute increment in each direction
+        qpos[0] += math.cos(ori) * action[0]
+        qpos[1] += math.sin(ori) * action[0]
+        qvel = np.clip(self.sim.data.qvel, -self.VELOCITY_LIMITS, self.VELOCITY_LIMITS)
         self.set_state(qpos, qvel)
         for _ in range(0, self.frame_skip):
             self.sim.step()
         next_obs = self._get_obs()
-        reward = 0
-        done = False
-        info = {}
-        return next_obs, reward, done, info
+        return next_obs, 0.0, False, {}
 
     def _get_obs(self):
-        if self._expose_all_qpos:
-            return np.concatenate(
-                [
-                    self.sim.data.qpos.flat[:3],  # Only point-relevant coords.
-                    self.sim.data.qvel.flat[:3],
-                ]
-            )
-        else:
-            return np.concatenate(
-                [self.sim.data.qpos.flat[2:3], self.sim.data.qvel.flat[:3]]
-            )
+        return np.concatenate(
+            [
+                self.sim.data.qpos.flat[:3],  # Only point-relevant coords.
+                self.sim.data.qvel.flat[:3],
+            ]
+        )
 
     def reset_model(self):
         qpos = self.init_qpos + self.np_random.uniform(
@@ -86,7 +85,7 @@ class PointEnv(AgentModel):
         qpos[1] = xy[1]
 
         qvel = self.sim.data.qvel
-        self.set_state_without_forward(qpos, qvel)
+        self.set_state(qpos, qvel)
 
     def get_ori(self):
         return self.sim.data.qpos[self.ORI_IND]
diff --git a/tests/test_envs.py b/tests/test_envs.py
index 9a5400f..d9a1df8 100644
--- a/tests/test_envs.py
+++ b/tests/test_envs.py
@@ -1,7 +1,8 @@
 import gym
-import mujoco_maze
 import pytest
+
+import mujoco_maze
 
 
 @pytest.mark.parametrize("maze_id", mujoco_maze.MAZE_IDS)
 def test_ant_maze(maze_id):
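
For reference, the coordinate limits that the new _get_obs_space places on the first two observation entries come from MazeEnv._xy_limits: the bounding box of all non-block cells, grown by half a cell on every side and shifted into the frame centered on the agent's initial torso position. The sketch below reproduces that computation standalone; the 5x5 structure, the scaling of 4.0, and the torso offsets are hypothetical example values, and block cells are marked with 1 instead of the maze-cell objects the package uses.

# Standalone sketch of the coordinate-limit computation in MazeEnv._xy_limits:
# take the bounding box of all non-block cells, grow it by half a cell, and
# shift it so that the agent's initial torso position is the origin.
import itertools as it

def xy_limits(structure, scaling, torso_x, torso_y):
    xmin, ymin, xmax, ymax = 100, 100, -100, -100
    for i, j in it.product(range(len(structure)), range(len(structure[0]))):
        if structure[i][j] == 1:  # 1 marks a block (wall) cell in this sketch
            continue
        xmin, xmax = min(xmin, j), max(xmax, j)
        ymin, ymax = min(ymin, i), max(ymax, i)
    xmin, xmax = (xmin - 0.5) * scaling - torso_x, (xmax + 0.5) * scaling - torso_x
    ymin, ymax = (ymin - 0.5) * scaling - torso_y, (ymax + 0.5) * scaling - torso_y
    return xmin, xmax, ymin, ymax

# Hypothetical 5x5 maze: walls around the border plus one inner block,
# scaling 4.0, agent starting at cell (1, 1).
structure = [
    [1, 1, 1, 1, 1],
    [1, 0, 0, 0, 1],
    [1, 0, 1, 0, 1],
    [1, 0, 0, 0, 1],
    [1, 1, 1, 1, 1],
]
print(xy_limits(structure, 4.0, 4.0, 4.0))  # (-2.0, 10.0, -2.0, 10.0)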