mujoco_maze/mujoco_maze/maze_task.py

438 lines
12 KiB
Python
Raw Normal View History

"""Maze tasks that are defined by their map, termination condition, and goals.
"""
2020-06-16 06:47:40 +02:00
from abc import ABC, abstractmethod
2020-09-26 11:37:20 +02:00
from typing import Dict, List, NamedTuple, Optional, Tuple, Type
2020-06-16 06:47:40 +02:00
import numpy as np
from mujoco_maze.maze_env_utils import MazeCell
2020-06-22 18:13:05 +02:00
2020-06-30 09:33:07 +02:00
class Rgb(NamedTuple):
red: float
green: float
blue: float
2020-09-29 10:52:46 +02:00
def rgba_str(self) -> str:
return f"{self.red} {self.green} {self.blue} 1"
2020-06-30 09:33:07 +02:00
RED = Rgb(0.7, 0.1, 0.1)
GREEN = Rgb(0.1, 0.7, 0.1)
BLUE = Rgb(0.1, 0.1, 0.7)
2020-06-22 18:13:05 +02:00
2020-06-16 06:47:40 +02:00
class MazeGoal:
2020-06-22 18:13:05 +02:00
def __init__(
2020-09-27 06:33:14 +02:00
self,
pos: np.ndarray,
reward_scale: float = 1.0,
rgb: Rgb = RED,
threshold: float = 0.6,
custom_size: Optional[float] = None,
2020-06-22 18:13:05 +02:00
) -> None:
assert 0.0 <= reward_scale <= 1.0
self.pos = pos
self.dim = pos.shape[0]
2020-06-16 06:47:40 +02:00
self.reward_scale = reward_scale
2020-06-22 18:13:05 +02:00
self.rgb = rgb
2020-09-27 06:33:14 +02:00
self.threshold = threshold
self.custom_size = custom_size
2020-06-22 18:13:05 +02:00
2020-06-16 06:47:40 +02:00
def neighbor(self, obs: np.ndarray) -> float:
2020-09-27 06:33:14 +02:00
return np.linalg.norm(obs[: self.dim] - self.pos) <= self.threshold
2020-06-16 06:47:40 +02:00
def euc_dist(self, obs: np.ndarray) -> float:
2020-06-22 18:13:05 +02:00
return np.sum(np.square(obs[: self.dim] - self.pos)) ** 0.5
2020-06-16 06:47:40 +02:00
class Scaling(NamedTuple):
ant: float
point: float
2020-09-24 16:40:33 +02:00
swimmer: float
2020-06-16 06:47:40 +02:00
class MazeTask(ABC):
REWARD_THRESHOLD: float
2020-09-26 11:37:20 +02:00
PENALTY: Optional[float] = None
2020-09-24 16:40:33 +02:00
MAZE_SIZE_SCALING: Scaling = Scaling(8.0, 4.0, 4.0)
INNER_REWARD_SCALING: float = 0.01
# For Fall/Push/BlockMaze
OBSERVE_BLOCKS: bool = False
# For Billiard
2020-09-27 06:33:14 +02:00
OBSERVE_BALLS: bool = False
OBJECT_BALL_SIZE: float = 1.0
# Unused now
PUT_SPIN_NEAR_AGENT: bool = False
TOP_DOWN_VIEW: bool = False
2020-06-16 06:47:40 +02:00
2020-06-22 18:13:05 +02:00
def __init__(self, scale: float) -> None:
2020-06-16 06:47:40 +02:00
self.goals = []
self.scale = scale
2020-06-16 06:47:40 +02:00
2020-06-22 18:13:05 +02:00
def sample_goals(self) -> bool:
return False
2020-06-16 06:47:40 +02:00
2020-06-22 18:13:05 +02:00
def termination(self, obs: np.ndarray) -> bool:
for goal in self.goals:
if goal.neighbor(obs):
return True
return False
2020-06-16 06:47:40 +02:00
@abstractmethod
2020-06-22 18:13:05 +02:00
def reward(self, obs: np.ndarray) -> float:
2020-06-16 06:47:40 +02:00
pass
@staticmethod
@abstractmethod
def create_maze() -> List[List[MazeCell]]:
pass
class DistRewardMixIn:
REWARD_THRESHOLD: float = -1000.0
goals: List[MazeGoal]
scale: float
def reward(self, obs: np.ndarray) -> float:
return -self.goals[0].euc_dist(obs) / self.scale
2020-09-26 11:37:20 +02:00
class GoalRewardUMaze(MazeTask):
2020-09-24 16:40:33 +02:00
REWARD_THRESHOLD: float = 0.9
2020-09-26 11:37:20 +02:00
PENALTY: float = -0.0001
2020-09-24 16:40:33 +02:00
def __init__(self, scale: float) -> None:
super().__init__(scale)
2020-09-26 11:37:20 +02:00
self.goals = [MazeGoal(np.array([0.0, 2.0 * scale]))]
2020-09-24 16:40:33 +02:00
def reward(self, obs: np.ndarray) -> float:
2020-09-26 11:37:20 +02:00
return 1.0 if self.termination(obs) else self.PENALTY
2020-09-24 16:40:33 +02:00
@staticmethod
def create_maze() -> List[List[MazeCell]]:
E, B, R = MazeCell.EMPTY, MazeCell.BLOCK, MazeCell.ROBOT
return [
[B, B, B, B, B],
[B, R, E, E, B],
2020-09-26 11:37:20 +02:00
[B, B, B, E, B],
[B, E, E, E, B],
2020-09-24 16:40:33 +02:00
[B, B, B, B, B],
]
2020-09-26 11:37:20 +02:00
class DistRewardUMaze(GoalRewardUMaze, DistRewardMixIn):
2020-09-24 16:40:33 +02:00
pass
2020-09-26 11:37:20 +02:00
class GoalRewardSimpleRoom(GoalRewardUMaze):
2020-06-22 18:13:05 +02:00
def __init__(self, scale: float) -> None:
super().__init__(scale)
2020-09-26 11:37:20 +02:00
self.goals = [MazeGoal(np.array([2.0 * scale, 0.0]))]
2020-06-16 06:47:40 +02:00
@staticmethod
def create_maze() -> List[List[MazeCell]]:
E, B, R = MazeCell.EMPTY, MazeCell.BLOCK, MazeCell.ROBOT
return [
[B, B, B, B, B],
[B, R, E, E, B],
[B, B, B, B, B],
]
2020-09-26 11:37:20 +02:00
class DistRewardSimpleRoom(GoalRewardSimpleRoom, DistRewardMixIn):
pass
2020-06-16 06:47:40 +02:00
class GoalRewardPush(GoalRewardUMaze):
OBSERVE_BLOCKS: bool = True
2020-09-21 06:27:41 +02:00
2020-06-22 18:13:05 +02:00
def __init__(self, scale: float) -> None:
super().__init__(scale)
self.goals = [MazeGoal(np.array([0.0, 2.375 * scale]))]
2020-06-16 06:47:40 +02:00
@staticmethod
def create_maze() -> List[List[MazeCell]]:
2020-09-27 06:33:14 +02:00
E, B, R, M = MazeCell.EMPTY, MazeCell.BLOCK, MazeCell.ROBOT, MazeCell.XY_BLOCK
2020-06-16 06:47:40 +02:00
return [
[B, B, B, B, B],
[B, E, R, B, B],
2020-09-27 06:33:14 +02:00
[B, E, M, E, B],
2020-06-16 06:47:40 +02:00
[B, B, E, B, B],
[B, B, B, B, B],
]
class DistRewardPush(GoalRewardPush, DistRewardMixIn):
pass
2020-06-16 06:47:40 +02:00
class GoalRewardFall(GoalRewardUMaze):
OBSERVE_BLOCKS: bool = True
2020-09-21 06:27:41 +02:00
2020-06-22 18:13:05 +02:00
def __init__(self, scale: float) -> None:
super().__init__(scale)
self.goals = [MazeGoal(np.array([0.0, 3.375 * scale, 4.5]))]
2020-06-16 06:47:40 +02:00
@staticmethod
def create_maze() -> List[List[MazeCell]]:
E, B, C, R = MazeCell.EMPTY, MazeCell.BLOCK, MazeCell.CHASM, MazeCell.ROBOT
2020-09-27 06:33:14 +02:00
M = MazeCell.YZ_BLOCK
2020-06-16 06:47:40 +02:00
return [
[B, B, B, B],
[B, R, E, B],
2020-09-27 06:33:14 +02:00
[B, E, M, B],
2020-06-16 06:47:40 +02:00
[B, C, C, B],
[B, E, E, B],
[B, B, B, B],
]
class DistRewardFall(GoalRewardFall, DistRewardMixIn):
pass
2020-06-16 06:47:40 +02:00
class GoalReward2Rooms(MazeTask):
2020-06-29 18:38:02 +02:00
REWARD_THRESHOLD: float = 0.9
2020-09-26 11:37:20 +02:00
PENALTY: float = -0.0001
2020-09-24 16:40:33 +02:00
MAZE_SIZE_SCALING: Scaling = Scaling(4.0, 4.0, 4.0)
2020-06-29 18:38:02 +02:00
def __init__(self, scale: float) -> None:
super().__init__(scale)
self.goals = [MazeGoal(np.array([0.0, 4.0 * scale]))]
def reward(self, obs: np.ndarray) -> float:
2020-06-30 06:17:11 +02:00
for goal in self.goals:
if goal.neighbor(obs):
return goal.reward_scale
2020-09-26 11:37:20 +02:00
return self.PENALTY
2020-06-29 18:38:02 +02:00
@staticmethod
def create_maze() -> List[List[MazeCell]]:
E, B, R = MazeCell.EMPTY, MazeCell.BLOCK, MazeCell.ROBOT
return [
[B, B, B, B, B, B, B, B],
[B, R, E, E, E, E, E, B],
[B, E, E, E, E, E, E, B],
[B, B, B, B, B, E, B, B],
[B, E, E, E, E, E, E, B],
[B, E, E, E, E, E, E, B],
[B, B, B, B, B, B, B, B],
]
class DistReward2Rooms(GoalReward2Rooms, DistRewardMixIn):
pass
2020-06-29 18:38:02 +02:00
class SubGoal2Rooms(GoalReward2Rooms):
2020-06-29 18:38:02 +02:00
def __init__(self, scale: float) -> None:
super().__init__(scale)
self.goals.append(MazeGoal(np.array([5.0 * scale, 0.0 * scale]), 0.5, GREEN))
class GoalReward4Rooms(MazeTask):
2020-06-22 18:13:05 +02:00
REWARD_THRESHOLD: float = 0.9
2020-09-26 11:37:20 +02:00
PENALTY: float = -0.0001
2020-09-24 16:40:33 +02:00
MAZE_SIZE_SCALING: Scaling = Scaling(4.0, 4.0, 4.0)
2020-06-22 18:13:05 +02:00
def __init__(self, scale: float) -> None:
super().__init__(scale)
2020-09-07 10:44:57 +02:00
self.goals = [MazeGoal(np.array([6.0 * scale, -6.0 * scale]))]
2020-06-22 18:13:05 +02:00
def reward(self, obs: np.ndarray) -> float:
for goal in self.goals:
if goal.neighbor(obs):
return goal.reward_scale
2020-09-26 11:37:20 +02:00
return self.PENALTY
2020-06-22 18:13:05 +02:00
@staticmethod
def create_maze() -> List[List[MazeCell]]:
E, B, R = MazeCell.EMPTY, MazeCell.BLOCK, MazeCell.ROBOT
return [
[B, B, B, B, B, B, B, B, B],
2020-09-07 10:44:57 +02:00
[B, E, E, E, B, E, E, E, B],
2020-06-22 18:13:05 +02:00
[B, E, E, E, E, E, E, E, B],
[B, E, E, E, B, E, E, E, B],
[B, B, E, B, B, B, E, B, B],
[B, E, E, E, B, E, E, E, B],
[B, E, E, E, E, E, E, E, B],
2020-09-07 10:44:57 +02:00
[B, R, E, E, B, E, E, E, B],
2020-06-22 18:13:05 +02:00
[B, B, B, B, B, B, B, B, B],
]
class DistReward4Rooms(GoalReward4Rooms, DistRewardMixIn):
pass
2020-06-29 18:38:02 +02:00
class SubGoal4Rooms(GoalReward4Rooms):
2020-06-22 18:13:05 +02:00
def __init__(self, scale: float) -> None:
super().__init__(scale)
2020-06-29 18:38:02 +02:00
self.goals += [
2020-09-07 10:44:57 +02:00
MazeGoal(np.array([0.0 * scale, -6.0 * scale]), 0.5, GREEN),
2020-06-22 18:13:05 +02:00
MazeGoal(np.array([6.0 * scale, 0.0 * scale]), 0.5, GREEN),
]
2020-09-16 18:27:38 +02:00
class GoalRewardTRoom(MazeTask):
REWARD_THRESHOLD: float = 0.9
2020-09-26 11:37:20 +02:00
PENALTY: float = -0.0001
2020-09-24 16:40:33 +02:00
MAZE_SIZE_SCALING: Scaling = Scaling(4.0, 4.0, 4.0)
2020-09-16 18:27:38 +02:00
2020-09-27 06:33:14 +02:00
def __init__(self, scale: float, goal: Tuple[float, float] = (2.0, -3.0)) -> None:
2020-09-16 18:27:38 +02:00
super().__init__(scale)
2020-09-27 06:33:14 +02:00
self.goals = [MazeGoal(np.array(goal) * scale)]
2020-09-16 18:27:38 +02:00
def reward(self, obs: np.ndarray) -> float:
for goal in self.goals:
if goal.neighbor(obs):
return goal.reward_scale
2020-09-26 11:37:20 +02:00
return self.PENALTY
2020-09-16 18:27:38 +02:00
@staticmethod
def create_maze() -> List[List[MazeCell]]:
E, B, R = MazeCell.EMPTY, MazeCell.BLOCK, MazeCell.ROBOT
return [
[B, B, B, B, B, B, B],
[B, E, E, B, E, E, B],
[B, E, E, B, E, E, B],
[B, E, B, B, B, E, B],
[B, E, E, R, E, E, B],
[B, B, B, B, B, B, B],
]
class DistRewardTRoom(GoalRewardTRoom, DistRewardMixIn):
pass
2020-09-26 11:37:20 +02:00
class GoalRewardBlockMaze(GoalRewardUMaze):
OBSERVE_BLOCKS: bool = True
def __init__(self, scale: float) -> None:
super().__init__(scale)
self.goals = [MazeGoal(np.array([0.0, 3.0 * scale]))]
@staticmethod
def create_maze() -> List[List[MazeCell]]:
2020-09-27 06:33:14 +02:00
E, B, R = MazeCell.EMPTY, MazeCell.BLOCK, MazeCell.ROBOT
M = MazeCell.XY_BLOCK
2020-09-26 11:37:20 +02:00
return [
[B, B, B, B, B],
[B, R, E, E, B],
[B, B, B, M, B],
[B, E, E, E, B],
[B, E, E, E, B],
[B, B, B, B, B],
]
class DistRewardBlockMaze(GoalRewardBlockMaze, DistRewardMixIn):
pass
class GoalRewardBilliard(MazeTask):
2020-09-27 06:33:14 +02:00
REWARD_THRESHOLD: float = 0.9
PENALTY: float = -0.0001
MAZE_SIZE_SCALING: Scaling = Scaling(4.0, 3.0, 3.0)
OBSERVE_BALLS: bool = True
2020-09-29 10:52:46 +02:00
GOAL_SIZE: float = 0.3
2020-09-27 06:33:14 +02:00
def __init__(self, scale: float, goal: Tuple[float, float] = (2.0, -3.0)) -> None:
2020-09-27 06:33:14 +02:00
super().__init__(scale)
goal = np.array(goal) * scale
2020-09-29 10:52:46 +02:00
self.goals.append(
MazeGoal(goal, threshold=self._threshold(), custom_size=self.GOAL_SIZE)
)
def _threshold(self) -> float:
return self.OBJECT_BALL_SIZE + self.GOAL_SIZE
2020-09-27 06:33:14 +02:00
def reward(self, obs: np.ndarray) -> float:
2020-09-29 10:52:46 +02:00
object_pos = obs[3:6]
for goal in self.goals:
if goal.neighbor(object_pos):
return goal.reward_scale
return self.PENALTY
2020-09-27 06:33:14 +02:00
def termination(self, obs: np.ndarray) -> bool:
2020-09-29 10:52:46 +02:00
object_pos = obs[3:6]
for goal in self.goals:
if goal.neighbor(object_pos):
return True
return False
2020-09-27 06:33:14 +02:00
@staticmethod
def create_maze() -> List[List[MazeCell]]:
E, B = MazeCell.EMPTY, MazeCell.BLOCK
R, M = MazeCell.ROBOT, MazeCell.OBJECT_BALL
return [
[B, B, B, B, B, B, B],
[B, E, E, E, E, E, B],
[B, E, E, E, E, E, B],
[B, E, E, M, E, E, B],
[B, E, E, R, E, E, B],
[B, B, B, B, B, B, B],
2020-09-27 06:33:14 +02:00
]
class DistRewardBilliard(GoalRewardBilliard):
2020-09-27 06:33:14 +02:00
def reward(self, obs: np.ndarray) -> float:
return -self.goals[0].euc_dist(obs[3:6]) / self.scale
2020-09-29 10:52:46 +02:00
class SubGoalBilliard(GoalRewardBilliard):
def __init__(
self,
scale: float,
primary_goal: Tuple[float, float] = (2.0, -3.0),
subgoal: Tuple[float, float] = (-2.0, -3.0),
) -> None:
super().__init__(scale, primary_goal)
self.goals.append(
MazeGoal(
np.array(subgoal) * scale,
reward_scale=0.5,
rgb=GREEN,
threshold=self._threshold(),
custom_size=self.GOAL_SIZE,
)
)
@staticmethod
def create_maze() -> List[List[MazeCell]]:
E, B = MazeCell.EMPTY, MazeCell.BLOCK
R, M = MazeCell.ROBOT, MazeCell.OBJECT_BALL
return [
[B, B, B, B, B, B, B],
[B, E, E, E, E, E, B],
[B, E, E, E, B, B, B],
[B, E, E, M, E, E, B],
[B, E, E, R, E, E, B],
[B, B, B, B, B, B, B],
]
2020-06-16 06:47:40 +02:00
class TaskRegistry:
REGISTRY: Dict[str, List[Type[MazeTask]]] = {
2020-09-24 16:40:33 +02:00
"SimpleRoom": [DistRewardSimpleRoom, GoalRewardSimpleRoom],
"UMaze": [DistRewardUMaze, GoalRewardUMaze],
"Push": [DistRewardPush, GoalRewardPush],
"Fall": [DistRewardFall, GoalRewardFall],
"2Rooms": [DistReward2Rooms, GoalReward2Rooms, SubGoal2Rooms],
"4Rooms": [DistReward4Rooms, GoalReward4Rooms, SubGoal4Rooms],
2020-09-16 18:27:38 +02:00
"TRoom": [DistRewardTRoom, GoalRewardTRoom],
2020-09-26 11:37:20 +02:00
"BlockMaze": [DistRewardBlockMaze, GoalRewardBlockMaze],
2020-09-29 10:52:46 +02:00
"Billiard": [DistRewardBilliard, GoalRewardBilliard, SubGoalBilliard],
2020-06-16 06:47:40 +02:00
}
2020-06-29 18:38:02 +02:00
@staticmethod
def keys() -> List[str]:
return list(TaskRegistry.REGISTRY.keys())
@staticmethod
def tasks(key: str) -> List[Type[MazeTask]]:
return TaskRegistry.REGISTRY[key]