Add INNER_REWARD_SCALING and More flexible reward for ant

kngwyu 2020-07-01 12:42:40 +09:00
parent d5cc345080
commit 91249105b8
5 changed files with 63 additions and 37 deletions

View File

@@ -5,11 +5,14 @@ from mujoco_maze.maze_task import TaskRegistry
 for maze_id in TaskRegistry.keys():
     for i, task_cls in enumerate(TaskRegistry.tasks(maze_id)):
-        scaling = task_cls.SCALING.ant
         gym.envs.register(
             id=f"Ant{maze_id}-v{i}",
             entry_point="mujoco_maze.ant_maze_env:AntMazeEnv",
-            kwargs=dict(maze_task=task_cls, maze_size_scaling=scaling),
+            kwargs=dict(
+                maze_task=task_cls,
+                maze_size_scaling=task_cls.MAZE_SIZE_SCALING.ant,
+                inner_reward_scaling=task_cls.INNER_REWARD_SCALING,
+            ),
             max_episode_steps=1000,
             reward_threshold=task_cls.REWARD_THRESHOLD,
         )
@@ -19,7 +22,11 @@ for maze_id in TaskRegistry.keys():
         gym.envs.register(
             id=f"Point{maze_id}-v{i}",
             entry_point="mujoco_maze.point_maze_env:PointMazeEnv",
-            kwargs=dict(maze_task=task_cls, maze_size_scaling=scaling),
+            kwargs=dict(
+                maze_task=task_cls,
+                maze_size_scaling=task_cls.MAZE_SIZE_SCALING.point,
+                inner_reward_scaling=task_cls.INNER_REWARD_SCALING,
+            ),
             max_episode_steps=1000,
             reward_threshold=task_cls.REWARD_THRESHOLD,
         )
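After this change every registered ID carries its task's own scaling constants. A minimal usage sketch (assuming gym and mujoco_maze are installed; "UMaze" is one of the registered maze_ids and v0 is its first task):

    import gym
    import mujoco_maze  # noqa: F401  (importing runs the registrations above)

    env = gym.make("AntUMaze-v0")
    obs = env.reset()
    obs, reward, done, info = env.step(env.action_space.sample())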

View File

@@ -16,13 +16,24 @@
 """Wrapper for creating the ant environment in gym_mujoco."""
 import math
-from typing import Optional, Tuple
+from typing import Callable, Optional, Tuple

 import numpy as np

 from mujoco_maze.agent_model import AgentModel

+ForwardRewardFn = Callable[[float, float], float]
+
+
+def forward_reward_vabs(xy_velocity: float) -> float:
+    return np.sum(np.abs(xy_velocity))
+
+
+def forward_reward_vnorm(xy_velocity: float) -> float:
+    return np.linalg.norm(xy_velocity)
+
+
 def q_inv(a):
     return [a[0], -a[1], -a[2], -a[3]]
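These two helpers are the selectable reward shapes for the ant: forward_reward_vabs sums the absolute xy velocity components, while forward_reward_vnorm takes their Euclidean norm. A quick numeric sketch (values purely illustrative):

    import numpy as np

    xy_velocity = np.array([3.0, 4.0])
    np.sum(np.abs(xy_velocity))   # 7.0 -- what forward_reward_vabs returns
    np.linalg.norm(xy_velocity)   # 5.0 -- what forward_reward_vnorm returns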
@@ -36,31 +47,37 @@ def q_mult(a, b): # multiply two quaternion
 class AntEnv(AgentModel):
-    FILE = "ant.xml"
-    ORI_IND = 3
+    FILE: str = "ant.xml"
+    ORI_IND: int = 3

-    def __init__(self, file_path: Optional[str] = None) -> None:
+    def __init__(
+        self,
+        file_path: Optional[str] = None,
+        ctrl_cost_weight: float = 0.5,
+        forward_reward_fn: ForwardRewardFn = forward_reward_vnorm,
+    ) -> None:
+        self._ctrl_cost_weight = ctrl_cost_weight
+        self._forward_reward_fn = forward_reward_fn
         super().__init__(file_path, 5)

+    def _forward_reward(self, xy_pos_before: np.ndarray) -> float:
+        xy_pos_after = self.sim.data.qpos[:2].copy()
+        xy_velocity = (xy_pos_after - xy_pos_before) / self.dt
+        return self._forward_reward_fn(xy_velocity)
+
     def step(self, action: np.ndarray) -> Tuple[np.ndarray, float, bool, dict]:
-        xposbefore = self.get_body_com("torso")[0]
+        xy_pos_before = self.sim.data.qpos[:2].copy()
         self.do_simulation(action, self.frame_skip)
-        xposafter = self.get_body_com("torso")[0]
-        forward_reward = (xposafter - xposbefore) / self.dt
-        ctrl_cost = 0.5 * np.square(action).sum()
-        survive_reward = 1.0
-        reward = forward_reward - ctrl_cost + survive_reward
-        _ = self.state_vector()
+        forward_reward = self._forward_reward(xy_pos_before)
+        ctrl_cost = self._ctrl_cost_weight * np.square(action).sum()
         ob = self._get_obs()
         return (
             ob,
-            reward,
+            forward_reward - ctrl_cost,
             False,
-            dict(
-                reward_forward=forward_reward,
-                reward_ctrl=-ctrl_cost,
-                reward_survive=survive_reward,
-            ),
+            dict(reward_forward=forward_reward, reward_ctrl=-ctrl_cost,),
         )

     def _get_obs(self):
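With the widened constructor, callers can tune the control cost and swap the velocity-to-reward mapping without subclassing. A hedged sketch (assumes a working mujoco-py setup; the keyword names are exactly the ones added above):

    env = AntEnv(ctrl_cost_weight=0.25, forward_reward_fn=forward_reward_vabs)
    # step() now returns forward_reward - ctrl_cost; both terms are echoed in info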
@@ -84,9 +101,6 @@ class AntEnv(AgentModel):
         self.set_state(qpos, qvel)
         return self._get_obs()

-    def viewer_setup(self):
-        self.viewer.cam.distance = self.model.stat.extent * 0.5
-
     def get_ori(self):
         ori = [0, 1, 0, 0]
         ori_ind = self.ORI_IND
@@ -96,7 +110,7 @@ class AntEnv(AgentModel):
         return ori

     def set_xy(self, xy):
-        qpos = np.copy(self.sim.data.qpos)
+        qpos = self.sim.data.qpos.copy()
         qpos[0] = xy[0]
         qpos[1] = xy[1]

View File

@@ -44,6 +44,7 @@ class MazeEnv(gym.Env):
         top_down_view: float = False,
         maze_height: float = 0.5,
         maze_size_scaling: float = 4.0,
+        inner_reward_scaling: float = 1.0,
         *args,
         **kwargs,
     ) -> None:
@@ -55,6 +56,7 @@ class MazeEnv(gym.Env):
         self._maze_height = height = maze_height
         self._maze_size_scaling = size_scaling = maze_size_scaling
+        self._inner_reward_scaling = inner_reward_scaling
         self.t = 0  # time steps
         self._n_bins = n_bins
         self._sensor_range = sensor_range * size_scaling
@@ -495,9 +497,10 @@ class MazeEnv(gym.Env):
     def reset(self):
         self.t = 0
         self.wrapped_env.reset()
-        # Sample a new goal
+        # Samples a new goal
         if self._task.sample_goals():
             self.set_marker()
+        # Samples a new start position
         if len(self._init_positions) > 1:
             xy = np.random.choice(self._init_positions)
             self.wrapped_env.set_xy(xy)
@@ -547,7 +550,7 @@ class MazeEnv(gym.Env):
         else:
             inner_next_obs, inner_reward, _, info = self.wrapped_env.step(action)
         next_obs = self._get_obs()
-        inner_reward = self._task.scale_inner_reward(inner_reward)
+        inner_reward = self._inner_reward_scaling * inner_reward
         outer_reward = self._task.reward(next_obs)
         done = self._task.termination(next_obs)
         return next_obs, inner_reward + outer_reward, done, info
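For a sense of scale (illustrative numbers, not taken from the repository): with the default INNER_REWARD_SCALING of 1e-4 from maze_task.py below, an inner locomotion reward of 5.0 contributes only 5e-4 to the returned reward, so a sparse outer reward of 1.0 for reaching the goal still dominates the sum.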

View File

@@ -47,7 +47,8 @@ class Scaling(NamedTuple):
 class MazeTask(ABC):
     REWARD_THRESHOLD: float
-    SCALING: Scaling = Scaling(8.0, 4.0)
+    MAZE_SIZE_SCALING: Scaling = Scaling(8.0, 4.0)
+    INNER_REWARD_SCALING: float = 1e-4
     OBSERVE_BLOCKS: bool = False
     PUT_SPIN_NEAR_AGENT: bool = False
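Since both values are plain class attributes, downstream tasks can override them per task. A hypothetical subclass, purely for illustration (MyDenseUMaze and its numbers are not part of the repository; the Scaling field order is assumed from the (8.0, 4.0) default):

    class MyDenseUMaze(SingleGoalDenseUMaze):
        MAZE_SIZE_SCALING: Scaling = Scaling(6.0, 3.0)  # assumed order: (ant, point)
        INNER_REWARD_SCALING: float = 1e-3  # weight the locomotion reward more heavily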
@@ -64,9 +65,6 @@ class MazeTask(ABC):
                 return True
         return False

-    def scale_inner_reward(self, inner_reward: float) -> float:
-        return inner_reward
-
     @abstractmethod
     def reward(self, obs: np.ndarray) -> float:
         pass
@@ -102,6 +100,7 @@ class SingleGoalSparseUMaze(MazeTask):
 class SingleGoalDenseUMaze(SingleGoalSparseUMaze):
     REWARD_THRESHOLD: float = 1000.0

     def reward(self, obs: np.ndarray) -> float:
         return -self.goals[0].euc_dist(obs)
@@ -126,6 +125,7 @@ class SingleGoalSparsePush(SingleGoalSparseUMaze):
 class SingleGoalDensePush(SingleGoalSparsePush):
     REWARD_THRESHOLD: float = 1000.0

     def reward(self, obs: np.ndarray) -> float:
         return -self.goals[0].euc_dist(obs)
@@ -151,12 +151,14 @@ class SingleGoalSparseFall(SingleGoalSparseUMaze):
 class SingleGoalDenseFall(SingleGoalSparseFall):
     REWARD_THRESHOLD: float = 1000.0

     def reward(self, obs: np.ndarray) -> float:
         return -self.goals[0].euc_dist(obs)


 class SingleGoalSparse2Rooms(MazeTask):
     REWARD_THRESHOLD: float = 0.9
-    SCALING: Scaling = Scaling(4.0, 4.0)

     def __init__(self, scale: float) -> None:
         super().__init__(scale)
@@ -185,6 +187,7 @@ class SingleGoalSparse2Rooms(MazeTask):
 class SingleGoalDense2Rooms(SingleGoalSparse2Rooms):
     REWARD_THRESHOLD: float = 1000.0

     def reward(self, obs: np.ndarray) -> float:
         return -self.goals[0].euc_dist(obs)
@@ -197,6 +200,7 @@ class SubGoalSparse2Rooms(SingleGoalSparse2Rooms):
 class SingleGoalSparse4Rooms(MazeTask):
     REWARD_THRESHOLD: float = 0.9
+    MAZE_SIZE_SCALING: Scaling = Scaling(4.0, 4.0)

     def __init__(self, scale: float) -> None:
         super().__init__(scale)
@@ -227,6 +231,7 @@ class SingleGoalSparse4Rooms(MazeTask):
 class SingleGoalDense4Rooms(SingleGoalSparse4Rooms):
     REWARD_THRESHOLD: float = 1000.0

     def reward(self, obs: np.ndarray) -> float:
         return -self.goals[0].euc_dist(obs)
@@ -245,11 +250,7 @@ class TaskRegistry:
         "UMaze": [SingleGoalDenseUMaze, SingleGoalSparseUMaze],
         "Push": [SingleGoalDensePush, SingleGoalSparsePush],
         "Fall": [SingleGoalDenseFall, SingleGoalSparseFall],
-        "2Rooms": [
-            SingleGoalDense2Rooms,
-            SingleGoalSparse2Rooms,
-            SubGoalSparse2Rooms,
-        ],
+        "2Rooms": [SingleGoalDense2Rooms, SingleGoalSparse2Rooms, SubGoalSparse2Rooms,],
         "4Rooms": [SingleGoalSparse4Rooms, SingleGoalDense4Rooms, SubGoalSparse4Rooms],
     }
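The registry lookups used during registration are unchanged; only the literal formatting of the "2Rooms" entry differs. For example (ordering as listed above):

    TaskRegistry.tasks("2Rooms")
    # [SingleGoalDense2Rooms, SingleGoalSparse2Rooms, SubGoalSparse2Rooms]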

View File

@@ -25,9 +25,10 @@ from mujoco_maze.agent_model import AgentModel
 class PointEnv(AgentModel):
+    FILE: str = "point.xml"
+    ORI_IND: int = 2
     VELOCITY_LIMITS: float = 10.0
-    FILE = "point.xml"
-    ORI_IND = 2

     def __init__(self, file_path: Optional[str] = None):
         super().__init__(file_path, 1)