- Add NuconGoalEnv for goal-conditioned HER training (SAC + HER)
- Add UncertaintyPenalty and UncertaintyAbort composable callables; SIM_UNCERTAINTY is injected into the obs dict when a simulator is active
- Fix rl.py: str-typed params crash, missing Enum import, write-only params in action space, broken step() iteration order
- Remove uncertainty state from sim (return value from update() instead)
- Rename kNN -> kNN-GP throughout README; add model selection note

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
467 lines
18 KiB
Python
467 lines
18 KiB
Python
import inspect
|
|
import gymnasium as gym
|
|
from gymnasium import spaces
|
|
import numpy as np
|
|
import time
|
|
from typing import Dict, Any, Callable, List, Optional
|
|
from enum import Enum
|
|
from nucon import Nucon, BreakerStatus, PumpStatus, PumpDryStatus, PumpOverloadStatus
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Reward / objective helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Built-in objectives, looked up by name. Each entry maps an observation
# dict to a scalar score.
Objectives = {
    # Constant zero: contributes nothing to the reward.
    "null": lambda obs: 0,
    # Combined output of the three generators.
    "max_power": lambda obs: (
        obs["GENERATOR_0_KW"]
        + obs["GENERATOR_1_KW"]
        + obs["GENERATOR_2_KW"]
    ),
    # Value stored under EPISODE_TIME in the observation.
    "episode_time": lambda obs: obs["EPISODE_TIME"],
}
|
|
|
|
# Objective factories, looked up by name. Calling an entry with its parameter
# returns an objective ``(obs) -> score``.
Parameterized_Objectives = {
    # Negative squared distance between CORE_TEMP and the requested temperature.
    "target_temperature": lambda goal_temp: (
        lambda obs: -((obs["CORE_TEMP"] - goal_temp) ** 2)
    ),
    # Negative squared error of (CORE_TEMP - CORE_TEMP_MIN) against the requested gap.
    "target_gap": lambda goal_gap: (
        lambda obs: -((obs["CORE_TEMP"] - obs["CORE_TEMP_MIN"] - goal_gap) ** 2)
    ),
    # Quadratic penalty that is only active while CORE_TEMP is above max_temp.
    "temp_below": lambda max_temp: (
        lambda obs: -(np.clip(obs["CORE_TEMP"] - max_temp, 0, np.inf) ** 2)
    ),
    # Quadratic penalty that is only active while CORE_TEMP is below min_temp.
    "temp_above": lambda min_temp: (
        lambda obs: -(np.clip(min_temp - obs["CORE_TEMP"], 0, np.inf) ** 2)
    ),
    # Ignores the observation and always returns the given constant.
    "constant": lambda constant: (lambda obs: constant),
}
|
|
|
|
|
|
def UncertaintyPenalty(start: float = 0.3, scale: float = 1.0, mode: str = 'l2') -> Callable:
    """Build an objective that penalises high simulator uncertainty.

    The returned callable reads ``SIM_UNCERTAINTY`` from the observation dict
    (treated as ``0.0`` when the key is absent) and returns a non-positive
    penalty once that value rises above ``start``.

    The callable is a named function (``uncertainty_penalty``) rather than an
    anonymous lambda, so it can be told apart from other objectives wherever
    objectives are reported by ``__name__``.

    Args:
        start: uncertainty level at which the penalty starts (default 0.3).
        scale: penalty coefficient.
        mode: ``'l2'`` (quadratic, default) or ``'linear'``.

    Returns:
        A callable ``(obs) -> float`` usable as an objective.

    Raises:
        ValueError: if ``mode`` is neither ``'l2'`` nor ``'linear'``.

    Example::

        env = NuconEnv(
            objectives=['max_power', UncertaintyPenalty(start=0.3, scale=2.0)],
            objective_weights=[1.0, 1.0],
            simulator=simulator,
        )
    """
    # Reject bad modes at construction time, not on the first env step.
    if mode not in ('l2', 'linear'):
        raise ValueError(f"Unknown mode '{mode}'. Use 'l2' or 'linear'.")
    quadratic = mode == 'l2'

    def uncertainty_penalty(obs):
        # Only the part of the uncertainty above `start` is penalised.
        excess = max(0.0, obs.get('SIM_UNCERTAINTY', 0.0) - start)
        if quadratic:
            return -scale * excess ** 2
        return -scale * excess

    return uncertainty_penalty
|
|
|
|
|
|
def UncertaintyAbort(threshold: float = 0.7) -> Callable:
    """Build a terminator that fires when simulator uncertainty is too high.

    The returned callable reads ``SIM_UNCERTAINTY`` from the observation dict
    (treated as ``0.0`` when the key is absent) and returns ``1.0`` when that
    value is greater than or equal to ``threshold``, otherwise ``0.0``. Used
    as a *terminator* with ``terminate_above=0`` this ends the episode
    (``terminated=True``).

    The callable is a named function (``uncertainty_abort``) rather than an
    anonymous lambda so it is identifiable in logs and debugging output.

    Args:
        threshold: uncertainty level at or above which the terminator fires.

    Returns:
        A callable ``(obs) -> float`` returning ``1.0`` or ``0.0``.

    Example::

        env = NuconEnv(
            objectives=['max_power'],
            terminators=[UncertaintyAbort(threshold=0.7)],
            terminate_above=0,
            simulator=simulator,
        )
    """
    def uncertainty_abort(obs):
        if obs.get('SIM_UNCERTAINTY', 0.0) >= threshold:
            return 1.0
        return 0.0

    return uncertainty_abort
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Internal helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _build_param_space(param):
    """Return a gymnasium Box for a single NuconParameter, or None if unsupported.

    Every supported parameter is mapped to a one-element float32 ``Box``:

    * ``float`` / ``int``: bounded by ``min_val`` / ``max_val``; a bound that
      is ``None`` becomes ``-inf`` / ``+inf``.
    * ``bool``: ``[0, 1]``.
    * ``Enum`` subclasses: ``[0, len(enum) - 1]``.
      NOTE(review): this presumes member values are 0..n-1 -- confirm
      against the enum definitions.

    ``str`` parameters and any other type yield ``None``.

    Args:
        param: object exposing ``param_type``, ``min_val`` and ``max_val``.

    Returns:
        ``spaces.Box`` with ``shape=(1,)`` or ``None``.
    """
    ptype = param.param_type

    if ptype == float or ptype == int:
        # Compare against None explicitly: a bound of 0 is a real bound and
        # must not be replaced by +/-inf just because it is falsy.
        low = param.min_val if param.min_val is not None else -np.inf
        high = param.max_val if param.max_val is not None else np.inf
        return spaces.Box(low=low, high=high, shape=(1,), dtype=np.float32)

    if ptype == bool:
        return spaces.Box(low=0, high=1, shape=(1,), dtype=np.float32)

    # issubclass() raises TypeError for non-class objects, so check that first;
    # str and every other unsupported type falls through to None.
    if isinstance(ptype, type) and issubclass(ptype, Enum):
        return spaces.Box(low=0, high=len(ptype) - 1, shape=(1,), dtype=np.float32)

    return None
|
|
|
|
|
|
def _apply_action(nucon, action):
|
|
for param_id, value in action.items():
|
|
param = nucon._parameters[param_id]
|
|
if issubclass(param.param_type, Enum):
|
|
value = param.param_type(int(np.asarray(value).flat[0]))
|
|
else:
|
|
value = param.param_type(np.asarray(value).flat[0])
|
|
if param.min_val is not None and param.max_val is not None:
|
|
value = np.clip(value, param.min_val, param.max_val)
|
|
nucon.set(param, value)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# NuconEnv
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class NuconEnv(gym.Env):
    """Gymnasium environment that controls a reactor through a ``Nucon`` client.

    Observations are a dict holding ``EPISODE_TIME``, ``SIM_UNCERTAINTY``
    (only when a simulator is supplied) and every readable parameter for which
    ``_build_param_space`` returns a space. Actions are a dict keyed by every
    writable parameter that is also readable, is not a cheat parameter and has
    a supported space.

    The reward is the weighted sum of all objectives. The episode terminates
    when the summed terminator outputs exceed ``terminate_above``.

    Args:
        nucon: client to use; when ``None`` one is created (on
            ``simulator.port`` if a simulator is given).
        simulator: optional simulator advanced by ``seconds_per_step`` on
            every step. Without it, ``step`` sleeps instead.
        render_mode: stored on the instance; rendering is a no-op.
        seconds_per_step: simulated seconds per environment step.
        objectives: names from ``Objectives`` and/or callables
            ``(obs) -> float``. Defaults to ``['null']``.
        terminators: names from ``Objectives`` and/or callables
            ``(obs) -> float``. Defaults to ``['null']``.
        objective_weights: one weight per objective; defaults to all ``1.0``.
        terminate_above: threshold on the summed terminator outputs.

    Raises:
        ValueError: for an unknown objective/terminator, or when the number
            of weights differs from the number of objectives.
    """

    metadata = {'render_modes': ['human']}

    def __init__(self, nucon=None, simulator=None, render_mode=None, seconds_per_step=5,
                 objectives=None, terminators=None, objective_weights=None, terminate_above=0):
        super().__init__()

        # None stands in for the former ['null'] defaults so that no mutable
        # list is shared between instances.
        objectives = ['null'] if objectives is None else list(objectives)
        terminators = ['null'] if terminators is None else list(terminators)

        if objective_weights is None:
            objective_weights = [1.0] * len(objectives)
        else:
            objective_weights = list(objective_weights)
        if len(objective_weights) != len(objectives):
            # zip() would otherwise silently drop the surplus objectives.
            raise ValueError(
                f"Got {len(objective_weights)} objective_weights for {len(objectives)} objectives"
            )

        self.render_mode = render_mode
        self.seconds_per_step = seconds_per_step
        self.objective_weights = objective_weights
        self.terminate_above = terminate_above
        self.simulator = simulator
        # Defined up front so _get_obs() works before the first reset().
        self._total_steps = 0

        if nucon is None:
            nucon = Nucon(port=simulator.port) if simulator is not None else Nucon()
        self.nucon = nucon

        # Observation space -- SIM_UNCERTAINTY included when a simulator is present
        obs_spaces = {'EPISODE_TIME': spaces.Box(low=0, high=np.inf, shape=(1,), dtype=np.float32)}
        if simulator is not None:
            obs_spaces['SIM_UNCERTAINTY'] = spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32)
        for param_id, param in self.nucon.get_all_readable().items():
            space = _build_param_space(param)
            if space is not None:
                obs_spaces[param_id] = space
        self.observation_space = spaces.Dict(obs_spaces)

        # Action space: writable parameters that can also be read back and
        # are not flagged as cheats.
        action_spaces = {}
        for param_id, param in self.nucon.get_all_writable().items():
            if not param.is_readable or param.is_cheat:
                continue
            space = _build_param_space(param)
            if space is not None:
                action_spaces[param_id] = space
        self.action_space = spaces.Dict(action_spaces)

        self.objectives = self._resolve_callables(objectives, 'objective')
        self.terminators = self._resolve_callables(terminators, 'terminator')

    @staticmethod
    def _resolve_callables(items, kind):
        """Turn a list of ``Objectives`` names / callables into callables."""
        resolved = []
        for item in items:
            # Check for str first so unhashable callables are never used as
            # dictionary keys.
            if isinstance(item, str) and item in Objectives:
                resolved.append(Objectives[item])
            elif callable(item):
                resolved.append(item)
            else:
                raise ValueError(f"Unsupported {kind}: {item}")
        return resolved

    def _get_obs(self, sim_uncertainty=None):
        """Read the current observation dict from the reactor.

        NOTE(review): values are stored as the raw scalars returned by
        ``nucon.get`` while the observation space declares shape-(1,) float32
        boxes -- confirm that downstream wrappers convert them.
        """
        obs = {}
        for param_id, param in self.nucon.get_all_readable().items():
            if param.param_type == str or param_id not in self.observation_space.spaces:
                continue
            value = self.nucon.get(param_id)
            if isinstance(value, Enum):
                value = value.value
            obs[param_id] = value
        obs['EPISODE_TIME'] = self._total_steps * self.seconds_per_step
        if 'SIM_UNCERTAINTY' in self.observation_space.spaces:
            obs['SIM_UNCERTAINTY'] = sim_uncertainty if sim_uncertainty is not None else 0.0
        return obs

    def _get_info(self, obs):
        """Evaluate every objective and report raw and weighted scores.

        Scores are keyed by the objective's ``__name__``. When a name is
        already taken (all lambdas are called ``<lambda>``) the key gets a
        ``#<index>`` suffix, so no objective overwrites another and the
        reward computed from this dict contains every term.
        """
        info = {'objectives': {}, 'objectives_weighted': {}}
        for index, (objective, weight) in enumerate(zip(self.objectives, self.objective_weights)):
            score = objective(obs)
            name = getattr(objective, '__name__', repr(objective))
            if name in info['objectives']:
                name = f"{name}#{index}"
            info['objectives'][name] = score
            info['objectives_weighted'][name] = score * weight
        return info

    def reset(self, seed=None, options=None):
        """Reset the step counter and return ``(observation, info)``."""
        super().reset(seed=seed)
        self._total_steps = 0
        observation = self._get_obs()
        return observation, self._get_info(observation)

    def step(self, action):
        """Apply ``action``, advance time and return the gymnasium 5-tuple."""
        _apply_action(self.nucon, action)

        # Advance sim (or sleep) -- get uncertainty for obs injection
        truncated = False
        uncertainty = None
        if self.simulator is not None:
            uncertainty = self.simulator.update(self.seconds_per_step, return_uncertainty=True)
        else:
            sim_speed = self.nucon.GAME_SIM_SPEED.value or 1.0
            time.sleep(self.seconds_per_step / sim_speed)

        self._total_steps += 1
        observation = self._get_obs(sim_uncertainty=uncertainty)
        info = self._get_info(observation)
        reward = sum(info['objectives_weighted'].values())
        # bool() so callers get a plain Python bool rather than numpy.bool_.
        terminated = bool(np.sum([t(observation) for t in self.terminators]) > self.terminate_above)
        return observation, reward, terminated, truncated, info

    def render(self):
        """Rendering is not implemented."""
        pass

    def close(self):
        """Nothing to release."""
        pass

    def _flatten_observation(self, observation):
        """Concatenate all observation values into one flat array."""
        return np.concatenate([np.asarray(v).flatten() for v in observation.values()])
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# NuconGoalEnv
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class NuconGoalEnv(gym.Env):
    """
    Goal-conditioned reactor environment with the observation layout used by HER
    (Hindsight Experience Replay).

    Observation is a Dict with three keys:
      - 'observation':   all readable non-goal, non-str params + SIM_UNCERTAINTY (when sim active)
      - 'achieved_goal': current values of goal_params, normalised to [0, 1] within goal_range
      - 'desired_goal':  target values sampled each episode, normalised to [0, 1]

    ``SIM_UNCERTAINTY`` in 'observation' lets reward_fn / terminators reference uncertainty directly.

    reward_fn signature: ``(achieved, desired)`` or ``(achieved, desired, obs)`` -- the 3-arg form
    receives the 'observation' dict (including SIM_UNCERTAINTY) for uncertainty-aware shaping.
    When ``compute_reward`` is called with a batch of goals and a matching sequence of info
    dicts, a 3-arg reward_fn is evaluated once per sample with that sample's observation.

    Terminators are callables ``(obs) -> float`` or names from ``Objectives``; the episode
    terminates when any of them returns a value above ``terminate_above``.

    NOTE(review): 'observation' is a nested Dict space and the action space is a Dict as well;
    confirm whether the RL library in use accepts these directly or needs a flattening wrapper.

    Usage with SB3 HER::

        from stable_baselines3 import SAC
        from stable_baselines3.common.buffers import HerReplayBuffer
        from nucon.rl import NuconGoalEnv, UncertaintyPenalty, UncertaintyAbort

        env = NuconGoalEnv(
            goal_params=['GENERATOR_0_KW', 'GENERATOR_1_KW', 'GENERATOR_2_KW'],
            goal_range={'GENERATOR_0_KW': (0, 1200), 'GENERATOR_1_KW': (0, 1200), 'GENERATOR_2_KW': (0, 1200)},
            tolerance=0.05,
            simulator=simulator,
            # uncertainty-aware reward: penalise OOD, abort if too far out
            reward_fn=lambda ag, dg, obs: (
                -(np.linalg.norm(ag - dg, axis=-1) ** 2)
                - 2.0 * max(0, obs.get('SIM_UNCERTAINTY', 0) - 0.3) ** 2
            ),
            terminators=[UncertaintyAbort(threshold=0.7)],
        )
        model = SAC('MultiInputPolicy', env, replay_buffer_class=HerReplayBuffer)
        model.learn(total_timesteps=500_000)
    """

    metadata = {'render_modes': ['human']}

    def __init__(
        self,
        goal_params,
        goal_range=None,
        reward_fn=None,
        tolerance=None,
        nucon=None,
        simulator=None,
        render_mode=None,
        seconds_per_step=5,
        terminators=None,
        terminate_above=0,
    ):
        super().__init__()

        self.render_mode = render_mode
        self.seconds_per_step = seconds_per_step
        self.terminate_above = terminate_above
        self.simulator = simulator
        self.goal_params = list(goal_params)
        self.tolerance = tolerance

        if nucon is None:
            nucon = Nucon(port=simulator.port) if simulator is not None else Nucon()
        self.nucon = nucon

        all_readable = self.nucon.get_all_readable()
        for pid in self.goal_params:
            if pid not in all_readable:
                raise ValueError(f"Goal param '{pid}' is not a readable parameter")

        goal_range = goal_range or {}

        def _bounds(pid):
            """Explicit goal_range entry, else the parameter's own limits (0..1 fallback)."""
            if pid in goal_range:
                return goal_range[pid]
            param = all_readable[pid]
            # `is not None` so that a limit of 0 is honoured instead of
            # being replaced by the fallback.
            low = param.min_val if param.min_val is not None else 0.0
            high = param.max_val if param.max_val is not None else 1.0
            return low, high

        bounds = [_bounds(pid) for pid in self.goal_params]
        self._goal_low = np.array([b[0] for b in bounds], dtype=np.float32)
        self._goal_high = np.array([b[1] for b in bounds], dtype=np.float32)
        self._goal_range = self._goal_high - self._goal_low
        # Avoid division by zero for degenerate ranges.
        self._goal_range[self._goal_range == 0] = 1.0

        # Detect reward_fn arity for backward compat (2-arg vs 3-arg)
        self._reward_fn = reward_fn
        if reward_fn is not None:
            n_args = len(inspect.signature(reward_fn).parameters)
            self._reward_fn_wants_obs = n_args >= 3
        else:
            self._reward_fn_wants_obs = False

        # Observation subspace
        goal_set = set(self.goal_params)
        obs_spaces = {'EPISODE_TIME': spaces.Box(low=0, high=np.inf, shape=(1,), dtype=np.float32)}
        if simulator is not None:
            obs_spaces['SIM_UNCERTAINTY'] = spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32)
        for param_id, param in all_readable.items():
            if param_id in goal_set:
                continue
            space = _build_param_space(param)
            if space is not None:
                obs_spaces[param_id] = space

        n_goals = len(self.goal_params)
        self.observation_space = spaces.Dict({
            'observation': spaces.Dict(obs_spaces),
            'achieved_goal': spaces.Box(low=0.0, high=1.0, shape=(n_goals,), dtype=np.float32),
            'desired_goal': spaces.Box(low=0.0, high=1.0, shape=(n_goals,), dtype=np.float32),
        })

        # Action space
        action_spaces = {}
        for param_id, param in self.nucon.get_all_writable().items():
            if not param.is_readable or param.is_cheat:
                continue
            space = _build_param_space(param)
            if space is not None:
                action_spaces[param_id] = space
        self.action_space = spaces.Dict(action_spaces)

        # Accept names from Objectives as well as callables, like NuconEnv.
        self._terminators = []
        for terminator in (terminators or []):
            if isinstance(terminator, str) and terminator in Objectives:
                self._terminators.append(Objectives[terminator])
            elif callable(terminator):
                self._terminators.append(terminator)
            else:
                raise ValueError(f"Unsupported terminator: {terminator}")

        self._desired_goal = np.zeros(n_goals, dtype=np.float32)
        self._total_steps = 0

    @staticmethod
    def _is_info_batch(info, achieved_goal):
        """True when ``info`` is a per-sample sequence matching a batch of goals."""
        if not isinstance(info, (list, tuple, np.ndarray)):
            return False
        if np.ndim(achieved_goal) < 2 or np.ndim(info) != 1:
            return False
        return len(info) == len(achieved_goal)

    def compute_reward(self, achieved_goal, desired_goal, info):
        """Dense negative L2, sparse with tolerance, or custom reward_fn.

        Args:
            achieved_goal: normalised achieved goal(s), shape ``(n_goals,)``
                or ``(batch, n_goals)``.
            desired_goal: normalised desired goal(s), same shape.
            info: a single info dict, or a sequence of info dicts (one per
                batch row).

        Returns:
            Scalar reward, or an array of rewards for batched input.
        """
        if self._reward_fn is not None:
            if not self._reward_fn_wants_obs:
                return self._reward_fn(achieved_goal, desired_goal)
            if isinstance(info, dict):
                return self._reward_fn(achieved_goal, desired_goal, info.get('obs', {}))
            if self._is_info_batch(info, achieved_goal):
                # Evaluate per sample so each row sees its own observation;
                # otherwise the uncertainty term would be dropped for batches.
                return np.array([
                    float(self._reward_fn(ag, dg, entry.get('obs', {}) if isinstance(entry, dict) else {}))
                    for ag, dg, entry in zip(achieved_goal, desired_goal, info)
                ], dtype=np.float32)
            return self._reward_fn(achieved_goal, desired_goal, {})

        dist = np.linalg.norm(np.asarray(achieved_goal) - np.asarray(desired_goal), axis=-1)
        if self.tolerance is not None:
            # Sparse reward: 0 inside the tolerance, -1 outside.
            return (dist <= self.tolerance).astype(np.float32) - 1.0
        return -dist

    def _read_goal_values(self):
        """Read goal parameters and normalise them to [0, 1]."""
        raw = np.array([self.nucon.get(pid) or 0.0 for pid in self.goal_params], dtype=np.float32)
        return np.clip((raw - self._goal_low) / self._goal_range, 0.0, 1.0)

    def _get_obs_dict(self, sim_uncertainty=None):
        """Build the ``observation`` / ``achieved_goal`` / ``desired_goal`` dict."""
        obs_space = self.observation_space['observation'].spaces
        obs = {'EPISODE_TIME': float(self._total_steps * self.seconds_per_step)}
        if 'SIM_UNCERTAINTY' in obs_space:
            obs['SIM_UNCERTAINTY'] = sim_uncertainty if sim_uncertainty is not None else 0.0
        goal_set = set(self.goal_params)
        for param_id, param in self.nucon.get_all_readable().items():
            if param_id in goal_set or param_id not in obs_space:
                continue
            value = self.nucon.get(param_id)
            if isinstance(value, Enum):
                value = value.value
            obs[param_id] = value
        achieved = self._read_goal_values()
        return {
            'observation': obs,
            'achieved_goal': achieved,
            'desired_goal': self._desired_goal.copy(),
        }

    def reset(self, seed=None, options=None):
        """Sample a new desired goal and return ``(observation, info)``."""
        super().reset(seed=seed)
        self._total_steps = 0
        # Draw from the environment's own generator (seeded by super().reset)
        # so a single seeded reset makes all later goals reproducible too.
        self._desired_goal = self.np_random.uniform(
            0.0, 1.0, size=len(self.goal_params)
        ).astype(np.float32)
        return self._get_obs_dict(), {}

    def step(self, action):
        """Apply ``action``, advance time and return the gymnasium 5-tuple."""
        _apply_action(self.nucon, action)

        # Advance sim (or sleep)
        uncertainty = None
        if self.simulator is not None:
            uncertainty = self.simulator.update(self.seconds_per_step, return_uncertainty=True)
        else:
            sim_speed = self.nucon.GAME_SIM_SPEED.value or 1.0
            time.sleep(self.seconds_per_step / sim_speed)

        self._total_steps += 1
        obs = self._get_obs_dict(sim_uncertainty=uncertainty)
        info = {'achieved_goal': obs['achieved_goal'], 'desired_goal': obs['desired_goal'],
                'obs': obs['observation']}
        reward = float(self.compute_reward(obs['achieved_goal'], obs['desired_goal'], info))
        terminated = any(t(obs['observation']) > self.terminate_above for t in self._terminators)
        truncated = False
        return obs, reward, terminated, truncated, info

    def render(self):
        """Rendering is not implemented."""
        pass

    def close(self):
        """Nothing to release."""
        pass
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Registration
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def register_nucon_envs():
    """Register the bundled NuCon environments with gymnasium.

    Each entry below is ``(env id, entry point, constructor kwargs)`` and is
    passed to ``gym.register`` in order.
    """
    env_specs = [
        (
            'Nucon-max_power-v0',
            'nucon.rl:NuconEnv',
            {'seconds_per_step': 5, 'objectives': ['max_power']},
        ),
        (
            'Nucon-target_temperature_350-v0',
            'nucon.rl:NuconEnv',
            {
                'seconds_per_step': 5,
                'objectives': [Parameterized_Objectives['target_temperature'](goal_temp=350)],
            },
        ),
        (
            'Nucon-safe_max_power-v0',
            'nucon.rl:NuconEnv',
            {
                'seconds_per_step': 5,
                # Keep the core between 310 and 365 while maximising power.
                'objectives': [
                    Parameterized_Objectives['temp_above'](min_temp=310),
                    Parameterized_Objectives['temp_below'](max_temp=365),
                    'max_power',
                ],
                'objective_weights': [1, 10, 1/100_000],
            },
        ),
        (
            'Nucon-goal_power-v0',
            'nucon.rl:NuconGoalEnv',
            {
                'goal_params': ['GENERATOR_0_KW', 'GENERATOR_1_KW', 'GENERATOR_2_KW'],
                'goal_range': {
                    'GENERATOR_0_KW': (0.0, 1200.0),
                    'GENERATOR_1_KW': (0.0, 1200.0),
                    'GENERATOR_2_KW': (0.0, 1200.0),
                },
                'seconds_per_step': 5,
            },
        ),
        (
            'Nucon-goal_temp-v0',
            'nucon.rl:NuconGoalEnv',
            {
                'goal_params': ['CORE_TEMP'],
                'goal_range': {'CORE_TEMP': (280.0, 380.0)},
                'seconds_per_step': 5,
            },
        ),
    ]
    for env_id, entry_point, env_kwargs in env_specs:
        gym.register(id=env_id, entry_point=entry_point, kwargs=env_kwargs)
|
|
|
|
# Module-level call: the environments are registered as a side effect of
# importing this module.
register_nucon_envs()
|