fix: flat Box action space, SB3/HER compatibility, sim uninitialized param defaults
rl.py:
- Action space is now a flat Box (SAC/PPO require this, not Dict)
- _build_flat_action_space + _unflatten_action helpers shared by both envs
- Params with undefined bounds excluded from action space (SAC needs finite bounds)
- Fix _build_param_space: use `is not None` check instead of falsy `or` (0 is valid min_val)
- NuconGoalEnv obs params default to simulator.model.input_params when sim provided; obs_params kwarg overrides for real-game deployment with same param set
- SIM_UNCERTAINTY kept out of policy obs vector (not available at deployment); available in reward_obs passed to objectives/terminators/reward_fn
- _read_obs returns (gym_obs, reward_obs) cleanly instead of smuggling via dict
- NuconGoalEnv additional_objectives wired into step()

sim.py:
- Uninitialized params return type-default (0/False/first-enum) instead of "None"
- Enum params serialised as integer value, not repr string

README.md:
- Fix HerReplayBuffer import path (sb3 2.x: her.her_replay_buffer)
- Remove non-existent simulator.run() call
- Fix broken anchor links, remove "work in progress" from intro

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
845ca708a7
commit
3dfe1aa673
13
README.md
13
README.md
@ -6,7 +6,7 @@
|
|||||||
|
|
||||||
NuCon (Nucleares Controller) is a Python library designed to interface with and control parameters in [Nucleares](https://store.steampowered.com/app/1428420/Nucleares/), a nuclear reactor simulation game. It provides a robust, type-safe foundation for reading and writing game parameters, allowing users to easily create their own automations and control systems.
|
NuCon (Nucleares Controller) is a Python library designed to interface with and control parameters in [Nucleares](https://store.steampowered.com/app/1428420/Nucleares/), a nuclear reactor simulation game. It provides a robust, type-safe foundation for reading and writing game parameters, allowing users to easily create their own automations and control systems.
|
||||||
|
|
||||||
NuCon further provides a work in progress implementation of a reinforcement learning environment for training control policies and a simulator based on model learning.
|
NuCon further provides a reinforcement learning environment for training control policies and a simulator based on model learning.
|
||||||
|
|
||||||
> [!NOTE]
|
> [!NOTE]
|
||||||
> NuCon is compatible with Nucleares v2.2.25.213. The game exposes a rich set of writable parameters including individual rod bank positions (`ROD_BANK_POS_{0-8}_ORDERED`), pump speeds, MSCV and turbine bypass setpoints, and various switches. Core chemistry parameters (e.g. Xenon concentration) are still read-only. Development on the advanced features (Reinforcement / Model Learning) is ongoing.
|
> NuCon is compatible with Nucleares v2.2.25.213. The game exposes a rich set of writable parameters including individual rod bank positions (`ROD_BANK_POS_{0-8}_ORDERED`), pump speeds, MSCV and turbine bypass setpoints, and various switches. Core chemistry parameters (e.g. Xenon concentration) are still read-only. Development on the advanced features (Reinforcement / Model Learning) is ongoing.
|
||||||
@ -198,7 +198,7 @@ HER works by relabelling past trajectories with the goal that was *actually achi
|
|||||||
```python
|
```python
|
||||||
from nucon.rl import NuconGoalEnv, Parameterized_Objectives, Parameterized_Terminators
|
from nucon.rl import NuconGoalEnv, Parameterized_Objectives, Parameterized_Terminators
|
||||||
from stable_baselines3 import SAC
|
from stable_baselines3 import SAC
|
||||||
from stable_baselines3.common.buffers import HerReplayBuffer
|
from stable_baselines3.her.her_replay_buffer import HerReplayBuffer
|
||||||
|
|
||||||
|
|
||||||
env = NuconGoalEnv(
|
env = NuconGoalEnv(
|
||||||
@ -284,10 +284,7 @@ simulator.load_model('path/to/model.pth')
|
|||||||
# Set initial state (optional)
|
# Set initial state (optional)
|
||||||
simulator.set_state(OperatingState.NOMINAL)
|
simulator.set_state(OperatingState.NOMINAL)
|
||||||
|
|
||||||
# Run the simulator, will start the web server
|
# The web server starts automatically in __init__; access via nucon using the simulator's port
|
||||||
simulator.run()
|
|
||||||
|
|
||||||
# Access via nucon by using the simulator's port
|
|
||||||
nucon = Nucon(port=simulator.port)
|
nucon = Nucon(port=simulator.port)
|
||||||
|
|
||||||
# Or use the simulator with NuconEnv
|
# Or use the simulator with NuconEnv
|
||||||
@ -407,11 +404,11 @@ The recommended end-to-end workflow for training an RL operator is an iterative
|
|||||||
└─────────────────────┘
|
└─────────────────────┘
|
||||||
```
|
```
|
||||||
|
|
||||||
**Step 1 — Human dataset collection**: Run `NuconModelLearner.collect_data()` during your play session. Cover a wide range of states: startup from cold, ramping power, individual rod bank adjustments. Diversity in the dataset directly determines simulator accuracy. See [Model Learning](#model-learning-work-in-progress) for collection details.
|
**Step 1 — Human dataset collection**: Run `NuconModelLearner.collect_data()` during your play session. Cover a wide range of states: startup from cold, ramping power, individual rod bank adjustments. Diversity in the dataset directly determines simulator accuracy. See [Model Learning](#model-learning) for collection details.
|
||||||
|
|
||||||
**Step 2 — Initial model fitting**: Fit a kNN-GP model (instant) or NN (better extrapolation with larger datasets) using `fit_knn()` or `train_model()`. Prune near-duplicate samples with `drop_redundant()` before fitting. See [Model Learning](#model-learning).
|
**Step 2 — Initial model fitting**: Fit a kNN-GP model (instant) or NN (better extrapolation with larger datasets) using `fit_knn()` or `train_model()`. Prune near-duplicate samples with `drop_redundant()` before fitting. See [Model Learning](#model-learning).
|
||||||
|
|
||||||
**Step 3 — Train RL in simulator**: Load the fitted model into `NuconSimulator`, then train a `NuconGoalEnv` policy with SAC + HER. The simulator runs far faster than the real game, allowing many trajectories in reasonable time. Pass `UncertaintyPenalty` and `UncertaintyAbort` as objectives/terminators to discourage the policy from wandering into regions the model hasn't seen; `SIM_UNCERTAINTY` is automatically injected into the obs dict when a simulator is active. See [NuconGoalEnv + HER Usage](#nucongoalenv--her-usage).
|
**Step 3 — Train RL in simulator**: Load the fitted model into `NuconSimulator`, then train a `NuconGoalEnv` policy with SAC + HER. The simulator runs far faster than the real game, allowing many trajectories in reasonable time. Pass `Parameterized_Objectives['uncertainty_penalty']` and `Parameterized_Terminators['uncertainty_abort']` as additional objectives/terminators to discourage the policy from wandering into regions the model hasn't seen; `SIM_UNCERTAINTY` is automatically injected into the obs dict when a simulator is active. See [NuconGoalEnv + HER Usage](#nucongoalenv--her-usage).
|
||||||
|
|
||||||
**Step 4 — Eval in game + collect new data**: Run the trained policy against the real game. This validates simulator accuracy and simultaneously collects new data from states the policy visits, which may be regions the original dataset missed. Run a second `NuconModelLearner` in a background thread to collect concurrently.
|
**Step 4 — Eval in game + collect new data**: Run the trained policy against the real game. This validates simulator accuracy and simultaneously collects new data from states the policy visits, which may be regions the original dataset missed. Run a second `NuconModelLearner` in a background thread to collect concurrently.
|
||||||
|
|
||||||
|
|||||||
144
nucon/rl.py
144
nucon/rl.py
@ -49,11 +49,38 @@ Parameterized_Terminators = {
|
|||||||
# Internal helpers
|
# Internal helpers
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _build_flat_action_space(nucon, obs_param_set=None):
    """Return ``(Box, ordered_param_ids)`` describing the flat action space.

    A parameter from ``nucon.get_all_writable()`` is included only when all of
    the following hold:

    * it is readable and not a cheat parameter,
    * it is a member of ``obs_param_set`` (when that argument is given),
    * both ``min_val`` and ``max_val`` are defined,
    * ``_build_param_space`` supports its type, and
    * the resulting float32 bounds are finite.

    Args:
        nucon: Object exposing ``get_all_writable()`` -> ``{param_id: param}``.
        obs_param_set: Optional container of param ids; when not ``None``,
            params outside it are skipped.

    Returns:
        A tuple ``(box, param_ids)``. ``box`` is a 1-D float32 ``spaces.Box``
        whose i-th low/high bound belongs to ``param_ids[i]``; the order is
        the iteration order of ``nucon.get_all_writable()``.
    """
    param_ids = []
    lows, highs = [], []
    for param_id, param in nucon.get_all_writable().items():
        if not param.is_readable or param.is_cheat:
            continue
        if obs_param_set is not None and param_id not in obs_param_set:
            continue
        if param.min_val is None or param.max_val is None:
            continue  # SAC requires finite action bounds
        sp = _build_param_space(param)
        if sp is None:
            continue
        low, high = sp.low[0], sp.high[0]
        # A bound can be defined yet still unusable: an explicit +/-inf or NaN,
        # or a magnitude that overflows to inf once cast to float32.
        if not (np.isfinite(low) and np.isfinite(high)):
            continue
        param_ids.append(param_id)
        lows.append(low)
        highs.append(high)
    box = spaces.Box(
        low=np.array(lows, dtype=np.float32),
        high=np.array(highs, dtype=np.float32),
        dtype=np.float32,
    )
    return box, param_ids
|
||||||
|
|
||||||
|
|
||||||
|
def _unflatten_action(flat_action, param_ids):
|
||||||
|
return {pid: float(flat_action[i]) for i, pid in enumerate(param_ids)}
|
||||||
|
|
||||||
|
|
||||||
def _build_param_space(param):
|
def _build_param_space(param):
|
||||||
"""Return a gymnasium Box for a single NuconParameter, or None if unsupported."""
|
"""Return a gymnasium Box for a single NuconParameter, or None if unsupported."""
|
||||||
if param.param_type == float:
|
if param.param_type in (float, int):
|
||||||
return spaces.Box(low=param.min_val or -np.inf, high=param.max_val or np.inf, shape=(1,), dtype=np.float32)
|
|
||||||
elif param.param_type == int:
|
|
||||||
lo = param.min_val if param.min_val is not None else -np.inf
|
lo = param.min_val if param.min_val is not None else -np.inf
|
||||||
hi = param.max_val if param.max_val is not None else np.inf
|
hi = param.max_val if param.max_val is not None else np.inf
|
||||||
return spaces.Box(low=lo, high=hi, shape=(1,), dtype=np.float32)
|
return spaces.Box(low=lo, high=hi, shape=(1,), dtype=np.float32)
|
||||||
@ -111,15 +138,7 @@ class NuconEnv(gym.Env):
|
|||||||
obs_spaces[param_id] = sp
|
obs_spaces[param_id] = sp
|
||||||
self.observation_space = spaces.Dict(obs_spaces)
|
self.observation_space = spaces.Dict(obs_spaces)
|
||||||
|
|
||||||
# Action space
|
self.action_space, self._action_params = _build_flat_action_space(self.nucon)
|
||||||
action_spaces = {}
|
|
||||||
for param_id, param in self.nucon.get_all_writable().items():
|
|
||||||
if not param.is_readable or param.is_cheat:
|
|
||||||
continue
|
|
||||||
sp = _build_param_space(param)
|
|
||||||
if sp is not None:
|
|
||||||
action_spaces[param_id] = sp
|
|
||||||
self.action_space = spaces.Dict(action_spaces)
|
|
||||||
|
|
||||||
self.objectives = []
|
self.objectives = []
|
||||||
self.terminators = []
|
self.terminators = []
|
||||||
@ -168,7 +187,7 @@ class NuconEnv(gym.Env):
|
|||||||
return observation, self._get_info(observation)
|
return observation, self._get_info(observation)
|
||||||
|
|
||||||
def step(self, action):
|
def step(self, action):
|
||||||
_apply_action(self.nucon, action)
|
_apply_action(self.nucon, _unflatten_action(action, self._action_params))
|
||||||
|
|
||||||
# Advance sim (or sleep) — get uncertainty for obs injection
|
# Advance sim (or sleep) — get uncertainty for obs injection
|
||||||
truncated = False
|
truncated = False
|
||||||
@ -252,6 +271,7 @@ class NuconGoalEnv(gym.Env):
|
|||||||
terminate_above=0,
|
terminate_above=0,
|
||||||
additional_objectives=None,
|
additional_objectives=None,
|
||||||
additional_objective_weights=None,
|
additional_objective_weights=None,
|
||||||
|
obs_params=None,
|
||||||
):
|
):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
@ -291,34 +311,35 @@ class NuconGoalEnv(gym.Env):
|
|||||||
else:
|
else:
|
||||||
self._reward_fn_wants_obs = False
|
self._reward_fn_wants_obs = False
|
||||||
|
|
||||||
# Observation subspace
|
# Observation params: model.input_params defines the canonical list — the same set is
|
||||||
|
# used whether training in sim or deploying to the real game (the game simply has more
|
||||||
|
# params available; we query only the subset we care about).
|
||||||
|
# Explicit obs_params overrides everything (use when deploying to real game without sim).
|
||||||
|
# SB3 HER requires observation to be a flat Box, not a nested Dict.
|
||||||
goal_set = set(self.goal_params)
|
goal_set = set(self.goal_params)
|
||||||
obs_spaces = {'EPISODE_TIME': spaces.Box(low=0, high=np.inf, shape=(1,), dtype=np.float32)}
|
self._obs_with_uncertainty = simulator is not None
|
||||||
if simulator is not None:
|
if obs_params is not None:
|
||||||
obs_spaces['SIM_UNCERTAINTY'] = spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32)
|
base_params = [p for p in obs_params if p not in goal_set]
|
||||||
for param_id, param in all_readable.items():
|
elif simulator is not None and hasattr(simulator, 'model') and simulator.model is not None:
|
||||||
if param_id in goal_set:
|
base_params = [p for p in simulator.model.input_params
|
||||||
continue
|
if p not in goal_set and p in all_readable
|
||||||
sp = _build_param_space(param)
|
and _build_param_space(all_readable[p]) is not None]
|
||||||
if sp is not None:
|
else:
|
||||||
obs_spaces[param_id] = sp
|
base_params = [p for p, param in all_readable.items()
|
||||||
|
if p not in goal_set and _build_param_space(param) is not None]
|
||||||
|
# SIM_UNCERTAINTY is not in _obs_params — it's not available at deployment on the real game
|
||||||
|
self._obs_params = base_params
|
||||||
|
|
||||||
n_goals = len(self.goal_params)
|
n_goals = len(self.goal_params)
|
||||||
self.observation_space = spaces.Dict({
|
self.observation_space = spaces.Dict({
|
||||||
'observation': spaces.Dict(obs_spaces),
|
'observation': spaces.Box(low=-np.inf, high=np.inf,
|
||||||
|
shape=(len(self._obs_params),), dtype=np.float32),
|
||||||
'achieved_goal': spaces.Box(low=0.0, high=1.0, shape=(n_goals,), dtype=np.float32),
|
'achieved_goal': spaces.Box(low=0.0, high=1.0, shape=(n_goals,), dtype=np.float32),
|
||||||
'desired_goal': spaces.Box(low=0.0, high=1.0, shape=(n_goals,), dtype=np.float32),
|
'desired_goal': spaces.Box(low=0.0, high=1.0, shape=(n_goals,), dtype=np.float32),
|
||||||
})
|
})
|
||||||
|
|
||||||
# Action space
|
# Action space: writable params within the obs param set (flat Box for SB3 compatibility).
|
||||||
action_spaces = {}
|
self.action_space, self._action_params = _build_flat_action_space(self.nucon, set(base_params))
|
||||||
for param_id, param in self.nucon.get_all_writable().items():
|
|
||||||
if not param.is_readable or param.is_cheat:
|
|
||||||
continue
|
|
||||||
sp = _build_param_space(param)
|
|
||||||
if sp is not None:
|
|
||||||
action_spaces[param_id] = sp
|
|
||||||
self.action_space = spaces.Dict(action_spaces)
|
|
||||||
|
|
||||||
self._terminators = terminators or []
|
self._terminators = terminators or []
|
||||||
_objs = additional_objectives or []
|
_objs = additional_objectives or []
|
||||||
@ -329,10 +350,10 @@ class NuconGoalEnv(gym.Env):
|
|||||||
|
|
||||||
def compute_reward(self, achieved_goal, desired_goal, info):
|
def compute_reward(self, achieved_goal, desired_goal, info):
|
||||||
"""Dense negative L2, sparse with tolerance, or custom reward_fn."""
|
"""Dense negative L2, sparse with tolerance, or custom reward_fn."""
|
||||||
obs = info.get('obs', {}) if isinstance(info, dict) else {}
|
obs_named = info.get('obs_named', {}) if isinstance(info, dict) else {}
|
||||||
if self._reward_fn is not None:
|
if self._reward_fn is not None:
|
||||||
if self._reward_fn_wants_obs:
|
if self._reward_fn_wants_obs:
|
||||||
return self._reward_fn(achieved_goal, desired_goal, obs)
|
return self._reward_fn(achieved_goal, desired_goal, obs_named)
|
||||||
return self._reward_fn(achieved_goal, desired_goal)
|
return self._reward_fn(achieved_goal, desired_goal)
|
||||||
dist = np.linalg.norm(achieved_goal - desired_goal, axis=-1)
|
dist = np.linalg.norm(achieved_goal - desired_goal, axis=-1)
|
||||||
if self.tolerance is not None:
|
if self.tolerance is not None:
|
||||||
@ -343,52 +364,53 @@ class NuconGoalEnv(gym.Env):
|
|||||||
raw = np.array([self.nucon.get(pid) or 0.0 for pid in self.goal_params], dtype=np.float32)
|
raw = np.array([self.nucon.get(pid) or 0.0 for pid in self.goal_params], dtype=np.float32)
|
||||||
return np.clip((raw - self._goal_low) / self._goal_range, 0.0, 1.0)
|
return np.clip((raw - self._goal_low) / self._goal_range, 0.0, 1.0)
|
||||||
|
|
||||||
def _get_obs_dict(self, sim_uncertainty=None):
|
def _read_obs(self, sim_uncertainty=None):
|
||||||
obs = {'EPISODE_TIME': float(self._total_steps * self.seconds_per_step)}
|
"""Return (gym_obs_dict, reward_obs_dict).
|
||||||
if 'SIM_UNCERTAINTY' in self.observation_space['observation'].spaces:
|
|
||||||
obs['SIM_UNCERTAINTY'] = sim_uncertainty if sim_uncertainty is not None else 0.0
|
gym_obs_dict — flat Box observation for the policy (no SIM_UNCERTAINTY).
|
||||||
goal_set = set(self.goal_params)
|
reward_obs_dict — same values plus SIM_UNCERTAINTY for objectives/terminators/reward_fn.
|
||||||
for param_id, param in self.nucon.get_all_readable().items():
|
"""
|
||||||
if param_id in goal_set or param_id not in self.observation_space['observation'].spaces:
|
reward_obs = {}
|
||||||
continue
|
if self._obs_with_uncertainty:
|
||||||
|
reward_obs['SIM_UNCERTAINTY'] = float(sim_uncertainty) if sim_uncertainty is not None else 0.0
|
||||||
|
for param_id in self._obs_params:
|
||||||
value = self.nucon.get(param_id)
|
value = self.nucon.get(param_id)
|
||||||
if isinstance(value, Enum):
|
if isinstance(value, Enum):
|
||||||
value = value.value
|
value = value.value
|
||||||
obs[param_id] = value
|
reward_obs[param_id] = float(value) if value is not None else 0.0
|
||||||
|
|
||||||
|
obs_vec = np.array([reward_obs[p] for p in self._obs_params], dtype=np.float32)
|
||||||
achieved = self._read_goal_values()
|
achieved = self._read_goal_values()
|
||||||
return {
|
gym_obs = {'observation': obs_vec, 'achieved_goal': achieved,
|
||||||
'observation': obs,
|
'desired_goal': self._desired_goal.copy()}
|
||||||
'achieved_goal': achieved,
|
return gym_obs, reward_obs
|
||||||
'desired_goal': self._desired_goal.copy(),
|
|
||||||
}
|
|
||||||
|
|
||||||
def reset(self, seed=None, options=None):
|
def reset(self, seed=None, options=None):
|
||||||
super().reset(seed=seed)
|
super().reset(seed=seed)
|
||||||
self._total_steps = 0
|
self._total_steps = 0
|
||||||
rng = np.random.default_rng(seed)
|
rng = np.random.default_rng(seed)
|
||||||
self._desired_goal = rng.uniform(0.0, 1.0, size=len(self.goal_params)).astype(np.float32)
|
self._desired_goal = rng.uniform(0.0, 1.0, size=len(self.goal_params)).astype(np.float32)
|
||||||
return self._get_obs_dict(), {}
|
gym_obs, _ = self._read_obs()
|
||||||
|
return gym_obs, {}
|
||||||
|
|
||||||
def step(self, action):
|
def step(self, action):
|
||||||
_apply_action(self.nucon, action)
|
_apply_action(self.nucon, _unflatten_action(action, self._action_params))
|
||||||
|
|
||||||
# Advance sim (or sleep)
|
|
||||||
uncertainty = None
|
|
||||||
if self.simulator:
|
if self.simulator:
|
||||||
uncertainty = self.simulator.update(self.seconds_per_step, return_uncertainty=True)
|
uncertainty = self.simulator.update(self.seconds_per_step, return_uncertainty=True)
|
||||||
else:
|
else:
|
||||||
sim_speed = self.nucon.GAME_SIM_SPEED.value or 1.0
|
sim_speed = self.nucon.GAME_SIM_SPEED.value or 1.0
|
||||||
time.sleep(self.seconds_per_step / sim_speed)
|
time.sleep(self.seconds_per_step / sim_speed)
|
||||||
|
uncertainty = None
|
||||||
|
|
||||||
self._total_steps += 1
|
self._total_steps += 1
|
||||||
obs = self._get_obs_dict(sim_uncertainty=uncertainty)
|
gym_obs, reward_obs = self._read_obs(sim_uncertainty=uncertainty)
|
||||||
info = {'achieved_goal': obs['achieved_goal'], 'desired_goal': obs['desired_goal'],
|
info = {'achieved_goal': gym_obs['achieved_goal'], 'desired_goal': gym_obs['desired_goal'],
|
||||||
'obs': obs['observation']}
|
'obs_named': reward_obs}
|
||||||
reward = float(self.compute_reward(obs['achieved_goal'], obs['desired_goal'], info))
|
reward = float(self.compute_reward(gym_obs['achieved_goal'], gym_obs['desired_goal'], info))
|
||||||
reward += sum(w * o(obs['observation']) for o, w in zip(self._objectives, self._objective_weights))
|
reward += sum(w * o(reward_obs) for o, w in zip(self._objectives, self._objective_weights))
|
||||||
terminated = any(t(obs['observation']) > self.terminate_above for t in self._terminators)
|
terminated = any(t(reward_obs) > self.terminate_above for t in self._terminators)
|
||||||
truncated = False
|
return gym_obs, reward, terminated, False, info
|
||||||
return obs, reward, terminated, truncated, info
|
|
||||||
|
|
||||||
def render(self):
|
def render(self):
|
||||||
pass
|
pass
|
||||||
|
|||||||
@ -330,6 +330,14 @@ class NuconSimulator:
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
value = self.get(variable)
|
value = self.get(variable)
|
||||||
|
if value is None:
|
||||||
|
param = self._nucon[variable]
|
||||||
|
if param.enum_type is not None:
|
||||||
|
value = next(iter(param.enum_type)).value # first enum member's int value
|
||||||
|
else:
|
||||||
|
value = param.param_type() # int()->0, float()->0.0, bool()->False
|
||||||
|
if isinstance(value, Enum):
|
||||||
|
value = value.value
|
||||||
return str(value), 200
|
return str(value), 200
|
||||||
except (KeyError, AttributeError):
|
except (KeyError, AttributeError):
|
||||||
return jsonify({"error": f"Unknown variable: {variable}"}), 404
|
return jsonify({"error": f"Unknown variable: {variable}"}), 404
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user