fix: make collect_data resilient to game crashes

- Save dataset every N steps (default 10) so a disconnect loses at most
  one checkpoint's worth of samples instead of everything
- Retry _get_state() on ConnectionError/Timeout rather than crashing,
  resuming automatically once the game comes back up

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Dominik Moritz Roth 2026-03-12 17:52:17 +01:00
parent ce2019e060
commit 088b7d4733

View File

@@ -203,28 +203,56 @@ class NuconModelLearner:
state[key] = valves.get(name, {}).get('Value', 0.0)
return state
def collect_data(self, num_steps, save_every=10):
    """
    Collect state-transition tuples from the live game.

    Sleeps wall_time = target_game_delta / sim_speed so that each stored
    game_delta is uniform regardless of the game's simulation speed setting.

    The dataset is checkpointed every ``save_every`` steps so a crash loses
    at most one checkpoint's worth of samples. On a connection error the
    state read is retried every 5 s until the game is reachable again and
    collection then resumes; the transition spanning the outage is still
    recorded with the nominal ``target_game_delta``.

    Args:
        num_steps: number of (state, action, next_state, dt) tuples to record.
        save_every: checkpoint interval, in steps (default 10).
    """
    # Local import: only needed here for the exception types raised by
    # the HTTP layer underneath self._get_state().
    import requests as _requests

    def get_state_with_retry():
        # Block until the game responds; a crashed/restarting game just
        # delays collection instead of aborting it.
        while True:
            try:
                return self._get_state()
            except (_requests.exceptions.ConnectionError,
                    _requests.exceptions.Timeout) as e:
                print(f"Connection lost ({e}). Retrying in 5 s…")
                time.sleep(5)

    state = get_state_with_retry()
    collected = 0
    for _ in range(num_steps):
        action = self.actor(state)
        for param_id, value in action.items():
            try:
                self.nucon.set(param_id, value)
            except Exception:
                # Best-effort write: some parameters may be read-only or
                # temporarily unsettable; skip rather than abort the run.
                pass
        target_game_delta = self.time_delta()
        try:
            sim_speed = self.nucon.GAME_SIM_SPEED.value or 1.0
        except Exception:
            # Assume real-time if the speed setting can't be read.
            sim_speed = 1.0
        # Wall-clock sleep scaled so the *game-time* delta stays uniform.
        time.sleep(target_game_delta / sim_speed)
        next_state = get_state_with_retry()
        self.dataset.append((state, action, next_state, target_game_delta))
        state = next_state
        collected += 1
        if collected % save_every == 0:
            self.save_dataset()
            print(f" {collected}/{num_steps} steps collected, dataset saved.")
    # Final save so a partial last checkpoint isn't lost.
    self.save_dataset()
    print(f"Collection complete. {collected} steps, {len(self.dataset)} total samples.")
def train_model(self, batch_size=32, num_epochs=10, test_split=0.2):
"""Train a neural-network dynamics model on the current dataset."""