From e4c9f047d01e5977630f7f7ae97aec1e74a8970b Mon Sep 17 00:00:00 2001
From: Dominik Roth
Date: Thu, 10 Oct 2024 17:27:12 +0200
Subject: [PATCH] Add PPO example to README

---
 README.md | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/README.md b/README.md
index 494124e..5b0b647 100644
--- a/README.md
+++ b/README.md
@@ -138,6 +138,44 @@ env.close()
 
 Objectives takes either strings of the name of predefined objectives, or lambda functions which take an observation and return a scalar reward. Final rewards are (weighted) summed across all objectives. `info['objectives']` contains all objectives and their values.
 
+You can, for example, train a PPO agent using the [sb3](https://github.com/DLR-RM/stable-baselines3) implementation:
+```python
+from nucon.rl import NuconEnv
+from stable_baselines3 import PPO
+
+env = NuconEnv(objectives=['max_power'], seconds_per_step=5)
+
+# Create the PPO (Proximal Policy Optimization) model
+model = PPO(
+    "MlpPolicy",
+    env,
+    verbose=1,
+    learning_rate=3e-4,  # You can adjust hyperparameters as needed
+    n_steps=2048,
+    batch_size=64,
+    n_epochs=10,
+    gamma=0.99,
+    gae_lambda=0.95,
+    clip_range=0.2,
+    ent_coef=0.01
+)
+
+# Train the model
+model.learn(total_timesteps=100000)  # Adjust total_timesteps as needed
+
+# Test the trained model
+obs, info = env.reset()
+for _ in range(1000):
+    action, _states = model.predict(obs, deterministic=True)
+    obs, reward, terminated, truncated, info = env.step(action)
+
+    if terminated or truncated:
+        obs, info = env.reset()
+
+# Close the environment
+env.close()
+```
+
 But theres a problem: RL algorithms require a huge amount of training steps to get passable policies, and Nucleares is a very slow simulation and can not be trivially parallelized. That's why NuCon also provides a
 
 ## Simulator (Work in Progress)
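The README context quoted in the hunk notes that `objectives` also accepts lambda functions mapping an observation to a scalar reward. A minimal sketch of that usage follows; mixing a named objective with a lambda, and the `obs[0]` indexing, are assumptions for illustration, since the patch does not show NuCon's observation layout:

```python
from nucon.rl import NuconEnv

# Sketch: combine a predefined objective (by name) with a custom lambda objective.
# NOTE: obs[0] and the target value 350.0 are purely illustrative; the real
# observation layout depends on NuCon's environment spaces.
env = NuconEnv(
    objectives=[
        'max_power',                       # predefined objective, referenced by name
        lambda obs: -abs(obs[0] - 350.0),  # hypothetical: penalize deviation from a target reading
    ],
    seconds_per_step=5,
)
```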
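Since, as the quoted context points out, training against the live simulation is slow, it may also be worth persisting the trained policy rather than retraining it. A small follow-up using stable-baselines3's standard save/load API (the file name is arbitrary):

```python
from stable_baselines3 import PPO

# Persist the trained policy so the slow live-simulation training
# does not have to be repeated, then reload it for further use.
model.save("ppo_nucon_max_power")                 # file name is arbitrary
model = PPO.load("ppo_nucon_max_power", env=env)  # reattach the environment
```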