From 336c96bb7b8a5ff395884ca734fc4537bff6efdf Mon Sep 17 00:00:00 2001 From: "ys1087@partner.kit.edu" Date: Tue, 22 Jul 2025 16:15:30 +0200 Subject: [PATCH] Add HoReKa cluster support with SLURM and wandb integration - Add complete HoReKa installation guide without conda dependency - Include SLURM job script with GPU configuration and account setup - Add helper scripts for job submission and environment testing - Integrate wandb logging with both online and offline modes - Support MuJoCo Playground environments for humanoid control - Update README with clear separation of added vs original content --- README.md | 97 ++++++++++++++++++++++++++++++++++++++ run_fasttd3.slurm | 44 ++++++++++++++++++ submit_job.py | 116 ++++++++++++++++++++++++++++++++++++++++++++++ test_setup.py | 94 +++++++++++++++++++++++++++++++++++++ 4 files changed, 351 insertions(+) create mode 100644 run_fasttd3.slurm create mode 100755 submit_job.py create mode 100644 test_setup.py diff --git a/README.md b/README.md index 9ccadf7..af6aefe 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,103 @@ FastTD3 is a high-performance variant of the Twin Delayed Deep Deterministic Pol For more information, please see our [project webpage](https://younggyo.me/fast_td3) +## 🏔️ HoReKa Cluster Setup +*Added by Dominik - Custom setup for HoReKa supercomputer* + +**Quick Setup for HoReKa Users:** + +This repository includes optimized scripts for running FastTD3 on the HoReKa supercomputer cluster with SLURM job scheduling and wandb logging. 
+ +### Installation on HoReKa + +```bash +# Clone the repository +git clone https://github.com/younggyoseo/FastTD3.git +cd FastTD3 + +# Install Python 3.10 locally (HoReKa doesn't provide conda) +mkdir -p $HOME/.local/python-3.10 +cd /tmp +curl -O https://www.python.org/ftp/python/3.10.14/Python-3.10.14.tgz +tar -xzf Python-3.10.14.tgz +cd Python-3.10.14 +./configure --prefix=$HOME/.local/python-3.10 --enable-optimizations --with-ensurepip=install +make -j$(nproc) +make install + +# Add to PATH +echo 'export PATH="$HOME/.local/python-3.10/bin:$PATH"' >> ~/.bashrc +echo 'export PATH="$HOME/.local/python-3.10/bin:$PATH"' >> ~/.zshrc +export PATH="$HOME/.local/python-3.10/bin:$PATH" + +# Go back to FastTD3 directory +cd $HOME/path/to/FastTD3 + +# Create virtual environment and install dependencies +$HOME/.local/python-3.10/bin/python3.10 -m venv .venv +source .venv/bin/activate +pip install --upgrade pip +pip install -r requirements/requirements.txt +pip install git+https://github.com/younggyoseo/mujoco_playground.git + +# Test installation +python test_setup.py +``` + +### Running on HoReKa + +**Easy submission:** +```bash +python submit_job.py +``` + +**Manual submission:** +```bash +sbatch run_fasttd3.slurm +``` + +**Monitor jobs:** +```bash +# Check job status +squeue -u $USER + +# View output +tail -f fasttd3_${JOB_ID}.out + +# Cancel job if needed +scancel ${JOB_ID} +``` + +### Configuration + +The setup includes: +- **SLURM script** (`run_fasttd3.slurm`) configured for accelerated partition with GPU +- **Job helper** (`submit_job.py`) for easy job submission with wandb setup +- **Test script** (`test_setup.py`) for environment verification +- **MuJoCo Playground environment** (`T1JoystickFlatTerrain`) for humanoid control +- **Automatic GPU detection** and CUDA 12.4 compatibility +- **Wandb logging** with offline mode support + +### Wandb Integration + +The scripts support both online and offline wandb logging: + +**Online mode:** +```bash +export WANDB_API_KEY=your_api_key_here 
+python submit_job.py
+# Select 'y' when prompted for online mode
+```
+
+**Offline mode (default):**
+```bash
+# Jobs run in offline mode by default
+# Sync later with: wandb sync wandb/offline-run-*
+```
+
+---
+
+# ORIGINAL README:
 
 ## ❗ Updates
 
diff --git a/run_fasttd3.slurm b/run_fasttd3.slurm
new file mode 100644
index 0000000..1f59e97
--- /dev/null
+++ b/run_fasttd3.slurm
@@ -0,0 +1,47 @@
+#!/bin/bash
+#SBATCH --job-name=fasttd3_test
+#SBATCH --account=hk-project-p0022232
+#SBATCH --partition=accelerated
+#SBATCH --time=02:00:00
+#SBATCH --gres=gpu:1
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=8
+#SBATCH --mem=32G
+#SBATCH --output=fasttd3_%j.out
+#SBATCH --error=fasttd3_%j.err
+
+# Load necessary modules
+module purge
+module load toolkit/CUDA/12.4
+
+# Navigate to the project directory; abort rather than train somewhere else
+cd "$SLURM_SUBMIT_DIR" || exit 1
+
+# Activate the virtual environment; abort if it is missing
+source .venv/bin/activate || exit 1
+
+# SLURM already sets CUDA_VISIBLE_DEVICES for the GPU granted via --gres;
+# do not override it, or the job may target a GPU it was not allocated.
+export JAX_PLATFORMS="gpu,cpu"
+
+# Ensure wandb is logged in (set WANDB_API_KEY environment variable)
+# export WANDB_API_KEY=your_api_key_here
+# For testing, use offline mode
+export WANDB_MODE=offline
+
+# Run FastTD3 training with MuJoCo Playground environment
+python fast_td3/train.py \
+    --env_name T1JoystickFlatTerrain \
+    --exp_name FastTD3_HoReKa_Test \
+    --seed 42 \
+    --total_timesteps 25000 \
+    --num_envs 1024 \
+    --batch_size 4096 \
+    --eval_interval 5000 \
+    --render_interval 0 \
+    --project FastTD3_HoReKa
+status=$?
+
+# Propagate the training exit code so SLURM marks failed runs as FAILED
+echo "Job finished with exit code ${status} at $(date)"
+exit "${status}"
diff --git a/submit_job.py b/submit_job.py
new file mode 100755
index 0000000..938cff1
--- /dev/null
+++ b/submit_job.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python3
+"""
+Helper script to submit FastTD3 jobs to SLURM with proper wandb setup.
+"""
+import getpass
+import importlib.util
+import os
+import re
+import subprocess
+import sys
+
+def check_wandb_setup():
+    """Return True if the wandb package is installed in this environment.
+
+    Only looks the package up instead of starting a run, so no stray
+    offline-run directory is left behind for a later `wandb sync`.
+    """
+    if importlib.util.find_spec("wandb") is None:
+        print("✗ wandb setup issue: package 'wandb' is not installed")
+        return False
+    print("✓ wandb is properly installed")
+    return True
+
+def check_environment():
+    """Return True if the cwd holds the virtualenv and the training script."""
+    if not os.path.exists('.venv'):
+        print("✗ Virtual environment not found. Run from the FastTD3 directory.")
+        return False
+    if not os.path.exists('fast_td3/train.py'):
+        print("✗ FastTD3 training script not found. Run from the FastTD3 directory.")
+        return False
+    print("✓ Environment looks good")
+    return True
+
+def submit_job(script_path="run_fasttd3.slurm", use_wandb_online=False):
+    """Submit the SLURM script with sbatch; return True on success.
+
+    The script file is never modified. For online mode the API key is read
+    from $WANDB_API_KEY (or prompted for) and passed to the job through the
+    submission environment, so the secret is never written to disk.
+    """
+    try:
+        with open(script_path, encoding="utf-8") as f:
+            script = f.read()
+    except OSError:
+        print(f"✗ SLURM script {script_path} not found")
+        return False
+
+    print(f"Submitting job with script: {script_path}")
+
+    env = os.environ.copy()
+    if use_wandb_online:
+        api_key = env.get("WANDB_API_KEY", "").strip() or getpass.getpass(
+            "Enter your wandb API key (or press Enter to skip): ").strip()
+        if api_key:
+            env["WANDB_API_KEY"] = api_key
+            # Patch only the in-memory copy that is piped to sbatch.
+            script = script.replace("export WANDB_MODE=offline",
+                                    "export WANDB_MODE=online")
+            print("✓ Using wandb online mode")
+        else:
+            print("⚠ No wandb API key given; job will run in offline mode")
+
+    try:
+        # With no file argument sbatch reads the script from stdin; it
+        # exports the submitting environment to the job by default.
+        result = subprocess.run(['sbatch'], input=script, env=env,
+                                capture_output=True, text=True)
+    except FileNotFoundError:
+        print("✗ sbatch command not found. Are you on a SLURM cluster?")
+        return False
+    except OSError as e:
+        print(f"✗ Error submitting job: {e}")
+        return False
+
+    if result.returncode != 0:
+        print("✗ Job submission failed:")
+        print(result.stderr.strip())
+        return False
+
+    print("✓ Job submitted successfully:")
+    print(result.stdout.strip())
+    # "Submitted batch job 123 [on cluster X]": the job id is the first number.
+    match = re.search(r"\d+", result.stdout)
+    if match:
+        job_id = match.group()
+        print("\nTo monitor the job:")
+        print(f"  squeue -j {job_id}")
+        print(f"  tail -f fasttd3_{job_id}.out")
+    return True
+
+def main():
+    """Run the pre-flight checks, then submit the job interactively."""
+    print("FastTD3 Job Submission Helper")
+    print("=" * 30)
+
+    if not (check_environment() and check_wandb_setup()):
+        sys.exit(1)
+
+    use_online = input("Use wandb online mode? (y/N): ").lower().startswith('y')
+
+    if not submit_job(use_wandb_online=use_online):
+        print("\n❌ Job submission failed")
+        sys.exit(1)
+
+    print("\n🎉 Job submitted successfully!")
+    print("\nTips:")
+    print("- Check job status: squeue -u $USER")
+    print("- View output: tail -f fasttd3_${JOB_ID}.out")
+    print("- Cancel job: scancel ${JOB_ID}")
+    if not use_online:
+        print("- Job runs in wandb offline mode. "
+              "Sync later with: wandb sync wandb/offline-run-*")
+
+if __name__ == "__main__":
+    main()
diff --git a/test_setup.py b/test_setup.py
new file mode 100644
index 0000000..e3672d8
--- /dev/null
+++ b/test_setup.py
@@ -0,0 +1,94 @@
+#!/usr/bin/env python3
+"""
+Test script to verify FastTD3 setup is working correctly.
+This runs a minimal test to ensure all components are functioning.
+"""
+import importlib
+import sys
+import tempfile
+
+def test_basic_imports():
+    """Return True if every required package can be imported."""
+    print("Testing basic imports...")
+    modules = ("torch", "gymnasium", "wandb", "numpy", "tensordict",
+               "fast_td3.hyperparams")
+    ok = True
+    for name in modules:
+        try:
+            importlib.import_module(name)
+        except ImportError as e:
+            print(f"✗ Import error: {e}")
+            ok = False
+    if ok:
+        print("✓ All basic packages imported successfully")
+    return ok
+
+def test_gpu_availability():
+    """Return True if torch can see at least one CUDA device."""
+    print("Testing GPU availability...")
+    try:
+        import torch
+    except ImportError as e:
+        print(f"✗ Import error: {e}")
+        return False
+    if torch.cuda.is_available():
+        print(f"✓ CUDA available, {torch.cuda.device_count()} GPU(s) found")
+        print(f"  Current device: {torch.cuda.get_device_name(0)}")
+        return True
+    print("⚠ CUDA not available, will run on CPU")
+    return False
+
+def test_environment():
+    """Return True if a simple gymnasium environment can be created."""
+    print("Testing environment creation...")
+    try:
+        import gymnasium as gym
+        env = gym.make("Pendulum-v1")
+        env.reset()
+        print("✓ Environment created successfully")
+        print(f"  Observation space: {env.observation_space}")
+        print(f"  Action space: {env.action_space}")
+        env.close()
+        return True
+    except Exception as e:
+        print(f"✗ Environment creation failed: {e}")
+        return False
+
+def test_wandb_setup():
+    """Return True if wandb can start an offline run (no login needed)."""
+    print("Testing wandb setup...")
+    try:
+        import wandb
+        # Throwaway dir: keeps the probe run out of a later `wandb sync`.
+        with tempfile.TemporaryDirectory() as tmp:
+            wandb.init(project="test", mode="offline", dir=tmp)
+            wandb.finish()
+        print("✓ wandb can be initialized")
+        return True
+    except Exception as e:
+        print(f"✗ wandb setup failed: {e}")
+        return False
+
+def main():
+    """Run all checks; return True if every required check passed."""
+    print("FastTD3 Setup Test")
+    print("==================")
+
+    # (check, required): a missing GPU is only a warning, e.g. on login nodes.
+    tests = [(test_basic_imports, True), (test_gpu_availability, False),
+             (test_environment, True), (test_wandb_setup, True)]
+    failures = 0
+    for test, required in tests:
+        if not test() and required:
+            failures += 1
+        print()
+
+    print(f"Results: {len(tests) - failures}/{len(tests)} tests passed")
+    if failures == 0:
+        print("🎉 All tests passed! Setup looks good.")
+    else:
+        print("❌ Some tests failed. Check the output above.")
+    return failures == 0
+
+if __name__ == "__main__":
+    sys.exit(0 if main() else 1)