From 336c96bb7b8a5ff395884ca734fc4537bff6efdf Mon Sep 17 00:00:00 2001
From: "ys1087@partner.kit.edu" <ys1087@hkn1991.localdomain>
Date: Tue, 22 Jul 2025 16:15:30 +0200
Subject: [PATCH] Add HoReKa cluster support with SLURM and wandb integration

- Add complete HoReKa installation guide without conda dependency
- Include SLURM job script with GPU configuration and account setup
- Add helper scripts for job submission and environment testing
- Integrate wandb logging with both online and offline modes
- Support MuJoCo Playground environments for humanoid control
- Update README with clear separation of added vs original content
---
 README.md         |  97 ++++++++++++++++++++++++++++++++++++++
 run_fasttd3.slurm |  44 ++++++++++++++++++
 submit_job.py     | 116 ++++++++++++++++++++++++++++++++++++++++++++++
 test_setup.py     |  94 +++++++++++++++++++++++++++++++++++++
 4 files changed, 351 insertions(+)
 create mode 100644 run_fasttd3.slurm
 create mode 100755 submit_job.py
 create mode 100644 test_setup.py

diff --git a/README.md b/README.md
index 9ccadf7..af6aefe 100644
--- a/README.md
+++ b/README.md
@@ -8,6 +8,103 @@ FastTD3 is a high-performance variant of the Twin Delayed Deep Deterministic Pol
 
 For more information, please see our [project webpage](https://younggyo.me/fast_td3)
 
+## 🏔️ HoReKa Cluster Setup
+*Added by Dominik - Custom setup for HoReKa supercomputer*
+
+**Quick Setup for HoReKa Users:**
+
+This repository includes optimized scripts for running FastTD3 on the HoReKa supercomputer cluster with SLURM job scheduling and wandb logging.
+
+### Installation on HoReKa
+
+```bash
+# Clone the repository
+git clone https://github.com/younggyoseo/FastTD3.git
+cd FastTD3
+
+# Install Python 3.10 locally (HoReKa doesn't provide conda)
+mkdir -p $HOME/.local/python-3.10
+cd /tmp
+curl -O https://www.python.org/ftp/python/3.10.14/Python-3.10.14.tgz
+tar -xzf Python-3.10.14.tgz
+cd Python-3.10.14
+./configure --prefix=$HOME/.local/python-3.10 --enable-optimizations --with-ensurepip=install
+make -j$(nproc)
+make install
+
+# Add to PATH
+echo 'export PATH="$HOME/.local/python-3.10/bin:$PATH"' >> ~/.bashrc
+echo 'export PATH="$HOME/.local/python-3.10/bin:$PATH"' >> ~/.zshrc
+export PATH="$HOME/.local/python-3.10/bin:$PATH"
+
+# Go back to the FastTD3 directory (replace the path with where you cloned it)
+cd $HOME/path/to/FastTD3
+
+# Create virtual environment and install dependencies
+$HOME/.local/python-3.10/bin/python3.10 -m venv .venv
+source .venv/bin/activate
+pip install --upgrade pip
+pip install -r requirements/requirements.txt
+pip install git+https://github.com/younggyoseo/mujoco_playground.git
+
+# Test installation
+python test_setup.py
+```
+
+### Running on HoReKa
+
+**Easy submission:**
+```bash
+python submit_job.py
+```
+
+**Manual submission:**
+```bash
+sbatch run_fasttd3.slurm
+```
+
+**Monitor jobs:**
+```bash
+# Check job status
+squeue -u $USER
+
+# View output
+tail -f fasttd3_<job_id>.out
+
+# Cancel job if needed
+scancel <job_id>
+```
+
+### Configuration
+
+The setup includes:
+- **SLURM script** (`run_fasttd3.slurm`) configured for accelerated partition with GPU
+- **Job helper** (`submit_job.py`) for easy job submission with wandb setup
+- **Test script** (`test_setup.py`) for environment verification
+- **MuJoCo Playground environment** (`T1JoystickFlatTerrain`) for humanoid control
+- **Automatic GPU detection** and CUDA 12.4 compatibility
+- **Wandb logging** with offline mode support
+
+### Wandb Integration
+
+The scripts support both online and offline wandb logging:
+
+**Online mode:**
+```bash
+export WANDB_API_KEY=your_api_key_here
+python submit_job.py
+# Select 'y' when prompted for online mode
+```
+
+**Offline mode (default):**
+```bash
+# Jobs run in offline mode by default
+# Sync later with: wandb sync <run_directory>
+```
+
+---
+
+# ORIGINAL README:
 
 ## ❗ Updates
 
diff --git a/run_fasttd3.slurm b/run_fasttd3.slurm
new file mode 100644
index 0000000..1f59e97
--- /dev/null
+++ b/run_fasttd3.slurm
@@ -0,0 +1,44 @@
+#!/bin/bash
+#SBATCH --job-name=fasttd3_test
+#SBATCH --account=hk-project-p0022232
+#SBATCH --partition=accelerated
+#SBATCH --time=02:00:00
+#SBATCH --gres=gpu:1
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=8
+#SBATCH --mem=32G
+#SBATCH --output=fasttd3_%j.out
+#SBATCH --error=fasttd3_%j.err
+
+# Load necessary modules
+module purge
+module load toolkit/CUDA/12.4
+
+# Navigate to the project directory; abort rather than run from the wrong place
+cd "$SLURM_SUBMIT_DIR" || exit 1
+
+# Activate the virtual environment; abort if it cannot be activated
+source .venv/bin/activate || exit 1
+
+# SLURM already sets CUDA_VISIBLE_DEVICES for the GPU granted via --gres;
+# overriding it with the task ID could hide or mis-select that GPU.
+export JAX_PLATFORMS="gpu,cpu"
+
+# Ensure wandb is logged in (set WANDB_API_KEY environment variable)
+# export WANDB_API_KEY=your_api_key_here
+# For testing, use offline mode
+export WANDB_MODE=offline
+
+# Run FastTD3 training with MuJoCo Playground environment
+python fast_td3/train.py \
+    --env_name T1JoystickFlatTerrain \
+    --exp_name FastTD3_HoReKa_Test \
+    --seed 42 \
+    --total_timesteps 25000 \
+    --num_envs 1024 \
+    --batch_size 4096 \
+    --eval_interval 5000 \
+    --render_interval 0 \
+    --project FastTD3_HoReKa || exit
+# Reached only on success: the bare `exit` above propagates a training failure
+echo "Job completed at $(date)"
\ No newline at end of file
diff --git a/submit_job.py b/submit_job.py
new file mode 100755
index 0000000..938cff1
--- /dev/null
+++ b/submit_job.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python3
+"""
+Helper script to submit FastTD3 jobs to SLURM with proper wandb setup.
+"""
+import os
+import subprocess
+import sys
+
+def check_wandb_setup():
+    """Check if wandb is properly configured."""
+    try:
+        import wandb
+        # Try to initialize in offline mode to test setup
+        wandb.init(mode="offline")
+        wandb.finish()
+        print("✓ wandb is properly installed")
+        return True
+    except Exception as e:
+        print(f"✗ wandb setup issue: {e}")
+        return False
+
+def check_environment():
+    """Check if we're in the right environment and directory."""
+    if not os.path.exists('.venv'):
+        print("✗ Virtual environment not found. Run from the FastTD3 directory.")
+        return False
+    
+    if not os.path.exists('fast_td3/train.py'):
+        print("✗ FastTD3 training script not found. Run from the FastTD3 directory.")
+        return False
+    
+    print("✓ Environment looks good")
+    return True
+
+def submit_job(script_path="run_fasttd3.slurm", use_wandb_online=False):
+    """Submit the SLURM job."""
+    if not os.path.exists(script_path):
+        print(f"✗ SLURM script {script_path} not found")
+        return False
+    
+    print(f"Submitting job with script: {script_path}")
+    
+    # If using online wandb, prompt for API key
+    if use_wandb_online:
+        api_key = input("Enter your wandb API key (or press Enter to skip): ").strip()
+        if api_key:
+            # Update the script to set the API key
+            with open(script_path, 'r') as f:
+                content = f.read()
+            
+            # Replace the commented API key line
+            content = content.replace(
+                "# export WANDB_API_KEY=your_api_key_here",
+                f"export WANDB_API_KEY={api_key}"
+            )
+            # Remove offline mode
+            content = content.replace(
+                "export WANDB_MODE=offline",
+                "# export WANDB_MODE=offline  # Using online mode"
+            )
+            
+            with open(script_path, 'w') as f:
+                f.write(content)
+            print("✓ Updated script with wandb API key")
+    
+    try:
+        result = subprocess.run(['sbatch', script_path], 
+                              capture_output=True, text=True)
+        if result.returncode == 0:
+            print(f"✓ Job submitted successfully:")
+            print(result.stdout.strip())
+            job_id = result.stdout.strip().split()[-1]
+            print(f"\nTo monitor the job:")
+            print(f"  squeue -j {job_id}")
+            print(f"  tail -f fasttd3_{job_id}.out")
+            return True
+        else:
+            print(f"✗ Job submission failed:")
+            print(result.stderr.strip())
+            return False
+    except FileNotFoundError:
+        print("✗ sbatch command not found. Are you on a SLURM cluster?")
+        return False
+    except Exception as e:
+        print(f"✗ Error submitting job: {e}")
+        return False
+
+def main():
+    print("FastTD3 Job Submission Helper")
+    print("=" * 30)
+    
+    # Check environment
+    if not check_environment():
+        sys.exit(1)
+    
+    if not check_wandb_setup():
+        sys.exit(1)
+    
+    # Ask user about wandb mode
+    use_online = input("Use wandb online mode? (y/N): ").lower().startswith('y')
+    
+    # Submit job
+    if submit_job(use_wandb_online=use_online):
+        print("\n🎉 Job submitted successfully!")
+        print("\nTips:")
+        print("- Check job status: squeue -u $USER")
+        print("- View output: tail -f fasttd3_<jobid>.out")
+        print("- Cancel job: scancel <jobid>")
+        if not use_online:
+            print("- Job runs in wandb offline mode. Sync later with: wandb sync <run_dir>")
+    else:
+        print("\n❌ Job submission failed")
+        sys.exit(1)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/test_setup.py b/test_setup.py
new file mode 100644
index 0000000..e3672d8
--- /dev/null
+++ b/test_setup.py
@@ -0,0 +1,94 @@
+#!/usr/bin/env python3
+"""
+Test script to verify FastTD3 setup is working correctly.
+This runs a minimal test to ensure all components are functioning.
+"""
+import os
+import torch
+import gymnasium as gym
+import wandb
+from fast_td3.hyperparams import get_args
+
+def test_basic_imports():
+    """Test that all required packages can be imported."""
+    print("Testing basic imports...")
+    try:
+        import torch
+        import gymnasium as gym
+        import wandb
+        import numpy as np
+        import tensordict
+        print("✓ All basic packages imported successfully")
+        return True
+    except ImportError as e:
+        print(f"✗ Import error: {e}")
+        return False
+
+def test_gpu_availability():
+    """Test GPU availability."""
+    print("Testing GPU availability...")
+    if torch.cuda.is_available():
+        print(f"✓ CUDA available, {torch.cuda.device_count()} GPU(s) found")
+        print(f"  Current device: {torch.cuda.get_device_name(0)}")
+        return True
+    else:
+        print("⚠ CUDA not available, will run on CPU")
+        return False
+
+def test_environment():
+    """Test that we can create a simple environment."""
+    print("Testing environment creation...")
+    try:
+        env = gym.make("Pendulum-v1")
+        obs, info = env.reset()
+        print(f"✓ Environment created successfully")
+        print(f"  Observation space: {env.observation_space}")
+        print(f"  Action space: {env.action_space}")
+        env.close()
+        return True
+    except Exception as e:
+        print(f"✗ Environment creation failed: {e}")
+        return False
+
+def test_wandb_setup():
+    """Test wandb setup (without actual login)."""
+    print("Testing wandb setup...")
+    try:
+        # Just test that wandb can be initialized in offline mode
+        os.environ["WANDB_MODE"] = "offline"
+        wandb.init(project="test", mode="offline")
+        wandb.finish()
+        print("✓ wandb can be initialized")
+        return True
+    except Exception as e:
+        print(f"✗ wandb setup failed: {e}")
+        return False
+
+def main():
+    print("FastTD3 Setup Test")
+    print("==================")
+    
+    tests = [
+        test_basic_imports,
+        test_gpu_availability,
+        test_environment,
+        test_wandb_setup,
+    ]
+    
+    passed = 0
+    for test in tests:
+        if test():
+            passed += 1
+        print()
+    
+    print(f"Results: {passed}/{len(tests)} tests passed")
+    
+    if passed == len(tests):
+        print("🎉 All tests passed! Setup looks good.")
+        return True
+    else:
+        print("❌ Some tests failed. Check the output above.")
+        return False
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file