Add HoReKa cluster support with SLURM and wandb integration

- Add complete HoReKa installation guide without conda dependency - Include SLURM job script with GPU configuration and account setup - Add helper scripts for job submission and environment testing - Integrate wandb logging with both online and offline modes - Support MuJoCo Playground environments for humanoid control - Update README with clear separation of added vs original content
2025-07-22 16:15:30 +02:00 · 2025-07-22 16:15:30 +02:00 · 336c96bb7b
commit 336c96bb7b
parent 51c55d4a8a
4 changed files with 351 additions and 0 deletions
--- a/README.md
+++ b/README.md
@ -8,6 +8,103 @@ FastTD3 is a high-performance variant of the Twin Delayed Deep Deterministic Pol
 For more information, please see our [project webpage](https://younggyo.me/fast_td3)
 ## 🏔️ HoReKa Cluster Setup
 *Added by Dominik: custom setup for the HoReKa supercomputer*
 **Quick Setup for HoReKa Users:**
 This repository includes optimized scripts for running FastTD3 on the HoReKa supercomputer cluster with SLURM job scheduling and wandb logging.
 ### Installation on HoReKa
 ```bash
 # Clone the repository
 git clone https://github.com/younggyoseo/FastTD3.git
 cd FastTD3
 # Install Python 3.10 locally (HoReKa doesn't provide conda)
 mkdir -p $HOME/.local/python-3.10
 cd /tmp
 curl -O https://www.python.org/ftp/python/3.10.14/Python-3.10.14.tgz
 tar -xzf Python-3.10.14.tgz
 cd Python-3.10.14
 ./configure --prefix=$HOME/.local/python-3.10 --enable-optimizations --with-ensurepip=install
 make -j$(nproc)
 make install
 # Add to PATH
 echo 'export PATH="$HOME/.local/python-3.10/bin:$PATH"' >> ~/.bashrc
 echo 'export PATH="$HOME/.local/python-3.10/bin:$PATH"' >> ~/.zshrc
 export PATH="$HOME/.local/python-3.10/bin:$PATH"
 # Go back to the FastTD3 directory (replace the path below with wherever you cloned the repository)
 cd $HOME/path/to/FastTD3
 # Create virtual environment and install dependencies
 $HOME/.local/python-3.10/bin/python3.10 -m venv .venv
 source .venv/bin/activate
 pip install --upgrade pip
 pip install -r requirements/requirements.txt
 pip install git+https://github.com/younggyoseo/mujoco_playground.git
 # Test installation
 python test_setup.py
 ```
 ### Running on HoReKa
 **Easy submission:**
 ```bash
 python submit_job.py
 ```
 **Manual submission:**
 ```bash
 sbatch run_fasttd3.slurm
 ```
 **Monitor jobs:**
 ```bash
 # Check job status
 squeue -u $USER
 # View output
 tail -f fasttd3_<job_id>.out
 # Cancel job if needed
 scancel <job_id>
 ```
 ### Configuration
 The setup includes:
 - **SLURM script** (`run_fasttd3.slurm`) configured for the `accelerated` partition with one GPU
 - **Job helper** (`submit_job.py`) for easy job submission with wandb setup
 - **Test script** (`test_setup.py`) for environment verification
 - **MuJoCo Playground environment** (`T1JoystickFlatTerrain`) for humanoid control
 - **Automatic GPU detection** and CUDA 12.4 compatibility
 - **Wandb logging** with offline mode support
 ### Wandb Integration
 The scripts support both online and offline wandb logging:
 **Online mode:**
 ```bash
 export WANDB_API_KEY=your_api_key_here
 python submit_job.py
 # Select 'y' when prompted for online mode
 ```
 **Offline mode (default):**
 ```bash
 # Jobs run in offline mode by default
 # Sync later with: wandb sync <run_directory>
 ```
 ---
 # ORIGINAL README:
 ## ❗ Updates
--- a/run_fasttd3.slurm
+++ b/run_fasttd3.slurm
@ -0,0 +1,44 @@
 #!/bin/bash
 # SLURM batch script: runs a short FastTD3 training job on one GPU.
 #SBATCH --job-name=fasttd3_test
 #SBATCH --account=hk-project-p0022232
 #SBATCH --partition=accelerated
 #SBATCH --time=02:00:00
 #SBATCH --gres=gpu:1
 #SBATCH --ntasks=1
 #SBATCH --cpus-per-task=8
 #SBATCH --mem=32G
 #SBATCH --output=fasttd3_%j.out
 #SBATCH --error=fasttd3_%j.err
 # Load necessary modules
 module purge
 module load toolkit/CUDA/12.4
 # Navigate to the project directory (quoted so paths with spaces work; abort if it is gone)
 cd "$SLURM_SUBMIT_DIR" || { echo "Cannot cd to submit directory: $SLURM_SUBMIT_DIR" >&2; exit 1; }
 # Activate the virtual environment; without it the wrong interpreter would be used
 source .venv/bin/activate || { echo "Cannot activate .venv in $PWD" >&2; exit 1; }
 # Set environment variables for proper GPU usage.
 # CUDA_VISIBLE_DEVICES is deliberately NOT overridden here: SLURM already sets it
 # for the GPU granted through --gres, and replacing it with $SLURM_LOCALID could
 # point the job at a device that was not allocated to it.
 export JAX_PLATFORMS="gpu,cpu"
 # wandb configuration. submit_job.py edits this section by exact text match,
 # so keep the two wandb lines below verbatim.
 # Ensure wandb is logged in (set WANDB_API_KEY environment variable)
 # export WANDB_API_KEY=your_api_key_here
 # For testing, use offline mode
 export WANDB_MODE=offline
 # Run FastTD3 training with MuJoCo Playground environment
 python fast_td3/train.py \
    --env_name T1JoystickFlatTerrain \
    --exp_name FastTD3_HoReKa_Test \
    --seed 42 \
    --total_timesteps 25000 \
    --num_envs 1024 \
    --batch_size 4096 \
    --eval_interval 5000 \
    --render_interval 0 \
    --project FastTD3_HoReKa
 # Propagate the training exit status so SLURM marks a failed run as FAILED
 status=$?
 echo "Job finished at $(date) with exit status $status"
 exit "$status"
--- a/submit_job.py
+++ b/submit_job.py
@ -0,0 +1,116 @@
 #!/usr/bin/env python3
 """
 Helper script to submit FastTD3 jobs to SLURM with proper wandb setup.
 """
 import os
 import subprocess
 import sys
 def check_wandb_setup():
    """Report whether wandb can be imported and can start an offline run.

    Returns:
        True when the import and the offline init/finish cycle succeed,
        False (after printing the error) when any of them raises.
    """
    try:
        import wandb

        # Initialize in offline mode purely as a smoke test of the setup
        wandb.init(mode="offline")
        wandb.finish()
    except Exception as err:
        print(f"✗ wandb setup issue: {err}")
        return False
    else:
        print("✓ wandb is properly installed")
        return True
 def check_environment():
    """Verify the current directory looks like a prepared FastTD3 checkout.

    Both a ``.venv`` entry and ``fast_td3/train.py`` must exist relative to
    the current working directory.

    Returns:
        True when both paths exist; False after printing which one is missing.
    """
    # Checked in order; the first missing path decides the message
    requirements = (
        ('.venv', "✗ Virtual environment not found. Run from the FastTD3 directory."),
        ('fast_td3/train.py', "✗ FastTD3 training script not found. Run from the FastTD3 directory."),
    )
    for required_path, complaint in requirements:
        if not os.path.exists(required_path):
            print(complaint)
            return False
    print("✓ Environment looks good")
    return True
 def submit_job(script_path="run_fasttd3.slurm", use_wandb_online=False):
    """Submit the SLURM job.

    When online wandb logging is requested, the API key is read from the
    ``WANDB_API_KEY`` environment variable or, if that is empty, prompted
    for. The key is passed to ``sbatch`` through the process environment and
    the offline-mode line is disabled only in an in-memory copy of the script
    that is piped to ``sbatch`` on stdin. The script on disk is never
    modified, so the key is not persisted and later offline submissions are
    unaffected.

    Args:
        script_path: Path of the SLURM batch script to submit.
        use_wandb_online: Request online wandb logging for this submission.

    Returns:
        True if ``sbatch`` accepted the job, False otherwise (script missing,
        ``sbatch`` unavailable, or non-zero exit status).
    """
    import re

    if not os.path.exists(script_path):
        print(f"✗ SLURM script {script_path} not found")
        return False
    print(f"Submitting job with script: {script_path}")

    command = ['sbatch', script_path]
    run_options = {}
    if use_wandb_online:
        # Prefer a key that is already exported; only prompt when there is none
        api_key = os.environ.get("WANDB_API_KEY", "").strip()
        if not api_key:
            api_key = input("Enter your wandb API key (or press Enter to skip): ").strip()
        if api_key:
            with open(script_path, 'r') as f:
                content = f.read()
            # Disable offline mode in the submitted copy only
            content = content.replace(
                "export WANDB_MODE=offline",
                "# export WANDB_MODE=offline  # Using online mode"
            )
            # sbatch reads the script from stdin when no file is given and, by
            # default, exports the submitting environment to the job.
            command = ['sbatch']
            run_options = {
                'input': content,
                'env': {**os.environ, 'WANDB_API_KEY': api_key, 'WANDB_MODE': 'online'},
            }
            print("✓ Using wandb online mode for this submission")
        else:
            print("⚠ No wandb API key provided; job will run in wandb offline mode")

    try:
        result = subprocess.run(command, capture_output=True, text=True,
                                **run_options)
    except FileNotFoundError:
        print("✗ sbatch command not found. Are you on a SLURM cluster?")
        return False
    except Exception as e:
        print(f"✗ Error submitting job: {e}")
        return False

    if result.returncode != 0:
        print(f"✗ Job submission failed:")
        print(result.stderr.strip())
        return False

    output = result.stdout.strip()
    print(f"✓ Job submitted successfully:")
    print(output)
    # Parse the id from sbatch's "Submitted batch job <id>" line; fall back to
    # a placeholder so unexpected output never turns a success into a failure.
    id_match = re.search(r"Submitted batch job (\d+)", output)
    job_id = id_match.group(1) if id_match else "<job_id>"
    print(f"\nTo monitor the job:")
    print(f"  squeue -j {job_id}")
    print(f"  tail -f fasttd3_{job_id}.out")
    return True
 def main():
    """Run the preflight checks, ask for the wandb mode, and submit the job.

    Exits the process with status 1 if a preflight check or the submission
    fails.
    """
    print("FastTD3 Job Submission Helper")
    print("=" * 30)

    # Preflight checks run in order; the first failure aborts
    for preflight in (check_environment, check_wandb_setup):
        if not preflight():
            sys.exit(1)

    # Any answer starting with "y" (case-insensitive) selects online mode
    answer = input("Use wandb online mode? (y/N): ")
    use_online = answer.lower().startswith('y')

    if not submit_job(use_wandb_online=use_online):
        print("\n❌ Job submission failed")
        sys.exit(1)

    print("\n🎉 Job submitted successfully!")
    print("\nTips:")
    print("- Check job status: squeue -u $USER")
    print("- View output: tail -f fasttd3_<jobid>.out")
    print("- Cancel job: scancel <jobid>")
    if not use_online:
        print("- Job runs in wandb offline mode. Sync later with: wandb sync <run_dir>")
 if __name__ == "__main__":
    main()
--- a/test_setup.py
+++ b/test_setup.py
@ -0,0 +1,94 @@
 #!/usr/bin/env python3
 """
 Test script to verify FastTD3 setup is working correctly.
 This runs a minimal test to ensure all components are functioning.
 """
 import os
 import torch
 import gymnasium as gym
 import wandb
 from fast_td3.hyperparams import get_args
 def test_basic_imports():
    """Check that the required packages can be imported.

    Returns:
        True when every import succeeds; False after printing the
        ImportError raised by the first package that fails.
    """
    print("Testing basic imports...")
    try:
        import torch
        import gymnasium as gym
        import wandb
        import numpy as np
        import tensordict
    except ImportError as err:
        print(f"✗ Import error: {err}")
        return False
    print("✓ All basic packages imported successfully")
    return True
 def test_gpu_availability():
    """Report whether torch can see a CUDA device.

    Returns:
        True when ``torch.cuda.is_available()`` is true (after printing the
        device count and the name of device 0), False otherwise.
    """
    print("Testing GPU availability...")
    if not torch.cuda.is_available():
        print("⚠ CUDA not available, will run on CPU")
        return False
    gpu_count = torch.cuda.device_count()
    first_gpu_name = torch.cuda.get_device_name(0)
    print(f"✓ CUDA available, {gpu_count} GPU(s) found")
    print(f"  Current device: {first_gpu_name}")
    return True
 def test_environment():
    """Check that a simple gymnasium environment can be created and reset.

    Returns:
        True when ``Pendulum-v1`` is created, reset, described and closed
        without error; False after printing the exception otherwise.
    """
    print("Testing environment creation...")
    try:
        pendulum = gym.make("Pendulum-v1")
        # reset() is expected to yield an (observation, info) pair
        _observation, _info = pendulum.reset()
        print("✓ Environment created successfully")
        print(f"  Observation space: {pendulum.observation_space}")
        print(f"  Action space: {pendulum.action_space}")
        pendulum.close()
    except Exception as err:
        print(f"✗ Environment creation failed: {err}")
        return False
    return True
 def test_wandb_setup():
    """Check that wandb can start and finish a run in offline mode.

    Sets ``WANDB_MODE=offline`` in this process's environment as a side
    effect.

    Returns:
        True when the init/finish cycle succeeds; False after printing the
        exception otherwise.
    """
    print("Testing wandb setup...")
    try:
        # Offline mode is forced via both the environment and the init argument
        os.environ["WANDB_MODE"] = "offline"
        wandb.init(project="test", mode="offline")
        wandb.finish()
    except Exception as err:
        print(f"✗ wandb setup failed: {err}")
        return False
    print("✓ wandb can be initialized")
    return True
 def main():
    """Run every setup check and print a summary.

    Each check is a zero-argument callable returning a truthy value on
    success. A blank line is printed after each check.

    Returns:
        True when all checks passed, False otherwise.
    """
    print("FastTD3 Setup Test")
    print("==================")
    tests = [
        test_basic_imports,
        test_gpu_availability,
        test_environment,
        test_wandb_setup,
    ]
    passed = 0
    for test in tests:
        if test():
            passed += 1
        print()
    print(f"Results: {passed}/{len(tests)} tests passed")
    if passed == len(tests):
        print("🎉 All tests passed! Setup looks good.")
        return True
    print("❌ Some tests failed. Check the output above.")
    return False
 if __name__ == "__main__":
    # Turn the result into the process exit status so shell callers can detect failures
    raise SystemExit(0 if main() else 1)