Commit summary:
- Fixed JAX/PyTorch dtype mismatch for successful training
- Added experiment plan with paper-accurate hyperparameters
- Created batch submission and monitoring scripts
- Cleaned up log files and updated .gitignore
- Ready for systematic paper replication

File: 225 lines, 7.4 KiB, Python, executable
#!/usr/bin/env python3
|
|
"""
|
|
Batch experiment submission script for FastTD3 paper replication.
|
|
Usage: python submit_experiment_batch.py --phase 1 --tasks all --seeds 3
|
|
"""
|
|
|
|
import os
|
|
import subprocess
|
|
import argparse
|
|
import time
|
|
import yaml
|
|
from pathlib import Path
|
|
|
|
# Experiment configuration based on paper
|
|
# Per-phase hyperparameters and SLURM resource requests for the replication runs.
# Each phase maps to one benchmark suite; keys are consumed by create_job_script().
EXPERIMENT_CONFIG = {
    # Phase 1: MuJoCo Playground joystick locomotion tasks (T1/G1 humanoids).
    "phase1_mujoco": {
        "tasks": [
            "T1JoystickFlatTerrain",
            "T1JoystickRoughTerrain",
            "G1JoystickFlatTerrain",
            "G1JoystickRoughTerrain"
        ],
        "total_timesteps": 500000,
        "num_envs": 2048,          # parallel simulation environments
        "batch_size": 32768,
        "buffer_size": 102400,     # 50 transitions per env x 2048 envs
        "eval_interval": 25000,    # timesteps between evaluations
        "time_limit": "04:00:00",  # SLURM wall-clock limit
        "mem": "64G"               # SLURM memory request
    },
    # Phase 2: IsaacLab velocity / dexterous manipulation tasks.
    "phase2_isaaclab": {
        "tasks": [
            "Isaac-Velocity-Flat-G1-v0",
            "Isaac-Velocity-Rough-G1-v0",
            "Isaac-Repose-Cube-Allegro-Direct-v0",
            "Isaac-Repose-Cube-Shadow-Direct-v0",
            "Isaac-Velocity-Flat-H1-v0",
            "Isaac-Velocity-Rough-H1-v0"
        ],
        "total_timesteps": 1000000,
        "num_envs": 1024,
        "batch_size": 32768,
        "buffer_size": 51200,      # 50 transitions per env x 1024 envs
        "eval_interval": 50000,
        "time_limit": "04:00:00",
        "mem": "64G"
    },
    # Phase 3: HumanoidBench h1hand locomotion tasks (slowest simulator,
    # hence fewer envs, more timesteps, and a longer wall-clock limit).
    "phase3_humanoidbench": {
        "tasks": [
            "h1hand-walk",
            "h1hand-run",
            "h1hand-hurdle",
            "h1hand-stair",
            "h1hand-slide"
        ],
        "total_timesteps": 2000000,
        "num_envs": 256,
        "batch_size": 16384,
        "buffer_size": 12800,      # 50 transitions per env x 256 envs
        "eval_interval": 100000,
        "time_limit": "12:00:00",  # 12 hours for HumanoidBench
        "mem": "64G"
    }
}
|
|
|
|
def create_job_script(task, config, seed, phase):
    """Render a SLURM batch script for one (task, seed) training run.

    Args:
        task: Environment/task name (may contain dashes).
        config: Phase config dict; must provide "time_limit", "mem",
            "total_timesteps", "num_envs", "batch_size", "buffer_size",
            and "eval_interval".
        seed: Random seed for this run.
        phase: Phase key (e.g. "phase1_mujoco"); used in job/exp names.

    Returns:
        The complete SLURM script as a string.
    """
    # Dashes are not friendly in job/file names; compute the tag once and
    # reuse it for the job name and the log-file paths.
    task_id = task.replace("-", "_")
    job_tag = f"fasttd3_{phase}_{task_id}_s{seed}"

    # NOTE(review): CUDA_VISIBLE_DEVICES=$SLURM_LOCALID assumes single-task
    # jobs where SLURM_LOCALID is 0 — confirm this matches the cluster's
    # GPU-binding setup.
    return f'''#!/bin/bash
#SBATCH --job-name={job_tag}
#SBATCH --account=hk-project-p0022232
#SBATCH --partition=accelerated
#SBATCH --time={config["time_limit"]}
#SBATCH --gres=gpu:1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=8
#SBATCH --mem={config["mem"]}
#SBATCH --output=logs/{job_tag}_%j.out
#SBATCH --error=logs/{job_tag}_%j.err

# Load necessary modules
module purge
module load devel/cuda/12.4
module load compiler/intel/2025.1_llvm

# Navigate to the project directory
cd $SLURM_SUBMIT_DIR

# Activate the virtual environment
source .venv/bin/activate

# Set environment variables
export CUDA_VISIBLE_DEVICES=$SLURM_LOCALID
export JAX_PLATFORMS="cuda"
export WANDB_MODE=online

echo "Starting FastTD3 {phase} training at $(date)"
echo "Task: {task}"
echo "Seed: {seed}"
echo "GPU: $CUDA_VISIBLE_DEVICES"
echo "Node: $(hostname)"

# Run FastTD3 training
python fast_td3/train.py \\
    --env_name {task} \\
    --exp_name FastTD3_{phase}_{task_id} \\
    --seed {seed} \\
    --total_timesteps {config["total_timesteps"]} \\
    --num_envs {config["num_envs"]} \\
    --batch_size {config["batch_size"]} \\
    --buffer_size {config["buffer_size"]} \\
    --eval_interval {config["eval_interval"]} \\
    --render_interval 0 \\
    --project FastTD3_HoReKa_{phase.title()}

echo "Job completed at $(date)"
'''
|
|
|
|
def submit_job(script_path, dry_run=False):
    """Submit a SLURM job script via ``sbatch`` and return its job ID.

    Args:
        script_path: Path to the .slurm script to submit.
        dry_run: If True, print what would be submitted without calling sbatch.

    Returns:
        The job ID string on success ("12345" placeholder in dry-run mode),
        or None when submission fails.
    """
    if dry_run:
        print(f"[DRY RUN] Would submit: {script_path}")
        return "12345"  # Fake job ID

    try:
        result = subprocess.run(['sbatch', script_path],
                                capture_output=True, text=True, check=True)
        # sbatch prints "Submitted batch job <id>"; the ID is the last token.
        job_id = result.stdout.strip().split()[-1]
        print(f"✅ Submitted {script_path} -> Job ID: {job_id}")
        return job_id
    except subprocess.CalledProcessError as e:
        print(f"❌ Failed to submit {script_path}: {e.stderr}")
        return None
    except FileNotFoundError:
        # sbatch itself is missing (e.g. not running on a SLURM login node);
        # report and fall through to the same None-on-failure contract.
        print(f"❌ Failed to submit {script_path}: sbatch not found")
        return None
|
|
|
|
def main():
    """Parse CLI arguments, generate SLURM scripts, and submit the batch.

    For each selected phase/task/seed combination a .slurm script is written
    under scripts/ and submitted via submit_job(); submitted jobs are recorded
    to a timestamped YAML tracking file for monitor_experiments.py.
    """
    parser = argparse.ArgumentParser(description='Submit FastTD3 experiment batch')
    parser.add_argument('--phase', type=str, choices=['1', '2', '3', 'all'],
                        default='1', help='Experiment phase to run')
    parser.add_argument('--tasks', type=str, default='all',
                        help='Comma-separated task names or "all"')
    parser.add_argument('--seeds', type=int, default=3,
                        help='Number of random seeds to run')
    parser.add_argument('--dry-run', action='store_true',
                        help='Print commands without executing')
    parser.add_argument('--delay', type=int, default=5,
                        help='Delay between job submissions (seconds)')

    args = parser.parse_args()

    # Create output directories for SLURM logs and generated job scripts.
    os.makedirs('logs', exist_ok=True)
    os.makedirs('scripts', exist_ok=True)

    # Determine which phases to run.
    if args.phase == 'all':
        phases = ['phase1_mujoco', 'phase2_isaaclab', 'phase3_humanoidbench']
    else:
        phase_map = {'1': 'phase1_mujoco', '2': 'phase2_isaaclab', '3': 'phase3_humanoidbench'}
        phases = [phase_map[args.phase]]

    submitted_jobs = []

    for phase in phases:
        config = EXPERIMENT_CONFIG[phase]

        # Determine tasks to run.
        if args.tasks == 'all':
            tasks = config['tasks']
        else:
            tasks = [t.strip() for t in args.tasks.split(',')]
            # Validate tasks exist in config; a typo skips the whole phase.
            invalid = set(tasks) - set(config['tasks'])
            if invalid:
                print(f"❌ Invalid tasks for {phase}: {invalid}")
                continue

        # Fixed: was "\\n" (printed a literal backslash-n instead of a newline).
        print(f"\n🚀 Starting {phase} with tasks: {tasks}")
        print(f"   Seeds: {list(range(1, args.seeds + 1))}")

        for task in tasks:
            for seed in range(1, args.seeds + 1):
                # Create the job script on disk.
                script_content = create_job_script(task, config, seed, phase)
                script_name = f"scripts/fasttd3_{phase}_{task.replace('-', '_')}_s{seed}.slurm"

                with open(script_name, 'w') as f:
                    f.write(script_content)

                # Submit job; submit_job returns None on failure.
                job_id = submit_job(script_name, args.dry_run)
                if job_id:
                    submitted_jobs.append({
                        'job_id': job_id,
                        'phase': phase,
                        'task': task,
                        'seed': seed,
                        'script': script_name
                    })

                # Delay between submissions to avoid overwhelming the scheduler.
                if not args.dry_run and args.delay > 0:
                    time.sleep(args.delay)

    # Summary (newlines fixed here as well).
    print(f"\n📊 Submission Summary:")
    print(f"   Total jobs submitted: {len(submitted_jobs)}")

    if submitted_jobs:
        # Save job tracking info for later monitoring.
        tracking_file = f"experiment_tracking_{int(time.time())}.yaml"
        with open(tracking_file, 'w') as f:
            yaml.dump({
                'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
                'jobs': submitted_jobs
            }, f, default_flow_style=False)

        print(f"   Job tracking saved to: {tracking_file}")
        print(f"\n💡 Monitor progress with:")
        print(f"   squeue -u $USER")
        print(f"   python monitor_experiments.py {tracking_file}")
|
|
|
|
# Run the submitter only when executed as a script (not when imported).
if __name__ == "__main__":
    main()