FastTD3/submit_experiment_batch.py
2025-07-29 19:43:31 +02:00

227 lines
7.4 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Batch experiment submission script for FastTD3 paper replication.
Usage: python submit_experiment_batch.py --phase 1 --tasks all --seeds 3
"""
import os
import subprocess
import argparse
import time
import yaml
from pathlib import Path
# Experiment configuration based on paper
# Per-phase hyperparameters and SLURM resource requests.
# Keys: phase identifier -> dict consumed by create_job_script()/main().
EXPERIMENT_CONFIG = {
    "phase1_mujoco": {
        # MuJoCo Playground humanoid joystick tasks.
        "tasks": [
            "T1JoystickFlatTerrain",
            "T1JoystickRoughTerrain",
            "G1JoystickFlatTerrain",
            "G1JoystickRoughTerrain"
        ],
        "total_timesteps": 500000,
        "num_envs": 2048,
        "batch_size": 32768,
        # NOTE(review): original comment said "50K per env for 2048 envs",
        # but 102400 / 2048 = 50 per env — confirm intended semantics
        # (per-env vs total) against fast_td3/train.py's --buffer_size.
        "buffer_size": 102400,
        "eval_interval": 25000,
        "time_limit": "04:00:00",  # SLURM wall-clock limit (HH:MM:SS)
        "mem": "64G"               # SLURM memory request
    },
    "phase2_isaaclab": {
        # IsaacLab locomotion and in-hand manipulation tasks.
        "tasks": [
            "Isaac-Velocity-Flat-G1-v0",
            "Isaac-Velocity-Rough-G1-v0",
            "Isaac-Repose-Cube-Allegro-Direct-v0",
            "Isaac-Repose-Cube-Shadow-Direct-v0",
            "Isaac-Velocity-Flat-H1-v0",
            "Isaac-Velocity-Rough-H1-v0"
        ],
        "total_timesteps": 1000000,
        "num_envs": 1024,
        "batch_size": 32768,
        "buffer_size": 51200,
        "eval_interval": 50000,
        "time_limit": "04:00:00",
        "mem": "64G"
    },
    "phase3_humanoidbench": {
        # HumanoidBench tasks: fewer envs, longer runs.
        "tasks": [
            "h1hand-walk",
            "h1hand-run",
            "h1hand-hurdle",
            "h1hand-stair",
            "h1hand-slide"
        ],
        "total_timesteps": 2000000,
        "num_envs": 256,
        "batch_size": 16384,
        "buffer_size": 12800,
        "eval_interval": 100000,
        "time_limit": "12:00:00", # 12 hours for HumanoidBench
        "mem": "64G"
    }
}
def create_job_script(task, config, seed, phase):
    """Render the SLURM batch script for one (task, seed, phase) run.

    Args:
        task: environment/task name (dashes are sanitized for SLURM names).
        config: one entry of EXPERIMENT_CONFIG (timesteps, envs, limits, ...).
        seed: integer random seed passed through to the trainer.
        phase: phase key, e.g. "phase1_mujoco"; also used in W&B project name.

    Returns:
        The complete script text as a string (not written to disk here).
    """
    # Hoist the repeated name pieces so the template below stays readable.
    safe_task = task.replace("-", "_")
    job_tag = f"fasttd3_{phase}_{safe_task}_s{seed}"
    # The doubled backslashes render as single-backslash shell continuations.
    return f'''#!/bin/bash
#SBATCH --job-name={job_tag}
#SBATCH --account=hk-project-p0022232
#SBATCH --partition=accelerated-h100
#SBATCH --time={config["time_limit"]}
#SBATCH --gres=gpu:1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=8
#SBATCH --mem={config["mem"]}
#SBATCH --output=logs/{job_tag}_%j.out
#SBATCH --error=logs/{job_tag}_%j.err
# Load necessary modules
module purge
module load devel/cuda/12.4
# Navigate to the project directory
cd $SLURM_SUBMIT_DIR
# Set up Python environment
source ~/.bashrc # Load PATH and LD_LIBRARY_PATH for custom Python
# Activate the virtual environment
source .venv/bin/activate
# Set environment variables
export CUDA_VISIBLE_DEVICES=$SLURM_LOCALID
export JAX_PLATFORMS="cuda"
export WANDB_MODE=online
echo "Starting FastTD3 {phase} training at $(date)"
echo "Task: {task}"
echo "Seed: {seed}"
echo "GPU: $CUDA_VISIBLE_DEVICES"
echo "Node: $(hostname)"
# Run FastTD3 training
python fast_td3/train.py \\
--env_name {task} \\
--exp_name FastTD3_{phase}_{safe_task} \\
--seed {seed} \\
--total_timesteps {config["total_timesteps"]} \\
--num_envs {config["num_envs"]} \\
--batch_size {config["batch_size"]} \\
--buffer_size {config["buffer_size"]} \\
--eval_interval {config["eval_interval"]} \\
--render_interval 0 \\
--project FastTD3_HoReKa_{phase.title()}
echo "Job completed at $(date)"
'''
def submit_job(script_path, dry_run=False):
    """Submit a SLURM script with sbatch and return its job ID.

    Args:
        script_path: path to the .slurm file to submit.
        dry_run: when True, nothing is executed; a placeholder ID is returned.

    Returns:
        The job ID string on success, or None when sbatch fails.
    """
    if dry_run:
        print(f"[DRY RUN] Would submit: {script_path}")
        return "12345"  # Fake job ID
    try:
        # check=True raises CalledProcessError on a non-zero sbatch exit.
        completed = subprocess.run(
            ['sbatch', script_path],
            capture_output=True,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as err:
        print(f"❌ Failed to submit {script_path}: {err.stderr}")
        return None
    # sbatch prints "Submitted batch job <id>"; the ID is the last token.
    job_id = completed.stdout.strip().split()[-1]
    print(f"✅ Submitted {script_path} -> Job ID: {job_id}")
    return job_id
def main():
    """Parse CLI options and submit one SLURM job per (phase, task, seed).

    Writes each generated script under scripts/, submits it via submit_job(),
    and saves a YAML tracking file listing all submitted jobs.
    """
    parser = argparse.ArgumentParser(description='Submit FastTD3 experiment batch')
    parser.add_argument('--phase', type=str, choices=['1', '2', '3', 'all'],
                        default='1', help='Experiment phase to run')
    parser.add_argument('--tasks', type=str, default='all',
                        help='Comma-separated task names or "all"')
    parser.add_argument('--seeds', type=int, default=3,
                        help='Number of random seeds to run')
    parser.add_argument('--dry-run', action='store_true',
                        help='Print commands without executing')
    parser.add_argument('--delay', type=int, default=5,
                        help='Delay between job submissions (seconds)')
    args = parser.parse_args()

    # Create output directories before any script or log is written.
    os.makedirs('logs', exist_ok=True)
    os.makedirs('scripts', exist_ok=True)

    # Map the CLI phase selector onto EXPERIMENT_CONFIG keys.
    if args.phase == 'all':
        phases = ['phase1_mujoco', 'phase2_isaaclab', 'phase3_humanoidbench']
    else:
        phase_map = {'1': 'phase1_mujoco', '2': 'phase2_isaaclab', '3': 'phase3_humanoidbench'}
        phases = [phase_map[args.phase]]

    submitted_jobs = []
    for phase in phases:
        config = EXPERIMENT_CONFIG[phase]
        # Determine tasks to run
        if args.tasks == 'all':
            tasks = config['tasks']
        else:
            tasks = [t.strip() for t in args.tasks.split(',')]
        # Validate tasks exist in config; skip the whole phase on a typo
        # rather than submitting a job that will crash on startup.
        invalid = set(tasks) - set(config['tasks'])
        if invalid:
            print(f"❌ Invalid tasks for {phase}: {invalid}")
            continue
        # BUG FIX: these prints used "\\n" (a literal backslash-n in the
        # output) instead of a real newline escape.
        print(f"\n🚀 Starting {phase} with tasks: {tasks}")
        print(f" Seeds: {list(range(1, args.seeds + 1))}")
        for task in tasks:
            for seed in range(1, args.seeds + 1):
                # Render and persist the job script for this (task, seed).
                script_content = create_job_script(task, config, seed, phase)
                script_name = f"scripts/fasttd3_{phase}_{task.replace('-', '_')}_s{seed}.slurm"
                with open(script_name, 'w') as f:
                    f.write(script_content)
                # Submit job; submit_job returns None on sbatch failure.
                job_id = submit_job(script_name, args.dry_run)
                if job_id:
                    submitted_jobs.append({
                        'job_id': job_id,
                        'phase': phase,
                        'task': task,
                        'seed': seed,
                        'script': script_name
                    })
                # Delay between submissions to avoid overwhelming scheduler
                if not args.dry_run and args.delay > 0:
                    time.sleep(args.delay)

    # Summary
    print("\n📊 Submission Summary:")
    print(f" Total jobs submitted: {len(submitted_jobs)}")
    if submitted_jobs:
        # Save job tracking info for monitor_experiments.py.
        tracking_file = f"experiment_tracking_{int(time.time())}.yaml"
        with open(tracking_file, 'w') as f:
            yaml.dump({
                'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
                'jobs': submitted_jobs
            }, f, default_flow_style=False)
        print(f" Job tracking saved to: {tracking_file}")
        print("\n💡 Monitor progress with:")
        print(" squeue -u $USER")
        print(f" python monitor_experiments.py {tracking_file}")


if __name__ == "__main__":
    main()