#!/usr/bin/env python3
"""
Batch experiment submission script for FastTD3 paper replication.

Generates one SLURM batch script per (task, seed) pair for the selected
experiment phase(s), submits each with `sbatch`, and writes a YAML file
tracking the submitted job IDs.

Usage:
    python submit_experiment_batch.py --phase 1 --tasks all --seeds 3
"""
import os
import subprocess
import argparse
import time
from pathlib import Path

# Experiment configuration based on the paper: per-phase task lists,
# training hyperparameters, and SLURM resource limits.
EXPERIMENT_CONFIG = {
    "phase1_mujoco": {
        "tasks": [
            "T1JoystickFlatTerrain",
            "T1JoystickRoughTerrain",
            "G1JoystickFlatTerrain",
            "G1JoystickRoughTerrain",
        ],
        "total_timesteps": 500000,
        "num_envs": 2048,
        "batch_size": 32768,
        "buffer_size": 102400,  # 50 transitions per env x 2048 envs
        "eval_interval": 25000,
        "time_limit": "04:00:00",
        "mem": "64G",
    },
    "phase2_isaaclab": {
        "tasks": [
            "Isaac-Velocity-Flat-G1-v0",
            "Isaac-Velocity-Rough-G1-v0",
            "Isaac-Repose-Cube-Allegro-Direct-v0",
            "Isaac-Repose-Cube-Shadow-Direct-v0",
            "Isaac-Velocity-Flat-H1-v0",
            "Isaac-Velocity-Rough-H1-v0",
        ],
        "total_timesteps": 1000000,
        "num_envs": 1024,
        "batch_size": 32768,
        "buffer_size": 51200,  # 50 transitions per env x 1024 envs
        "eval_interval": 50000,
        "time_limit": "04:00:00",
        "mem": "64G",
    },
    "phase3_humanoidbench": {
        "tasks": [
            "h1hand-walk",
            "h1hand-run",
            "h1hand-hurdle",
            "h1hand-stair",
            "h1hand-slide",
        ],
        "total_timesteps": 2000000,
        "num_envs": 256,
        "batch_size": 16384,
        "buffer_size": 12800,  # 50 transitions per env x 256 envs
        "eval_interval": 100000,
        "time_limit": "12:00:00",  # 12 hours for HumanoidBench
        "mem": "64G",
    },
}


def create_job_script(task, config, seed, phase):
    """Return the SLURM batch-script text for one (task, seed) run.

    Args:
        task: Environment/task name understood by fast_td3/train.py.
        config: Phase entry from EXPERIMENT_CONFIG (hyperparameters plus
            SLURM time/memory limits).
        seed: Random seed forwarded to training.
        phase: Phase key; used in job names and the W&B project name.

    Returns:
        The complete batch script as a string (caller writes it to disk).
    """
    # SLURM job names and log paths use '_' instead of '-'.
    safe_task = task.replace("-", "_")
    return f'''#!/bin/bash
#SBATCH --job-name=fasttd3_{phase}_{safe_task}_s{seed}
#SBATCH --account=hk-project-p0022232
#SBATCH --partition=accelerated-h100
#SBATCH --time={config["time_limit"]}
#SBATCH --gres=gpu:1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=8
#SBATCH --mem={config["mem"]}
#SBATCH --output=logs/fasttd3_{phase}_{safe_task}_s{seed}_%j.out
#SBATCH --error=logs/fasttd3_{phase}_{safe_task}_s{seed}_%j.err

# Load necessary modules
module purge
module load devel/cuda/12.4

# Navigate to the project directory
cd $SLURM_SUBMIT_DIR

# Set up Python environment
source ~/.bashrc  # Load PATH and LD_LIBRARY_PATH for custom Python

# Activate the virtual environment
source .venv/bin/activate

# Set environment variables.
# NOTE: SLURM already sets CUDA_VISIBLE_DEVICES for the GPU allocated via
# --gres=gpu:1; do not override it with $SLURM_LOCALID, which is a task
# rank, not a device index.
export JAX_PLATFORMS="cuda"
export WANDB_MODE=online

echo "Starting FastTD3 {phase} training at $(date)"
echo "Task: {task}"
echo "Seed: {seed}"
echo "GPU: $CUDA_VISIBLE_DEVICES"
echo "Node: $(hostname)"

# Run FastTD3 training
python fast_td3/train.py \\
    --env_name {task} \\
    --exp_name FastTD3_{phase}_{safe_task} \\
    --seed {seed} \\
    --total_timesteps {config["total_timesteps"]} \\
    --num_envs {config["num_envs"]} \\
    --batch_size {config["batch_size"]} \\
    --buffer_size {config["buffer_size"]} \\
    --eval_interval {config["eval_interval"]} \\
    --render_interval 0 \\
    --project FastTD3_HoReKa_{phase.title()}

echo "Job completed at $(date)"
'''


def submit_job(script_path, dry_run=False):
    """Submit a SLURM script via sbatch and return its job ID.

    Args:
        script_path: Path to the .slurm script to submit.
        dry_run: If True, do not invoke sbatch; print the action and
            return a fake job ID instead.

    Returns:
        The job ID string on success, or None if sbatch failed.
    """
    if dry_run:
        print(f"[DRY RUN] Would submit: {script_path}")
        return "12345"  # Fake job ID
    try:
        # List-form argv (shell=False): script_path cannot be shell-injected.
        result = subprocess.run(
            ['sbatch', script_path], capture_output=True, text=True, check=True
        )
        # sbatch prints "Submitted batch job <id>"; the ID is the last token.
        job_id = result.stdout.strip().split()[-1]
        print(f"✅ Submitted {script_path} -> Job ID: {job_id}")
        return job_id
    except subprocess.CalledProcessError as e:
        print(f"❌ Failed to submit {script_path}: {e.stderr}")
        return None


def main():
    """Parse CLI args, generate and submit all job scripts, save tracking."""
    parser = argparse.ArgumentParser(description='Submit FastTD3 experiment batch')
    parser.add_argument('--phase', type=str, choices=['1', '2', '3', 'all'],
                        default='1', help='Experiment phase to run')
    parser.add_argument('--tasks', type=str, default='all',
                        help='Comma-separated task names or "all"')
    parser.add_argument('--seeds', type=int, default=3,
                        help='Number of random seeds to run')
    parser.add_argument('--dry-run', action='store_true',
                        help='Print commands without executing')
    parser.add_argument('--delay', type=int, default=5,
                        help='Delay between job submissions (seconds)')
    args = parser.parse_args()

    # Output directories for SLURM logs and generated scripts.
    os.makedirs('logs', exist_ok=True)
    os.makedirs('scripts', exist_ok=True)

    # Determine which phases to run.
    if args.phase == 'all':
        phases = ['phase1_mujoco', 'phase2_isaaclab', 'phase3_humanoidbench']
    else:
        phase_map = {'1': 'phase1_mujoco', '2': 'phase2_isaaclab',
                     '3': 'phase3_humanoidbench'}
        phases = [phase_map[args.phase]]

    submitted_jobs = []

    for phase in phases:
        config = EXPERIMENT_CONFIG[phase]

        # Determine tasks to run for this phase.
        if args.tasks == 'all':
            tasks = config['tasks']
        else:
            tasks = [t.strip() for t in args.tasks.split(',')]
            # Skip the whole phase if any requested task is unknown to it
            # (with --phase all, each phase validates independently).
            invalid = set(tasks) - set(config['tasks'])
            if invalid:
                print(f"❌ Invalid tasks for {phase}: {invalid}")
                continue

        print(f"\n🚀 Starting {phase} with tasks: {tasks}")
        print(f"   Seeds: {list(range(1, args.seeds + 1))}")

        for task in tasks:
            for seed in range(1, args.seeds + 1):
                # Write the per-run SLURM script.
                script_content = create_job_script(task, config, seed, phase)
                script_name = (
                    f"scripts/fasttd3_{phase}_{task.replace('-', '_')}_s{seed}.slurm"
                )
                with open(script_name, 'w') as f:
                    f.write(script_content)

                # Submit job (or fake-submit on --dry-run).
                job_id = submit_job(script_name, args.dry_run)
                if job_id:
                    submitted_jobs.append({
                        'job_id': job_id,
                        'phase': phase,
                        'task': task,
                        'seed': seed,
                        'script': script_name,
                    })

                # Delay between submissions to avoid overwhelming the scheduler.
                if not args.dry_run and args.delay > 0:
                    time.sleep(args.delay)

    # Summary
    print("\n📊 Submission Summary:")
    print(f"   Total jobs submitted: {len(submitted_jobs)}")

    if submitted_jobs:
        # Deferred import: PyYAML is only required once jobs exist to track,
        # so --help and validation-only runs work without it installed.
        import yaml

        tracking_file = f"experiment_tracking_{int(time.time())}.yaml"
        with open(tracking_file, 'w') as f:
            yaml.dump({
                'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
                'jobs': submitted_jobs,
            }, f, default_flow_style=False)
        print(f"   Job tracking saved to: {tracking_file}")

        print("\n💡 Monitor progress with:")
        print("   squeue -u $USER")
        print(f"   python monitor_experiments.py {tracking_file}")


if __name__ == "__main__":
    main()