Commit summary:
- Fixed JAX/PyTorch dtype mismatch for successful training
- Added experiment plan with paper-accurate hyperparameters
- Created batch submission and monitoring scripts
- Cleaned up log files and updated .gitignore
- Ready for systematic paper replication

File: 225 lines, 7.4 KiB, Python, executable
#!/usr/bin/env python3
|
|
"""
|
|
Batch experiment submission script for FastTD3 paper replication.
|
|
Usage: python submit_experiment_batch.py --phase 1 --tasks all --seeds 3
|
|
"""
|
|
|
|
import os
|
|
import subprocess
|
|
import argparse
|
|
import time
|
|
import yaml
|
|
from pathlib import Path
|
|
|
|
# Experiment configuration based on paper
|
|
# Per-phase hyperparameters and SLURM resource requests for the replication runs.
# Each phase maps to one benchmark suite; keys are consumed by create_job_script().
EXPERIMENT_CONFIG = {
    # Phase 1: MuJoCo Playground joystick locomotion tasks (T1/G1 humanoids).
    "phase1_mujoco": {
        "tasks": [
            "T1JoystickFlatTerrain",
            "T1JoystickRoughTerrain",
            "G1JoystickFlatTerrain",
            "G1JoystickRoughTerrain"
        ],
        "total_timesteps": 500000,
        "num_envs": 2048,          # parallel simulation environments
        "batch_size": 32768,
        "buffer_size": 102400,     # 50 transitions per env x 2048 envs
        "eval_interval": 25000,    # timesteps between evaluations
        "time_limit": "04:00:00",  # SLURM wall-clock limit
        "mem": "64G"               # SLURM memory request
    },
    # Phase 2: IsaacLab velocity / dexterous manipulation tasks.
    "phase2_isaaclab": {
        "tasks": [
            "Isaac-Velocity-Flat-G1-v0",
            "Isaac-Velocity-Rough-G1-v0",
            "Isaac-Repose-Cube-Allegro-Direct-v0",
            "Isaac-Repose-Cube-Shadow-Direct-v0",
            "Isaac-Velocity-Flat-H1-v0",
            "Isaac-Velocity-Rough-H1-v0"
        ],
        "total_timesteps": 1000000,
        "num_envs": 1024,
        "batch_size": 32768,
        "buffer_size": 51200,      # 50 transitions per env x 1024 envs
        "eval_interval": 50000,
        "time_limit": "04:00:00",
        "mem": "64G"
    },
    # Phase 3: HumanoidBench h1hand locomotion tasks (slowest simulator,
    # hence fewer envs, more timesteps, and a longer wall-clock limit).
    "phase3_humanoidbench": {
        "tasks": [
            "h1hand-walk",
            "h1hand-run",
            "h1hand-hurdle",
            "h1hand-stair",
            "h1hand-slide"
        ],
        "total_timesteps": 2000000,
        "num_envs": 256,
        "batch_size": 16384,
        "buffer_size": 12800,      # 50 transitions per env x 256 envs
        "eval_interval": 100000,
        "time_limit": "12:00:00",  # 12 hours for HumanoidBench
        "mem": "64G"
    }
}
|
|
|
|
def create_job_script(task, config, seed, phase):
    """Render a SLURM batch script for one (task, seed) training run.

    Args:
        task: Environment/task name (may contain dashes).
        config: Phase config dict; must provide "time_limit", "mem",
            "total_timesteps", "num_envs", "batch_size", "buffer_size",
            and "eval_interval".
        seed: Random seed for this run.
        phase: Phase key (e.g. "phase1_mujoco"); used in job/exp names.

    Returns:
        The complete SLURM script as a string.
    """
    # Dashes are not friendly in job/file names; compute the tag once and
    # reuse it for the job name and the log-file paths.
    task_id = task.replace("-", "_")
    job_tag = f"fasttd3_{phase}_{task_id}_s{seed}"

    # NOTE(review): CUDA_VISIBLE_DEVICES=$SLURM_LOCALID assumes single-task
    # jobs where SLURM_LOCALID is 0 — confirm this matches the cluster's
    # GPU-binding setup.
    return f'''#!/bin/bash
#SBATCH --job-name={job_tag}
#SBATCH --account=hk-project-p0022232
#SBATCH --partition=accelerated
#SBATCH --time={config["time_limit"]}
#SBATCH --gres=gpu:1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=8
#SBATCH --mem={config["mem"]}
#SBATCH --output=logs/{job_tag}_%j.out
#SBATCH --error=logs/{job_tag}_%j.err

# Load necessary modules
module purge
module load devel/cuda/12.4
module load compiler/intel/2025.1_llvm

# Navigate to the project directory
cd $SLURM_SUBMIT_DIR

# Activate the virtual environment
source .venv/bin/activate

# Set environment variables
export CUDA_VISIBLE_DEVICES=$SLURM_LOCALID
export JAX_PLATFORMS="cuda"
export WANDB_MODE=online

echo "Starting FastTD3 {phase} training at $(date)"
echo "Task: {task}"
echo "Seed: {seed}"
echo "GPU: $CUDA_VISIBLE_DEVICES"
echo "Node: $(hostname)"

# Run FastTD3 training
python fast_td3/train.py \\
    --env_name {task} \\
    --exp_name FastTD3_{phase}_{task_id} \\
    --seed {seed} \\
    --total_timesteps {config["total_timesteps"]} \\
    --num_envs {config["num_envs"]} \\
    --batch_size {config["batch_size"]} \\
    --buffer_size {config["buffer_size"]} \\
    --eval_interval {config["eval_interval"]} \\
    --render_interval 0 \\
    --project FastTD3_HoReKa_{phase.title()}

echo "Job completed at $(date)"
'''
|
|
|
|
def submit_job(script_path, dry_run=False):
    """Submit a SLURM job script via ``sbatch`` and return its job ID.

    Args:
        script_path: Path to the .slurm script to submit.
        dry_run: If True, print what would be submitted without calling sbatch.

    Returns:
        The job ID string on success ("12345" placeholder in dry-run mode),
        or None when submission fails.
    """
    if dry_run:
        print(f"[DRY RUN] Would submit: {script_path}")
        return "12345"  # Fake job ID

    try:
        result = subprocess.run(['sbatch', script_path],
                                capture_output=True, text=True, check=True)
        # sbatch prints "Submitted batch job <id>"; the ID is the last token.
        job_id = result.stdout.strip().split()[-1]
        print(f"✅ Submitted {script_path} -> Job ID: {job_id}")
        return job_id
    except subprocess.CalledProcessError as e:
        print(f"❌ Failed to submit {script_path}: {e.stderr}")
        return None
    except FileNotFoundError:
        # sbatch itself is missing (e.g. not running on a SLURM login node);
        # report and fall through to the same None-on-failure contract.
        print(f"❌ Failed to submit {script_path}: sbatch not found")
        return None
|
|
|
|
def main():
    """Parse CLI arguments, generate SLURM scripts, and submit the batch.

    For each selected phase/task/seed combination a .slurm script is written
    under scripts/ and submitted via submit_job(); submitted jobs are recorded
    to a timestamped YAML tracking file for monitor_experiments.py.
    """
    parser = argparse.ArgumentParser(description='Submit FastTD3 experiment batch')
    parser.add_argument('--phase', type=str, choices=['1', '2', '3', 'all'],
                        default='1', help='Experiment phase to run')
    parser.add_argument('--tasks', type=str, default='all',
                        help='Comma-separated task names or "all"')
    parser.add_argument('--seeds', type=int, default=3,
                        help='Number of random seeds to run')
    parser.add_argument('--dry-run', action='store_true',
                        help='Print commands without executing')
    parser.add_argument('--delay', type=int, default=5,
                        help='Delay between job submissions (seconds)')

    args = parser.parse_args()

    # Create output directories for SLURM logs and generated job scripts.
    os.makedirs('logs', exist_ok=True)
    os.makedirs('scripts', exist_ok=True)

    # Determine which phases to run.
    if args.phase == 'all':
        phases = ['phase1_mujoco', 'phase2_isaaclab', 'phase3_humanoidbench']
    else:
        phase_map = {'1': 'phase1_mujoco', '2': 'phase2_isaaclab', '3': 'phase3_humanoidbench'}
        phases = [phase_map[args.phase]]

    submitted_jobs = []

    for phase in phases:
        config = EXPERIMENT_CONFIG[phase]

        # Determine tasks to run.
        if args.tasks == 'all':
            tasks = config['tasks']
        else:
            tasks = [t.strip() for t in args.tasks.split(',')]
            # Validate tasks exist in config; a typo skips the whole phase.
            invalid = set(tasks) - set(config['tasks'])
            if invalid:
                print(f"❌ Invalid tasks for {phase}: {invalid}")
                continue

        # Fixed: was "\\n" (printed a literal backslash-n instead of a newline).
        print(f"\n🚀 Starting {phase} with tasks: {tasks}")
        print(f"   Seeds: {list(range(1, args.seeds + 1))}")

        for task in tasks:
            for seed in range(1, args.seeds + 1):
                # Create the job script on disk.
                script_content = create_job_script(task, config, seed, phase)
                script_name = f"scripts/fasttd3_{phase}_{task.replace('-', '_')}_s{seed}.slurm"

                with open(script_name, 'w') as f:
                    f.write(script_content)

                # Submit job; submit_job returns None on failure.
                job_id = submit_job(script_name, args.dry_run)
                if job_id:
                    submitted_jobs.append({
                        'job_id': job_id,
                        'phase': phase,
                        'task': task,
                        'seed': seed,
                        'script': script_name
                    })

                # Delay between submissions to avoid overwhelming the scheduler.
                if not args.dry_run and args.delay > 0:
                    time.sleep(args.delay)

    # Summary (newlines fixed here as well).
    print(f"\n📊 Submission Summary:")
    print(f"   Total jobs submitted: {len(submitted_jobs)}")

    if submitted_jobs:
        # Save job tracking info for later monitoring.
        tracking_file = f"experiment_tracking_{int(time.time())}.yaml"
        with open(tracking_file, 'w') as f:
            yaml.dump({
                'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
                'jobs': submitted_jobs
            }, f, default_flow_style=False)

        print(f"   Job tracking saved to: {tracking_file}")
        print(f"\n💡 Monitor progress with:")
        print(f"   squeue -u $USER")
        print(f"   python monitor_experiments.py {tracking_file}")
|
|
|
|
# Run the submitter only when executed as a script (not when imported).
if __name__ == "__main__":
    main()