dppo/slurm/run_dppo_dev_flexible.sh
ys1087@partner.kit.edu 0424a080c1 feat: HoReKa cluster adaptation and validation
- Updated all WandB project names to use dppo- prefix for organization
- Added flexible dev testing script for all environments
- Created organized dev_tests directory for test scripts
- Fixed MuJoCo compilation issues (added GCC compiler flags)
- Documented Python 3.10 compatibility and Furniture-Bench limitation
- Validated pre-training for Gym, Robomimic, D3IL environments
- Updated experiment tracking with validation results
- Enhanced README with troubleshooting and setup instructions
2025-08-27 14:01:51 +02:00

100 lines
2.8 KiB
Bash

#!/bin/bash
#SBATCH --job-name=dppo_dev_test
#SBATCH --account=hk-project-p0022232
#SBATCH --partition=dev_accelerated
#SBATCH --gres=gpu:1
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=8
#SBATCH --time=00:30:00
#SBATCH --mem=24G
#SBATCH --output=logs/dppo_dev_%j.out
#SBATCH --error=logs/dppo_dev_%j.err
# Short (30 min) single-GPU dev/smoke test for DPPO pre-training or
# fine-tuning on the HoReKa cluster (dev_accelerated partition).
# %j in the output/error paths expands to the SLURM job ID.
# NOTE(review): the logs/ directory must exist at submission time,
# otherwise SLURM cannot create the output/error files — confirm it is
# created before running sbatch.
# Usage: TASK=hopper MODE=pretrain sbatch slurm/run_dppo_dev_flexible.sh
# Usage: TASK=hopper MODE=finetune sbatch slurm/run_dppo_dev_flexible.sh
# --- Environment setup --------------------------------------------------
# Fail fast: abort on command errors, unset variables, and failed pipeline
# stages, so a broken setup step does not silently waste the GPU allocation.
set -euo pipefail

# Load required modules
module load devel/cuda/12.4

# Fix MuJoCo library path for fine-tuning.
# :- guard keeps this valid under `set -u` when LD_LIBRARY_PATH is unset.
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:-}:/usr/lib/nvidia"

# Use GCC instead of Intel compiler for MuJoCo compilation (Intel icx too strict)
export CC=gcc
export CXX=g++

# Set environment variables for WandB (entity overridable by the caller)
export WANDB_MODE=online
export DPPO_WANDB_ENTITY=${DPPO_WANDB_ENTITY:-"dominik_roth"}

# Default data/log paths, relative to the directory sbatch was run from
export DPPO_DATA_DIR=${DPPO_DATA_DIR:-$SLURM_SUBMIT_DIR/data}
export DPPO_LOG_DIR=${DPPO_LOG_DIR:-$SLURM_SUBMIT_DIR/log}

# Set defaults if not provided (see usage lines in the header)
TASK=${TASK:-hopper}
MODE=${MODE:-pretrain}

# Change to project directory; abort rather than run in the wrong cwd
cd "$SLURM_SUBMIT_DIR" || { echo "cannot cd to $SLURM_SUBMIT_DIR" >&2; exit 1; }

# Activate virtual environment.
# Older venv activate scripts reference unset vars (e.g. PS1), so relax
# nounset just around the source.
set +u
source .venv/bin/activate
set -u
# --- Diagnostic banner: job context and software stack -------------------
printf '%s\n' "Starting DPPO dev test..."
printf '%s\n' "Job ID: $SLURM_JOB_ID"
printf '%s\n' "Node: $SLURM_NODELIST"
printf '%s\n' "Task: $TASK"
printf '%s\n' "Mode: $MODE"
printf '%s\n' "GPU: $CUDA_VISIBLE_DEVICES"
printf '%s\n' ""
printf '%s\n' "Python version: $(python --version)"
printf '%s\n' "PyTorch version: $(python -c 'import torch; print(torch.__version__)')"
printf '%s\n' "CUDA available: $(python -c 'import torch; print(torch.cuda.is_available())')"
printf '%s\n' ""
# --- Dispatch: run a short pre-training or fine-tuning smoke test --------

#######################################
# Map a task name to its config directory name.
# Arguments: $1 - task (hopper|walker2d|halfcheetah)
#            $2 - config suffix ("-medium-v2" for pretrain, "-v2" for finetune)
# Outputs:   config dir name on stdout
# Returns:   1 (with message on stderr) for unknown tasks
#######################################
env_config_for() {
  case "$1" in
    hopper|walker2d|halfcheetah) printf '%s%s\n' "$1" "$2" ;;
    *)
      echo "Unknown task: $1" >&2
      return 1
      ;;
  esac
}

if [ "$MODE" = "pretrain" ]; then
  echo "Running pre-training test (2 epochs)..."
  # Pre-training configs are named after the D4RL datasets (*-medium-v2)
  ENV_CONFIG=$(env_config_for "$TASK" "-medium-v2") || exit 1
  python script/run.py --config-name=pre_diffusion_mlp \
    --config-dir="cfg/gym/pretrain/$ENV_CONFIG" \
    train.n_epochs=2 \
    train.save_model_freq=1
elif [ "$MODE" = "finetune" ]; then
  echo "Running fine-tuning test (short run)..."
  # Fine-tuning configs are named after the plain Gym envs (*-v2)
  ENV_CONFIG=$(env_config_for "$TASK" "-v2") || exit 1
  python script/run.py --config-name=ft_ppo_diffusion_mlp \
    --config-dir="cfg/gym/finetune/$ENV_CONFIG" \
    train.n_train_itr=10 \
    train.val_freq=5
else
  echo "Unknown mode: $MODE. Use 'pretrain' or 'finetune'" >&2
  exit 1
fi

echo "Dev test completed!"