From 05dddfa10c1cb275a732298f555b99b6a8cc16c2 Mon Sep 17 00:00:00 2001
From: "ys1087@partner.kit.edu"
Date: Wed, 27 Aug 2025 11:57:32 +0200
Subject: [PATCH] Add HoReKa cluster setup and SLURM scripts

- Add installation script for HoReKa with Python 3.10 venv
- Add SLURM job submission scripts for dev and production runs
- Add convenient submit_job.sh wrapper for easy job submission
- Update .gitignore to allow shell scripts (removed *.sh exclusion)
- Configure git remotes: upstream (original) and origin (fork)
---
 .gitignore            |  1 -
 install_dppo.sh       | 43 +++++++++++++++++++++++++++++
 slurm/run_dppo_dev.sh | 52 +++++++++++++++++++++++++++++++++++
 slurm/run_dppo_gym.sh | 64 +++++++++++++++++++++++++++++++++++++++++++
 submit_job.sh         | 46 +++++++++++++++++++++++++++++++
 5 files changed, 205 insertions(+), 1 deletion(-)
 create mode 100755 install_dppo.sh
 create mode 100755 slurm/run_dppo_dev.sh
 create mode 100755 slurm/run_dppo_gym.sh
 create mode 100755 submit_job.sh

diff --git a/.gitignore b/.gitignore
index b45dd92..f145a12 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,7 +10,6 @@ checkpoints/
 out/
 err/
 *.pkl
-*.sh
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
diff --git a/install_dppo.sh b/install_dppo.sh
new file mode 100755
index 0000000..2a594e9
--- /dev/null
+++ b/install_dppo.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+#SBATCH --job-name=dppo_install
+#SBATCH --account=hk-project-p0022232
+#SBATCH --partition=dev_accelerated
+#SBATCH --gres=gpu:1
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=4
+#SBATCH --time=00:30:00
+#SBATCH --mem=16G
+#SBATCH --output=logs/dppo_install_%j.out
+#SBATCH --error=logs/dppo_install_%j.err
+
+# Load CUDA module (required for PyTorch)
+module load devel/cuda/12.4
+
+# Print job info
+echo "Starting DPPO installation..."
+echo "Job ID: $SLURM_JOB_ID"
+echo "Node: $SLURM_NODELIST"
+echo "GPU: $CUDA_VISIBLE_DEVICES"
+
+# Navigate to dppo directory (uses current directory)
+cd "$SLURM_SUBMIT_DIR"
+
+# Create and activate virtual environment with Python 3.10
+python3.10 -m venv .venv
+source .venv/bin/activate
+
+# Upgrade pip
+pip install --upgrade pip
+
+# Install base package
+pip install -e .
+
+# Install gym dependencies (optional - comment out if not needed)
+pip install -e .[gym]
+
+echo "Installation completed!"
+echo "Python version: $(python --version)"
+echo "Pip version: $(pip --version)"
+echo "Installed packages:"
+pip list
diff --git a/slurm/run_dppo_dev.sh b/slurm/run_dppo_dev.sh
new file mode 100755
index 0000000..f55c587
--- /dev/null
+++ b/slurm/run_dppo_dev.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+#SBATCH --job-name=dppo_dev_test
+#SBATCH --account=hk-project-p0022232
+#SBATCH --partition=dev_accelerated
+#SBATCH --gres=gpu:1
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=8
+#SBATCH --time=00:30:00
+#SBATCH --mem=24G
+#SBATCH --output=logs/dppo_dev_%j.out
+#SBATCH --error=logs/dppo_dev_%j.err
+
+# Load required modules
+module load devel/cuda/12.4
+
+# Set environment variables
+export WANDB_MODE=online
+export WANDB_PROJECT=dppo_dev_test
+# export WANDB_API_KEY=  # TODO: Set your API key
+# export WANDB_ENTITY=  # TODO: Set your entity (username or team)
+
+# Default paths (can be overridden by environment)
+export DPPO_DATA_DIR=${DPPO_DATA_DIR:-$SLURM_SUBMIT_DIR/data}
+export DPPO_LOG_DIR=${DPPO_LOG_DIR:-$SLURM_SUBMIT_DIR/log}
+
+# Change to project directory
+cd "$SLURM_SUBMIT_DIR"
+
+# Activate virtual environment
+source .venv/bin/activate
+
+# Run quick test with Gym Hopper (faster than other environments)
+echo "Starting DPPO dev test..."
+echo "Job ID: $SLURM_JOB_ID"
+echo "Node: $SLURM_NODELIST"
+echo "GPU: $CUDA_VISIBLE_DEVICES"
+echo ""
+echo "Python version: $(python --version)"
+echo "PyTorch version: $(python -c 'import torch; print(torch.__version__)')"
+echo "CUDA available: $(python -c 'import torch; print(torch.cuda.is_available())')"
+echo ""
+
+# Run a quick pre-training test with reduced steps
+python script/run.py --config-name=pre_diffusion_mlp \
+    --config-dir=cfg/gym/pretrain/hopper-medium-v2 \
+    train.n_iters=10 \
+    train.log_interval=5 \
+    train.checkpoint_interval=10 \
+    wandb="${WANDB_MODE:-null}"
+
+echo "Dev test completed!"
diff --git a/slurm/run_dppo_gym.sh b/slurm/run_dppo_gym.sh
new file mode 100755
index 0000000..47b9e9f
--- /dev/null
+++ b/slurm/run_dppo_gym.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+#SBATCH --job-name=dppo_gym
+#SBATCH --account=hk-project-p0022232
+#SBATCH --partition=accelerated
+#SBATCH --gres=gpu:1
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=40
+#SBATCH --time=08:00:00
+#SBATCH --mem=32G
+#SBATCH --output=logs/dppo_gym_%j.out
+#SBATCH --error=logs/dppo_gym_%j.err
+
+# Load required modules
+module load devel/cuda/12.4
+
+# Set environment variables
+export WANDB_MODE=online
+export WANDB_PROJECT=dppo_gym
+# export WANDB_API_KEY=  # TODO: Set your API key
+# export WANDB_ENTITY=  # TODO: Set your entity (username or team)
+
+# Default paths (can be overridden by environment)
+export DPPO_DATA_DIR=${DPPO_DATA_DIR:-$SLURM_SUBMIT_DIR/data}
+export DPPO_LOG_DIR=${DPPO_LOG_DIR:-$SLURM_SUBMIT_DIR/log}
+
+# Parse command line arguments
+TASK=${1:-hopper}       # hopper, walker2d, halfcheetah
+MODE=${2:-pretrain}     # pretrain or finetune
+CONFIG_NAME=${3:-}      # optional explicit config name; defaults per mode below
+
+# Change to project directory
+cd "$SLURM_SUBMIT_DIR"
+
+# Activate virtual environment
+source .venv/bin/activate
+
+echo "Starting DPPO Gym experiment..."
+echo "Job ID: $SLURM_JOB_ID"
+echo "Node: $SLURM_NODELIST"
+echo "GPU: $CUDA_VISIBLE_DEVICES"
+echo "Task: $TASK"
+echo "Mode: $MODE"
+echo ""
+
+# Select appropriate config based on mode
+if [ "$MODE" = "pretrain" ]; then
+    CONFIG_DIR="cfg/gym/pretrain/${TASK}-medium-v2"
+    CONFIG_NAME=${CONFIG_NAME:-pre_diffusion_mlp}
+elif [ "$MODE" = "finetune" ]; then
+    CONFIG_DIR="cfg/gym/finetune/${TASK}-v2"
+    CONFIG_NAME=${CONFIG_NAME:-ft_ppo_diffusion_mlp}
+else
+    echo "Invalid mode: $MODE. Use 'pretrain' or 'finetune'"
+    exit 1
+fi
+
+# Run experiment
+python script/run.py \
+    --config-name="$CONFIG_NAME" \
+    --config-dir="$CONFIG_DIR" \
+    wandb="${WANDB_MODE:-null}"
+
+echo "Experiment completed!"
diff --git a/submit_job.sh b/submit_job.sh
new file mode 100755
index 0000000..87b4ab4
--- /dev/null
+++ b/submit_job.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+
+# Submit DPPO jobs to SLURM
+# Usage: ./submit_job.sh [mode] [task] [experiment_type]
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+cd "$SCRIPT_DIR"
+
+# Default values
+MODE=${1:-dev}                  # dev, gym, robomimic, d3il, furniture
+TASK=${2:-hopper}               # Task specific to each mode
+EXPERIMENT_TYPE=${3:-pretrain}  # pretrain or finetune
+
+echo "Submitting DPPO job..."
+echo "Mode: $MODE"
+echo "Task: $TASK"
+echo "Experiment type: $EXPERIMENT_TYPE"
+
+case "$MODE" in
+    dev)
+        echo "Submitting development test job..."
+        sbatch slurm/run_dppo_dev.sh
+        ;;
+    gym)
+        echo "Submitting Gym job..."
+        sbatch slurm/run_dppo_gym.sh "$TASK" "$EXPERIMENT_TYPE"
+        ;;
+    *)
+        echo "Unknown mode: $MODE"
+        echo "Supported modes: dev, gym"
+        echo ""
+        echo "Usage examples:"
+        echo "  ./submit_job.sh dev                    # Run quick dev test"
+        echo "  ./submit_job.sh gym hopper pretrain    # Pre-train on Gym Hopper"
+        echo "  ./submit_job.sh gym walker2d finetune  # Fine-tune on Gym Walker2D"
+        exit 1
+        ;;
+esac
+
+echo ""
+echo "Job submitted! Check status with:"
+echo "  squeue -u $USER"
+echo ""
+echo "Check logs in: logs/ directory"