Add HoReKa cluster setup and SLURM scripts

- Add installation script for HoReKa with Python 3.10 venv
- Add SLURM job submission scripts for dev and production runs
- Add convenient submit_job.sh wrapper for easy job submission
- Update .gitignore to allow shell scripts (removed *.sh exclusion)
- Configure git remotes: upstream (original) and origin (fork)
This commit is contained in:
ys1087@partner.kit.edu 2025-08-27 11:57:32 +02:00
parent cc7234ad7f
commit 05dddfa10c
5 changed files with 205 additions and 1 deletions

1
.gitignore vendored
View File

@@ -10,7 +10,6 @@ checkpoints/
out/
err/
*.pkl
*.sh
# Byte-compiled / optimized / DLL files
__pycache__/

43
install_dppo.sh Executable file
View File

@@ -0,0 +1,43 @@
#!/bin/bash
#SBATCH --job-name=dppo_install
#SBATCH --account=hk-project-p0022232
#SBATCH --partition=dev_accelerated
#SBATCH --gres=gpu:1
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=4
#SBATCH --time=00:30:00
#SBATCH --mem=16G
#SBATCH --output=logs/dppo_install_%j.out
#SBATCH --error=logs/dppo_install_%j.err
# Install DPPO into a fresh Python 3.10 virtualenv (.venv) on a HoReKa
# compute node. Run via sbatch from the repository root.
#
# Fail fast: abort on any command failure, unset variable, or pipeline
# failure, so a partial install is never reported as success.
set -euo pipefail
# Load CUDA module (required for PyTorch)
module load devel/cuda/12.4
# Print job info (fallbacks keep 'set -u' happy outside SLURM)
echo "Starting DPPO installation..."
echo "Job ID: ${SLURM_JOB_ID:-unset}"
echo "Node: ${SLURM_NODELIST:-unset}"
echo "GPU: ${CUDA_VISIBLE_DEVICES:-unset}"
# Navigate to the submission directory; quoted to survive spaces in the path.
cd "$SLURM_SUBMIT_DIR"
# Create and activate virtual environment with Python 3.10
python3.10 -m venv .venv
source .venv/bin/activate
# Upgrade pip
pip install --upgrade pip
# Install base package
pip install -e .
# Install gym extras (optional - comment out if not needed).
# Quoted: an unquoted '.[gym]' is a shell glob and would expand to a
# matching dotfile (e.g. '.g') if one existed in the repo root.
pip install -e ".[gym]"
echo "Installation completed!"
echo "Python version: $(python --version)"
echo "Pip version: $(pip --version)"
echo "Installed packages:"
pip list

52
slurm/run_dppo_dev.sh Executable file
View File

@@ -0,0 +1,52 @@
#!/bin/bash
#SBATCH --job-name=dppo_dev_test
#SBATCH --account=hk-project-p0022232
#SBATCH --partition=dev_accelerated
#SBATCH --gres=gpu:1
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=8
#SBATCH --time=00:30:00
#SBATCH --mem=24G
#SBATCH --output=logs/dppo_dev_%j.out
#SBATCH --error=logs/dppo_dev_%j.err
# Quick DPPO smoke test on the dev partition: a few pre-training iterations
# on Gym Hopper to verify the environment and GPU stack work.
#
# Abort on any failure / unset variable so a broken run is visible.
set -euo pipefail
# Load required modules
module load devel/cuda/12.4
# Set environment variables
export WANDB_MODE=online
export WANDB_PROJECT=dppo_dev_test
# export WANDB_API_KEY=<your_api_key>  # TODO: Set your API key
# export WANDB_ENTITY=<your_entity>    # TODO: Set your entity (username or team)
# Default paths (can be overridden by environment)
export DPPO_DATA_DIR=${DPPO_DATA_DIR:-$SLURM_SUBMIT_DIR/data}
export DPPO_LOG_DIR=${DPPO_LOG_DIR:-$SLURM_SUBMIT_DIR/log}
# Change to project directory; quoted to survive spaces in the path.
cd "$SLURM_SUBMIT_DIR"
# Activate virtual environment
source .venv/bin/activate
# Run quick test with Gym Hopper (faster than other environments)
echo "Starting DPPO dev test..."
echo "Job ID: ${SLURM_JOB_ID:-unset}"
echo "Node: ${SLURM_NODELIST:-unset}"
echo "GPU: ${CUDA_VISIBLE_DEVICES:-unset}"
echo ""
echo "Python version: $(python --version)"
echo "PyTorch version: $(python -c 'import torch; print(torch.__version__)')"
echo "CUDA available: $(python -c 'import torch; print(torch.cuda.is_available())')"
echo ""
# Run a quick pre-training test with reduced steps.
# NOTE(review): the ':-null' fallback is dead code here because WANDB_MODE
# is exported above — the override always resolves to 'online'. Confirm
# 'wandb=online' is a valid value for this hydra config.
python script/run.py --config-name=pre_diffusion_mlp \
    --config-dir=cfg/gym/pretrain/hopper-medium-v2 \
    train.n_iters=10 \
    train.log_interval=5 \
    train.checkpoint_interval=10 \
    wandb="${WANDB_MODE:-null}"
echo "Dev test completed!"

64
slurm/run_dppo_gym.sh Executable file
View File

@@ -0,0 +1,64 @@
#!/bin/bash
#SBATCH --job-name=dppo_gym
#SBATCH --account=hk-project-p0022232
#SBATCH --partition=accelerated
#SBATCH --gres=gpu:1
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=40
#SBATCH --time=08:00:00
#SBATCH --mem=32G
#SBATCH --output=logs/dppo_gym_%j.out
#SBATCH --error=logs/dppo_gym_%j.err
# Production DPPO run on Gym locomotion tasks.
# Usage: sbatch slurm/run_dppo_gym.sh [task] [mode] [config_name]
#   task:        hopper | walker2d | halfcheetah   (default: hopper)
#   mode:        pretrain | finetune               (default: pretrain)
#   config_name: optional explicit hydra config; derived from mode if omitted
#
# Abort on any failure / unset variable so a broken run is visible.
set -euo pipefail
# Load required modules
module load devel/cuda/12.4
# Set environment variables
export WANDB_MODE=online
export WANDB_PROJECT=dppo_gym
# export WANDB_API_KEY=<your_api_key>  # TODO: Set your API key
# export WANDB_ENTITY=<your_entity>    # TODO: Set your entity (username or team)
# Default paths (can be overridden by environment)
export DPPO_DATA_DIR=${DPPO_DATA_DIR:-$SLURM_SUBMIT_DIR/data}
export DPPO_LOG_DIR=${DPPO_LOG_DIR:-$SLURM_SUBMIT_DIR/log}
# Parse command line arguments
TASK=${1:-hopper}          # hopper, walker2d, halfcheetah
MODE=${2:-pretrain}        # pretrain or finetune
CONFIG_OVERRIDE=${3:-}     # optional explicit config name (was parsed but ignored)
# Change to project directory; quoted to survive spaces in the path.
cd "$SLURM_SUBMIT_DIR"
# Activate virtual environment
source .venv/bin/activate
echo "Starting DPPO Gym experiment..."
echo "Job ID: ${SLURM_JOB_ID:-unset}"
echo "Node: ${SLURM_NODELIST:-unset}"
echo "GPU: ${CUDA_VISIBLE_DEVICES:-unset}"
echo "Task: $TASK"
echo "Mode: $MODE"
echo ""
# Select config dir and default config name based on mode
if [[ "$MODE" == "pretrain" ]]; then
    CONFIG_DIR="cfg/gym/pretrain/${TASK}-medium-v2"
    CONFIG_NAME="pre_diffusion_mlp"
elif [[ "$MODE" == "finetune" ]]; then
    CONFIG_DIR="cfg/gym/finetune/${TASK}-v2"
    CONFIG_NAME="ft_ppo_diffusion_mlp"
else
    echo "Invalid mode: $MODE. Use 'pretrain' or 'finetune'" >&2
    exit 1
fi
# Honour an explicitly supplied third argument; previously $3 was read into
# CONFIG_TYPE and then silently discarded by the branch above.
if [[ -n "$CONFIG_OVERRIDE" ]]; then
    CONFIG_NAME="$CONFIG_OVERRIDE"
fi
# Run experiment
python script/run.py \
    --config-name="$CONFIG_NAME" \
    --config-dir="$CONFIG_DIR" \
    wandb="${WANDB_MODE:-null}"
echo "Experiment completed!"

46
submit_job.sh Executable file
View File

@@ -0,0 +1,46 @@
#!/bin/bash
# Submit DPPO jobs to SLURM
# Usage: ./submit_job.sh [mode] [task] [experiment_type]
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"
# The batch scripts write '#SBATCH --output=logs/...'; sbatch does not
# create that directory, so jobs silently lose output if it is missing.
mkdir -p logs
# Default values
MODE=${1:-dev}                  # dev, gym, robomimic, d3il, furniture
TASK=${2:-hopper}               # Task specific to each mode
EXPERIMENT_TYPE=${3:-pretrain}  # pretrain or finetune
echo "Submitting DPPO job..."
echo "Mode: $MODE"
echo "Task: $TASK"
echo "Experiment type: $EXPERIMENT_TYPE"
case "$MODE" in
    dev)
        echo "Submitting development test job..."
        sbatch slurm/run_dppo_dev.sh
        ;;
    gym)
        echo "Submitting Gym job..."
        sbatch slurm/run_dppo_gym.sh "$TASK" "$EXPERIMENT_TYPE"
        ;;
    *)
        # Unknown mode: print usage to stderr and fail.
        echo "Unknown mode: $MODE" >&2
        echo "Supported modes: dev, gym" >&2
        echo "" >&2
        echo "Usage examples:" >&2
        echo "  ./submit_job.sh dev                    # Run quick dev test" >&2
        echo "  ./submit_job.sh gym hopper pretrain    # Pre-train on Gym Hopper" >&2
        echo "  ./submit_job.sh gym walker2d finetune  # Fine-tune on Gym Walker2D" >&2
        exit 1
        ;;
esac
echo ""
echo "Job submitted! Check status with:"
echo "  squeue -u $USER"
echo ""
echo "Check logs in: logs/ directory"