Add HoReKa cluster setup and SLURM scripts
- Add installation script for HoReKa with Python 3.10 venv
- Add SLURM job submission scripts for dev and production runs
- Add convenient submit_job.sh wrapper for easy job submission
- Update .gitignore to allow shell scripts (removed *.sh exclusion)
- Configure git remotes: upstream (original) and origin (fork)
This commit is contained in:
parent
cc7234ad7f
commit
05dddfa10c
1
.gitignore
vendored
1
.gitignore
vendored
@ -10,7 +10,6 @@ checkpoints/
|
|||||||
out/
|
out/
|
||||||
err/
|
err/
|
||||||
*.pkl
|
*.pkl
|
||||||
*.sh
|
|
||||||
|
|
||||||
# Byte-compiled / optimized / DLL files
|
# Byte-compiled / optimized / DLL files
|
||||||
__pycache__/
|
__pycache__/
|
||||||
|
43
install_dppo.sh
Executable file
43
install_dppo.sh
Executable file
@ -0,0 +1,43 @@
|
|||||||
|
#!/bin/bash
#SBATCH --job-name=dppo_install
#SBATCH --account=hk-project-p0022232
#SBATCH --partition=dev_accelerated
#SBATCH --gres=gpu:1
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=4
#SBATCH --time=00:30:00
#SBATCH --mem=16G
#SBATCH --output=logs/dppo_install_%j.out
#SBATCH --error=logs/dppo_install_%j.err

# Install DPPO into a Python 3.10 virtual environment (.venv) on a compute node.
#
# Usage (from the repository root):
#   mkdir -p logs && sbatch install_dppo.sh
#
# The logs/ directory used by --output/--error above must already exist in the
# submit directory when the job starts; SLURM does not create it.

# Load CUDA module (required for PyTorch)
module load devel/cuda/12.4

# Stop at the first failing step so that a broken venv or a failed pip install
# is reported as a failed job instead of printing "Installation completed!".
# Enabled only after `module load` so the site-provided module command runs
# exactly as before.
set -eo pipefail

# Print job info
echo "Starting DPPO installation..."
echo "Job ID: $SLURM_JOB_ID"
echo "Node: $SLURM_NODELIST"
echo "GPU: $CUDA_VISIBLE_DEVICES"

# Navigate to the directory the job was submitted from. Fall back to the
# current directory when SLURM_SUBMIT_DIR is not set (an unquoted empty value
# would make `cd` silently jump to $HOME).
cd "${SLURM_SUBMIT_DIR:-$PWD}"

# Create and activate virtual environment with Python 3.10
python3.10 -m venv .venv
source .venv/bin/activate

# Upgrade pip
pip install --upgrade pip

# Install base package
pip install -e .

# Install gym dependencies (optional - comment out if not needed).
# Quoted so the shell never treats [gym] as a glob character class.
pip install -e ".[gym]"

echo "Installation completed!"
echo "Python version: $(python --version)"
echo "Pip version: $(pip --version)"
echo "Installed packages:"
pip list
|
52
slurm/run_dppo_dev.sh
Executable file
52
slurm/run_dppo_dev.sh
Executable file
@ -0,0 +1,52 @@
|
|||||||
|
#!/bin/bash
#SBATCH --job-name=dppo_dev_test
#SBATCH --account=hk-project-p0022232
#SBATCH --partition=dev_accelerated
#SBATCH --gres=gpu:1
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=8
#SBATCH --time=00:30:00
#SBATCH --mem=24G
#SBATCH --output=logs/dppo_dev_%j.out
#SBATCH --error=logs/dppo_dev_%j.err

# Short smoke test: runs a 10-iteration Hopper pre-training job inside the
# project's .venv.
#
# Environment overrides:
#   DPPO_DATA_DIR  data directory (default: <submit dir>/data)
#   DPPO_LOG_DIR   log directory  (default: <submit dir>/log)
#
# The logs/ directory used by --output/--error above must already exist in the
# submit directory when the job starts; SLURM does not create it.

# Load required modules
module load devel/cuda/12.4

# Fail the job when any later step fails, so a crashed run is not reported as
# successful just because the final echo returned 0.
set -eo pipefail

# Set environment variables
export WANDB_MODE=online
export WANDB_PROJECT=dppo_dev_test
# export WANDB_API_KEY=<your_api_key> # TODO: Set your API key
# export WANDB_ENTITY=<your_entity> # TODO: Set your entity (username or team)

# Project root: the directory the job was submitted from, or the current
# directory when the script is run outside of SLURM.
PROJECT_DIR="${SLURM_SUBMIT_DIR:-$PWD}"

# Default paths (can be overridden by environment)
export DPPO_DATA_DIR="${DPPO_DATA_DIR:-$PROJECT_DIR/data}"
export DPPO_LOG_DIR="${DPPO_LOG_DIR:-$PROJECT_DIR/log}"

# Change to project directory
cd "$PROJECT_DIR"

# Activate virtual environment
source .venv/bin/activate

# Run quick test with Gym Hopper (faster than other environments)
echo "Starting DPPO dev test..."
echo "Job ID: $SLURM_JOB_ID"
echo "Node: $SLURM_NODELIST"
echo "GPU: $CUDA_VISIBLE_DEVICES"
echo ""
echo "Python version: $(python --version)"
echo "PyTorch version: $(python -c 'import torch; print(torch.__version__)')"
echo "CUDA available: $(python -c 'import torch; print(torch.cuda.is_available())')"
echo ""

# Run a quick pre-training test with reduced steps.
# NOTE(review): WANDB_MODE is exported as "online" above, so this always passes
# wandb=online; confirm that the run config accepts that value for `wandb`.
python script/run.py --config-name=pre_diffusion_mlp \
  --config-dir=cfg/gym/pretrain/hopper-medium-v2 \
  train.n_iters=10 \
  train.log_interval=5 \
  train.checkpoint_interval=10 \
  "wandb=${WANDB_MODE:-null}"

echo "Dev test completed!"
|
64
slurm/run_dppo_gym.sh
Executable file
64
slurm/run_dppo_gym.sh
Executable file
@ -0,0 +1,64 @@
|
|||||||
|
#!/bin/bash
#SBATCH --job-name=dppo_gym
#SBATCH --account=hk-project-p0022232
#SBATCH --partition=accelerated
#SBATCH --gres=gpu:1
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=40
#SBATCH --time=08:00:00
#SBATCH --mem=32G
#SBATCH --output=logs/dppo_gym_%j.out
#SBATCH --error=logs/dppo_gym_%j.err

# Run a DPPO Gym pre-training or fine-tuning experiment.
#
# Usage: sbatch slurm/run_dppo_gym.sh [task] [mode] [config_name]
#   task         hopper, walker2d, halfcheetah        (default: hopper)
#   mode         pretrain or finetune                  (default: pretrain)
#   config_name  optional config name override; when omitted the mode's
#                default is used (pre_diffusion_mlp / ft_ppo_diffusion_mlp)
#
# Environment overrides:
#   DPPO_DATA_DIR  data directory (default: <submit dir>/data)
#   DPPO_LOG_DIR   log directory  (default: <submit dir>/log)
#
# The logs/ directory used by --output/--error above must already exist in the
# submit directory when the job starts; SLURM does not create it.

# Load required modules
module load devel/cuda/12.4

# Fail the job when any later step fails, so a crashed run is not reported as
# successful just because the final echo returned 0.
set -eo pipefail

# Set environment variables
export WANDB_MODE=online
export WANDB_PROJECT=dppo_gym
# export WANDB_API_KEY=<your_api_key> # TODO: Set your API key
# export WANDB_ENTITY=<your_entity> # TODO: Set your entity (username or team)

# Project root: the directory the job was submitted from, or the current
# directory when the script is run outside of SLURM.
PROJECT_DIR="${SLURM_SUBMIT_DIR:-$PWD}"

# Default paths (can be overridden by environment)
export DPPO_DATA_DIR="${DPPO_DATA_DIR:-$PROJECT_DIR/data}"
export DPPO_LOG_DIR="${DPPO_LOG_DIR:-$PROJECT_DIR/log}"

# Parse command line arguments
TASK=${1:-hopper}     # hopper, walker2d, halfcheetah
MODE=${2:-pretrain}   # pretrain or finetune
CONFIG_TYPE=${3:-}    # optional config name; empty selects the mode default

# Change to project directory
cd "$PROJECT_DIR"

# Activate virtual environment
source .venv/bin/activate

echo "Starting DPPO Gym experiment..."
echo "Job ID: $SLURM_JOB_ID"
echo "Node: $SLURM_NODELIST"
echo "GPU: $CUDA_VISIBLE_DEVICES"
echo "Task: $TASK"
echo "Mode: $MODE"
echo ""

# Select appropriate config based on mode. An explicit third argument takes
# precedence over the mode's default config name.
case "$MODE" in
  pretrain)
    CONFIG_DIR="cfg/gym/pretrain/${TASK}-medium-v2"
    CONFIG_NAME="${CONFIG_TYPE:-pre_diffusion_mlp}"
    ;;
  finetune)
    CONFIG_DIR="cfg/gym/finetune/${TASK}-v2"
    CONFIG_NAME="${CONFIG_TYPE:-ft_ppo_diffusion_mlp}"
    ;;
  *)
    echo "Invalid mode: $MODE. Use 'pretrain' or 'finetune'" >&2
    exit 1
    ;;
esac

# Run experiment.
# NOTE(review): WANDB_MODE is exported as "online" above, so this always passes
# wandb=online; confirm that the run config accepts that value for `wandb`.
python script/run.py \
  --config-name="$CONFIG_NAME" \
  --config-dir="$CONFIG_DIR" \
  "wandb=${WANDB_MODE:-null}"

echo "Experiment completed!"
|
46
submit_job.sh
Executable file
46
submit_job.sh
Executable file
@ -0,0 +1,46 @@
|
|||||||
|
#!/bin/bash

# Submit DPPO jobs to SLURM
# Usage: ./submit_job.sh [mode] [task] [experiment_type]
#   mode             dev or gym                       (default: dev)
#   task             task name passed to the gym job  (default: hopper)
#   experiment_type  pretrain or finetune             (default: pretrain)

set -e

# Always operate from the directory containing this script so the relative
# slurm/ and logs/ paths resolve regardless of where it is invoked from.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"

# Default values
MODE=${1:-dev}                 # dev or gym
TASK=${2:-hopper}              # Task specific to each mode
EXPERIMENT_TYPE=${3:-pretrain} # pretrain or finetune

echo "Submitting DPPO job..."
echo "Mode: $MODE"
echo "Task: $TASK"
echo "Experiment type: $EXPERIMENT_TYPE"

# The job scripts direct stdout/stderr to logs/<name>_%j.{out,err}. SLURM does
# not create that directory, so make sure it exists before submitting.
mkdir -p logs

case "$MODE" in
  dev)
    echo "Submitting development test job..."
    sbatch slurm/run_dppo_dev.sh
    ;;
  gym)
    echo "Submitting Gym job..."
    sbatch slurm/run_dppo_gym.sh "$TASK" "$EXPERIMENT_TYPE"
    ;;
  *)
    # Diagnostics go to stderr so they are not mistaken for normal output.
    {
      echo "Unknown mode: $MODE"
      echo "Supported modes: dev, gym"
      echo ""
      echo "Usage examples:"
      echo " ./submit_job.sh dev # Run quick dev test"
      echo " ./submit_job.sh gym hopper pretrain # Pre-train on Gym Hopper"
      echo " ./submit_job.sh gym walker2d finetune # Fine-tune on Gym Walker2D"
    } >&2
    exit 1
    ;;
esac

echo ""
echo "Job submitted! Check status with:"
echo " squeue -u $USER"
echo ""
echo "Check logs in: logs/ directory"
|
Loading…
Reference in New Issue
Block a user