- Add installation script for HoReKa with Python 3.10 venv
- Add SLURM job submission scripts for dev and production runs
- Add convenient submit_job.sh wrapper for easy job submission
- Update .gitignore to allow shell scripts (removed *.sh exclusion)
- Configure git remotes: upstream (original) and origin (fork)
52 lines
1.6 KiB
Bash
Executable File
52 lines
1.6 KiB
Bash
Executable File
#!/bin/bash
#
# SLURM batch script: short DPPO smoke test (10 pre-training iterations of the
# pre_diffusion_mlp config for hopper-medium-v2) on one GPU.
#
# Usage:
#   sbatch <this script>      # submit from the project root
#
# Environment (all optional):
#   DPPO_DATA_DIR   data directory; defaults to <project dir>/data
#   DPPO_LOG_DIR    log directory;  defaults to <project dir>/log
#   WANDB_API_KEY / WANDB_ENTITY   see the commented exports below
#
# NOTE(review): the logs/ directory named in --output/--error presumably has
# to exist in the submit directory before submission - confirm against the
# sbatch documentation for this cluster.
#
#SBATCH --job-name=dppo_dev_test
#SBATCH --account=hk-project-p0022232
#SBATCH --partition=dev_accelerated
#SBATCH --gres=gpu:1
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=8
#SBATCH --time=00:30:00
#SBATCH --mem=24G
#SBATCH --output=logs/dppo_dev_%j.out
#SBATCH --error=logs/dppo_dev_%j.err

# Print an error message to stderr and abort the job with a non-zero status.
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

# Load required modules.
# A failed load is reported but not fatal, matching the original best-effort
# behaviour.
module load devel/cuda/12.4 \
  || printf 'WARNING: could not load module devel/cuda/12.4\n' >&2

# Set environment variables
export WANDB_MODE=online
export WANDB_PROJECT=dppo_dev_test
# export WANDB_API_KEY=<your_api_key> # TODO: Set your API key
# export WANDB_ENTITY=<your_entity> # TODO: Set your entity (username or team)

# Project directory: where sbatch was invoked. Fall back to the current
# directory when SLURM_SUBMIT_DIR is unset (script run outside SLURM), so that
# an empty value cannot turn the cd below into "cd $HOME" or make the default
# data/log paths resolve to /data and /log.
project_dir=${SLURM_SUBMIT_DIR:-$PWD}

# Default paths (can be overridden by environment)
export DPPO_DATA_DIR="${DPPO_DATA_DIR:-${project_dir}/data}"
export DPPO_LOG_DIR="${DPPO_LOG_DIR:-${project_dir}/log}"

# Change to project directory
cd -- "${project_dir}" || die "cannot change to project directory: ${project_dir}"

# Activate virtual environment; without it the commands below would silently
# run with whatever python happens to be on PATH.
[[ -f .venv/bin/activate ]] \
  || die "virtual environment not found: ${project_dir}/.venv"
# shellcheck disable=SC1091
source .venv/bin/activate || die "failed to activate .venv"

# Run quick test with Gym Hopper (faster than other environments)
echo "Starting DPPO dev test..."
echo "Job ID: $SLURM_JOB_ID"
echo "Node: $SLURM_NODELIST"
echo "GPU: $CUDA_VISIBLE_DEVICES"
echo ""
echo "Python version: $(python --version)"
echo "PyTorch version: $(python -c 'import torch; print(torch.__version__)')"
echo "CUDA available: $(python -c 'import torch; print(torch.cuda.is_available())')"
echo ""

# Run a quick pre-training test with reduced steps.
# NOTE(review): WANDB_MODE is exported as "online" above, so the ":-null"
# fallback never applies and the override is always "wandb=online" - confirm
# that the config accepts this value for the "wandb" key.
run_status=0
python script/run.py --config-name=pre_diffusion_mlp \
  --config-dir=cfg/gym/pretrain/hopper-medium-v2 \
  train.n_iters=10 \
  train.log_interval=5 \
  train.checkpoint_interval=10 \
  "wandb=${WANDB_MODE:-null}" || run_status=$?

# Propagate a failed run so that SLURM records the job as failed instead of
# reporting success unconditionally.
if (( run_status != 0 )); then
  printf 'Dev test failed with exit code %d\n' "${run_status}" >&2
  exit "${run_status}"
fi

echo "Dev test completed!"