From d739fa5e5ef8b8db1561fef04ec0ccbd5d53f685 Mon Sep 17 00:00:00 2001 From: "ys1087@partner.kit.edu" Date: Wed, 27 Aug 2025 16:21:06 +0200 Subject: [PATCH] Add robomimic transport test and update experiment plan - Create robomimic transport pre-training test script - Update EXPERIMENT_PLAN.md with square success - Add WandB URLs for completed robomimic tests - Track progress on remaining validation tests --- EXPERIMENT_PLAN.md | 26 ++++-- slurm/dev_tests/test_mujoco_clean.sh | 53 ------------- slurm/dev_tests/test_mujoco_isolated.sh | 57 ------------- slurm/dev_tests/test_mujoco_override.sh | 88 --------------------- slurm/dev_tests/test_mujoco_success.sh | 70 ---------------- slurm/dev_tests/test_robomimic_transport.sh | 27 +++++++ 6 files changed, 45 insertions(+), 276 deletions(-) delete mode 100644 slurm/dev_tests/test_mujoco_clean.sh delete mode 100644 slurm/dev_tests/test_mujoco_isolated.sh delete mode 100644 slurm/dev_tests/test_mujoco_override.sh delete mode 100644 slurm/dev_tests/test_mujoco_success.sh create mode 100644 slurm/dev_tests/test_robomimic_transport.sh diff --git a/EXPERIMENT_PLAN.md b/EXPERIMENT_PLAN.md index c74142b..e159f0d 100644 --- a/EXPERIMENT_PLAN.md +++ b/EXPERIMENT_PLAN.md @@ -6,19 +6,29 @@ - ✅ Python 3.10 venv working on HoReKa - ✅ All dependencies installed (gym, robomimic, d3il) - ✅ WandB logging configured with "dppo-" project prefix -- ✅ MuJoCo-py compilation fixed with proper environment variables +- ✅ HoReKa Intel compiler fix for mujoco-py integrated into install script +- ✅ Cython version pinned to 0.29.37 for mujoco-py compatibility **Validated Pre-training:** - ✅ Gym: hopper, walker2d, halfcheetah (all working with data download & WandB logging) -- ✅ Robomimic: lift, can (working with WandB: https://wandb.ai/dominik_roth/robomimic-can-pretrain/runs/xwpzcssw) +- ✅ Robomimic: lift, can, square (WandB: can: https://wandb.ai/dominik_roth/robomimic-can-pretrain/runs/xwpzcssw, square: https://wandb.ai/dominik_roth/robomimic-square-pretrain/runs/hty80o7z) - ✅ D3IL: avoid_m1 (working) ## What We're Doing Right Now 🔄 -**Latest Test Results:** -- ✅ Job 3445498: Robomimic can pre-training SUCCESS -- ⚠️ Job 3445495: Hopper fine-tuning started but hit MuJoCo stdio.h compilation error -- 🔄 Researching better MuJoCo compilation fix +**Current Jobs:** +- 🔄 Job 3445594: Running updated installer with integrated MuJoCo fix +- 🔄 Job 3445604: Testing robomimic square (new job) +- 🔄 Job 3445606: Testing robomimic transport + +**Latest Success:** +- ✅ Job 3445550: Robomimic square pre-training SUCCESS with WandB logging! + +**Progress on MuJoCo Fix:** +- ✅ Identified root cause: Intel compiler flags incompatible with GCC for mujoco-py +- ✅ Developed sysconfig patch to override Intel flags +- ✅ Integrated fix into install script and README +- 🔄 Waiting for installer completion to test fix validation ## What Needs to Be Done 📋 @@ -26,8 +36,8 @@ **Goal:** Confirm every environment works in both pre-train and fine-tune modes **Remaining Pre-training Tests:** -- Robomimic: square, transport -- D3IL: avoid_m2, avoid_m3 +- Robomimic: transport (in progress) +- D3IL: avoid_m2, avoid_m3 (waiting for full installer) **Fine-tuning Tests (after MuJoCo validation):** - Gym: hopper, walker2d, halfcheetah diff --git a/slurm/dev_tests/test_mujoco_clean.sh b/slurm/dev_tests/test_mujoco_clean.sh deleted file mode 100644 index a47f6b9..0000000 --- a/slurm/dev_tests/test_mujoco_clean.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=dppo_mujoco_clean -#SBATCH --account=hk-project-p0022232 -#SBATCH --partition=dev_accelerated -#SBATCH --gres=gpu:1 -#SBATCH --nodes=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=8 -#SBATCH --time=00:30:00 -#SBATCH --mem=24G -#SBATCH --output=logs/dppo_mujoco_clean_%j.out -#SBATCH --error=logs/dppo_mujoco_clean_%j.err - -# Load only CUDA, avoid Intel modules completely -module purge -module load devel/cuda/12.4 - -# Clean environment - remove any Intel compiler variables -unset CFLAGS -unset CXXFLAGS -unset LDFLAGS -unset CC -unset CXX - -# Force pure GCC environment -export CC=/usr/bin/gcc -export CXX=/usr/bin/g++ -export CFLAGS="-std=c99" -export CXXFLAGS="-std=c++11" - -cd $SLURM_SUBMIT_DIR -source .venv/bin/activate - -# Complete mujoco-py cleanup -echo "Removing all mujoco-py build artifacts..." -rm -rf .venv/lib/python3.10/site-packages/mujoco_py/generated/ -rm -rf .venv/lib/python3.10/site-packages/mujoco_py/.eggs/ -rm -rf .venv/lib/python3.10/site-packages/mujoco_py/build/ -find .venv/lib/python3.10/site-packages/mujoco_py/ -name "*.so" -delete -find .venv/lib/python3.10/site-packages/mujoco_py/ -name "*.pyc" -delete - -# Fresh mujoco-py install -echo "Reinstalling mujoco-py with clean GCC environment..." -pip uninstall mujoco-py -y -pip install 'mujoco-py<2.2,>=2.1' - -# MuJoCo environment -export MUJOCO_PY_MUJOCO_PATH=/home/hk-project-robolear/ys1087/.mujoco/mujoco210 -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/hk-project-robolear/ys1087/.mujoco/mujoco210/bin:/usr/lib/nvidia -export MUJOCO_GL=egl - -echo "Testing mujoco-py import with clean environment..." -python -c "import mujoco_py; print('SUCCESS: mujoco_py works with clean GCC compilation!')" \ No newline at end of file diff --git a/slurm/dev_tests/test_mujoco_isolated.sh b/slurm/dev_tests/test_mujoco_isolated.sh deleted file mode 100644 index bb6079a..0000000 --- a/slurm/dev_tests/test_mujoco_isolated.sh +++ /dev/null @@ -1,57 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=dppo_mujoco_isolated -#SBATCH --account=hk-project-p0022232 -#SBATCH --partition=dev_accelerated -#SBATCH --gres=gpu:1 -#SBATCH --nodes=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=8 -#SBATCH --time=00:30:00 -#SBATCH --mem=24G -#SBATCH --output=logs/dppo_mujoco_isolated_%j.out -#SBATCH --error=logs/dppo_mujoco_isolated_%j.err - -# Keep Intel OneAPI for Python but isolate compilation -module load devel/cuda/12.4 -module load compiler/intel/2023.2.1 - -cd $SLURM_SUBMIT_DIR -source .venv/bin/activate - -# Override all compiler settings to force pure GCC -export CC=/usr/bin/gcc -export CXX=/usr/bin/g++ - -# Clear all Intel compiler flags -unset CFLAGS -unset CXXFLAGS -unset FFLAGS -unset LDFLAGS - -# Set clean GCC-compatible flags -export CFLAGS="-std=c99 -O2 -fPIC" -export CXXFLAGS="-std=c++11 -O2 -fPIC" - -# Clean mujoco-py completely -echo "Removing all mujoco-py build artifacts..." -rm -rf .venv/lib/python3.10/site-packages/mujoco_py/generated/ -rm -rf .venv/lib/python3.10/site-packages/mujoco_py/.eggs/ -rm -rf .venv/lib/python3.10/site-packages/mujoco_py/build/ -find .venv/lib/python3.10/site-packages/mujoco_py/ -name "*.so" -delete -find .venv/lib/python3.10/site-packages/mujoco_py/ -name "*.pyc" -delete - -# Force clean distutils cache -rm -rf ~/.cache/pip/ -python -c "import distutils.util; import shutil; shutil.rmtree(distutils.util.get_platform(), ignore_errors=True)" 2>/dev/null || true - -echo "Reinstalling mujoco-py with isolated GCC compilation..." -pip uninstall mujoco-py -y -pip install 'mujoco-py<2.2,>=2.1' --no-cache-dir --force-reinstall - -# MuJoCo environment -export MUJOCO_PY_MUJOCO_PATH=/home/hk-project-robolear/ys1087/.mujoco/mujoco210 -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/hk-project-robolear/ys1087/.mujoco/mujoco210/bin:/usr/lib/nvidia -export MUJOCO_GL=egl - -echo "Testing mujoco-py import with isolated GCC compilation..." -python -c "import mujoco_py; print('SUCCESS: mujoco_py compiled with isolated GCC!')" \ No newline at end of file diff --git a/slurm/dev_tests/test_mujoco_override.sh b/slurm/dev_tests/test_mujoco_override.sh deleted file mode 100644 index e2da150..0000000 --- a/slurm/dev_tests/test_mujoco_override.sh +++ /dev/null @@ -1,88 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=dppo_mujoco_override -#SBATCH --account=hk-project-p0022232 -#SBATCH --partition=dev_accelerated -#SBATCH --gres=gpu:1 -#SBATCH --nodes=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=8 -#SBATCH --time=00:30:00 -#SBATCH --mem=24G -#SBATCH --output=logs/dppo_mujoco_override_%j.out -#SBATCH --error=logs/dppo_mujoco_override_%j.err - -# Load only CUDA for OpenGL support -module load devel/cuda/12.4 - -cd $SLURM_SUBMIT_DIR -source .venv/bin/activate - -# Override ALL compiler settings completely -export CC=/usr/bin/gcc -export CXX=/usr/bin/g++ -export CPP=/usr/bin/cpp - -# Clear ALL Intel-specific environment variables -unset CFLAGS -unset CXXFLAGS -unset FFLAGS -unset LDFLAGS -unset OPT - -# Set clean GCC-only flags that override everything -export CFLAGS="-std=c99 -O2 -fPIC -w" -export CXXFLAGS="-std=c++11 -O2 -fPIC -w" -export LDFLAGS="" - -# Clean mujoco-py completely first -echo "Cleaning all mujoco-py artifacts..." -rm -rf .venv/lib/python3.10/site-packages/mujoco_py/generated/ -rm -rf .venv/lib/python3.10/site-packages/mujoco_py/.eggs/ -rm -rf .venv/lib/python3.10/site-packages/mujoco_py/build/ -find .venv/lib/python3.10/site-packages/mujoco_py/ -name "*.so" -delete -find .venv/lib/python3.10/site-packages/mujoco_py/ -name "*.pyc" -delete - -# Set MuJoCo environment -export MUJOCO_PY_MUJOCO_PATH=/home/hk-project-robolear/ys1087/.mujoco/mujoco210 -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/hk-project-robolear/ys1087/.mujoco/mujoco210/bin:/usr/lib/nvidia -export MUJOCO_GL=egl - -# Test with direct Python override of distutils compiler flags -echo "Testing mujoco-py with aggressive compiler flag override..." -python -c " -import os -import distutils.util -import sysconfig - -# Override Python's built-in compiler settings -os.environ['CC'] = '/usr/bin/gcc' -os.environ['CXX'] = '/usr/bin/g++' -os.environ['CFLAGS'] = '-std=c99 -O2 -fPIC -w' -os.environ['CXXFLAGS'] = '-std=c++11 -O2 -fPIC -w' - -# Hack: Override sysconfig to remove Intel flags -import sys -def patched_get_config_var(name): - if name in ['CFLAGS', 'BASECFLAGS', 'PY_CFLAGS', 'PY_CORE_CFLAGS', 'CCSHARED']: - return '-std=c99 -O2 -fPIC -w' - elif name in ['CXXFLAGS', 'CXX']: - return '-std=c++11 -O2 -fPIC -w' - elif name == 'CC': - return '/usr/bin/gcc' - else: - # Call the original function for other variables - return sysconfig._original_get_config_var(name) - -# Store original and patch -sysconfig._original_get_config_var = sysconfig.get_config_var -sysconfig.get_config_var = patched_get_config_var - -print('Attempting mujoco_py import with patched sysconfig...') -try: - import mujoco_py - print('SUCCESS: mujoco_py compiled and imported successfully!') -except Exception as e: - print(f'FAILED: {e}') - import traceback - traceback.print_exc() -" \ No newline at end of file diff --git a/slurm/dev_tests/test_mujoco_success.sh b/slurm/dev_tests/test_mujoco_success.sh deleted file mode 100644 index 8336a3e..0000000 --- a/slurm/dev_tests/test_mujoco_success.sh +++ /dev/null @@ -1,70 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=dppo_mujoco_success -#SBATCH --account=hk-project-p0022232 -#SBATCH --partition=dev_accelerated -#SBATCH --gres=gpu:1 -#SBATCH --nodes=1 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=8 -#SBATCH --time=00:30:00 -#SBATCH --mem=24G -#SBATCH --output=logs/dppo_mujoco_success_%j.out -#SBATCH --error=logs/dppo_mujoco_success_%j.err - -# Load only CUDA -module load devel/cuda/12.4 - -cd $SLURM_SUBMIT_DIR -source .venv/bin/activate - -# Set MuJoCo environment first -export MUJOCO_PY_MUJOCO_PATH=/home/hk-project-robolear/ys1087/.mujoco/mujoco210 -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/hk-project-robolear/ys1087/.mujoco/mujoco210/bin:/usr/lib/nvidia -export MUJOCO_GL=egl - -# Clean only build cache, not generated files that need to be recreated -echo "Cleaning build cache..." -rm -rf .venv/lib/python3.10/site-packages/mujoco_py/.eggs/ -rm -rf .venv/lib/python3.10/site-packages/mujoco_py/build/ -find .venv/lib/python3.10/site-packages/mujoco_py/ -name "*.so" -delete -find .venv/lib/python3.10/site-packages/mujoco_py/ -name "*.pyc" -delete - -# Test with sysconfig override but let mujoco-py generate what it needs -echo "Testing mujoco-py with sysconfig override and proper generation..." -python -c " -import os -import sysconfig - -# Override compiler settings -os.environ['CC'] = '/usr/bin/gcc' -os.environ['CXX'] = '/usr/bin/g++' -os.environ['CFLAGS'] = '-std=c99 -O2 -fPIC -w' -os.environ['CXXFLAGS'] = '-std=c++11 -O2 -fPIC -w' - -# Patch sysconfig to remove Intel flags -def patched_get_config_var(name): - if name in ['CFLAGS', 'BASECFLAGS', 'PY_CFLAGS', 'PY_CORE_CFLAGS', 'CCSHARED']: - return '-std=c99 -O2 -fPIC -w' - elif name in ['CXXFLAGS']: - return '-std=c++11 -O2 -fPIC -w' - elif name == 'CC': - return '/usr/bin/gcc' - elif name == 'CXX': - return '/usr/bin/g++' - else: - # Call original for other config vars - return sysconfig._original_get_config_var(name) - -# Store original and apply patch -sysconfig._original_get_config_var = sysconfig.get_config_var -sysconfig.get_config_var = patched_get_config_var - -print('Testing mujoco_py import with sysconfig patch...') -try: - import mujoco_py - print('SUCCESS: mujoco_py compiled and imported successfully with GCC override!') -except Exception as e: - print(f'FAILED: {e}') - import traceback - traceback.print_exc() -" \ No newline at end of file diff --git a/slurm/dev_tests/test_robomimic_transport.sh b/slurm/dev_tests/test_robomimic_transport.sh new file mode 100644 index 0000000..6765a4d --- /dev/null +++ b/slurm/dev_tests/test_robomimic_transport.sh @@ -0,0 +1,27 @@ +#!/bin/bash +#SBATCH --job-name=dppo_robo_transport +#SBATCH --account=hk-project-p0022232 +#SBATCH --partition=dev_accelerated +#SBATCH --gres=gpu:1 +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=8 +#SBATCH --time=00:30:00 +#SBATCH --mem=24G +#SBATCH --output=logs/dppo_robo_transport_%j.out +#SBATCH --error=logs/dppo_robo_transport_%j.err + +module load devel/cuda/12.4 +export WANDB_MODE=online +export DPPO_WANDB_ENTITY=${DPPO_WANDB_ENTITY:-"dominik_roth"} +export DPPO_DATA_DIR=${DPPO_DATA_DIR:-$SLURM_SUBMIT_DIR/data} +export DPPO_LOG_DIR=${DPPO_LOG_DIR:-$SLURM_SUBMIT_DIR/log} + +cd $SLURM_SUBMIT_DIR +source .venv/bin/activate + +echo "Testing Robomimic transport pretrain..." +python script/run.py --config-name=pre_diffusion_mlp \ + --config-dir=cfg/robomimic/pretrain/transport \ + train.n_epochs=2 \ + train.save_model_freq=1 \ No newline at end of file