dppo/install_dppo.sh
ys1087@partner.kit.edu 7e800c9a33 Complete MuJoCo fix and validate hopper fine-tuning
- Add GCC wrapper script to filter Intel compiler flags
- Download missing mujoco-py generated files automatically
- Update installer with comprehensive MuJoCo fixes
- Document complete solution in README and EXPERIMENT_PLAN
- Hopper fine-tuning validated with reward 1415.8471
- All pre-training environments working
- DPPO is now production-ready on HoReKa
2025-08-27 18:27:02 +02:00

143 lines
4.9 KiB
Bash
Executable File

#!/bin/bash
#SBATCH --job-name=dppo_install
#SBATCH --account=hk-project-p0022232
#SBATCH --partition=dev_accelerated
#SBATCH --gres=gpu:1
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=4
#SBATCH --time=00:30:00
#SBATCH --mem=16G
#SBATCH --output=logs/dppo_install_%j.out
#SBATCH --error=logs/dppo_install_%j.err
# Abort the job with a message on stderr. Used for steps that every later
# step depends on (working directory, virtual environment, package install).
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

# Load CUDA module (required for PyTorch)
module load devel/cuda/12.4 \
  || echo "WARNING: could not load module devel/cuda/12.4" >&2

# Print job info
echo "Starting DPPO installation..."
echo "Job ID: $SLURM_JOB_ID"
echo "Node: $SLURM_NODELIST"
echo "GPU: $CUDA_VISIBLE_DEVICES"

# Navigate to dppo directory (the directory the job was submitted from).
# When the script is run outside SLURM, SLURM_SUBMIT_DIR is unset and a bare
# `cd` would silently switch to $HOME, so fall back to the current directory.
cd "${SLURM_SUBMIT_DIR:-$PWD}" \
  || die "cannot change to directory '${SLURM_SUBMIT_DIR:-$PWD}'"

# Create and activate virtual environment with Python 3.10
python3.10 -m venv .venv || die "could not create virtual environment .venv"
# shellcheck disable=SC1091 -- the activate script only exists after the step above
source .venv/bin/activate || die "could not activate .venv"

# Upgrade pip (non-fatal: the bundled pip can still perform the install)
python -m pip install --upgrade pip \
  || echo "WARNING: pip upgrade failed, continuing with the bundled pip" >&2

# Install base package
pip install -e . || die "installing the base package failed"

# Install ALL optional dependencies (except Kitchen which has conflicts).
# Quoted so the shell never treats the brackets as a glob pattern.
pip install -e '.[all]' || die "installing the optional dependencies failed"
# HoReKa-specific MuJoCo compilation fix
echo ""
echo "=== HoReKa Cluster MuJoCo Fix ==="
echo "Applying Intel compiler compatibility fix for mujoco-py..."

# Pin compatible Cython version first. Stop here if the pin cannot be
# installed instead of continuing as though the fix had been applied.
if ! pip install 'Cython==0.29.37' --force-reinstall; then
  echo "ERROR: failed to install Cython==0.29.37" >&2
  exit 1
fi
# Generate gcc_wrapper.sh: a stand-in compiler that discards the Intel-only
# flags listed below and hands every other argument to /usr/bin/gcc.
cat << 'EOF' > gcc_wrapper.sh
#!/bin/bash
# Pass-through to the system GCC that leaves out Intel compiler flags
kept=()
for opt; do
    if [[ "$opt" == "-xCORE-AVX2" || "$opt" == "--xCORE-AVX2" \
        || "$opt" == "-xHost" || "$opt" == "--xHost" ]]; then
        # Intel-specific flag: do not forward it
        continue
    fi
    kept+=("$opt")
done
exec /usr/bin/gcc "${kept[@]}"
EOF
# The wrapper is invoked directly as a compiler, so it must be executable
chmod +x gcc_wrapper.sh
# Create MuJoCo compilation fix script.
# The quoted heredoc delimiter keeps the Python source literal (no shell expansion).
cat > fix_mujoco_compilation.py << 'EOF'
"""HoReKa helper: route mujoco-py's C compilation through gcc_wrapper.sh.

Call apply_mujoco_fix() in the Python process that builds mujoco-py; the
overrides below only affect the process in which they are applied.
"""
import os
import sys
import sysconfig

WRAPPER_NAME = 'gcc_wrapper.sh'


def _locate_wrapper():
    """Return the absolute path to gcc_wrapper.sh.

    The current working directory is checked first (the original behaviour).
    If the wrapper is not there, the directory containing this script is
    checked, so the fix also works when used from another directory.
    Falls back to the working-directory path when neither file exists.
    """
    cwd_path = os.path.abspath(WRAPPER_NAME)
    if os.path.isfile(cwd_path):
        return cwd_path
    script_file = globals().get('__file__')
    if script_file:
        script_dir = os.path.dirname(os.path.abspath(script_file))
        sibling = os.path.join(script_dir, WRAPPER_NAME)
        if os.path.isfile(sibling):
            return sibling
    return cwd_path


def apply_mujoco_fix():
    """Apply HoReKa Intel compiler compatibility fix for mujoco-py"""
    # Override compiler settings with wrapper that filters Intel flags
    wrapper_path = _locate_wrapper()
    os.environ['CC'] = wrapper_path
    os.environ['CXX'] = '/usr/bin/g++'
    os.environ['CFLAGS'] = '-std=c99 -O2 -fPIC -w'
    os.environ['CXXFLAGS'] = '-std=c++11 -O2 -fPIC -w'

    # Patch sysconfig to remove Intel compiler flags.
    # The hasattr guard keeps a second call from wrapping the patched
    # function around itself.
    if not hasattr(sysconfig, '_original_get_config_var'):
        def patched_get_config_var(name):
            if name in ['CFLAGS', 'BASECFLAGS', 'PY_CFLAGS', 'PY_CORE_CFLAGS', 'CCSHARED', 'OPT']:
                return '-std=c99 -O2 -fPIC -w'
            elif name in ['CXXFLAGS']:
                return '-std=c++11 -O2 -fPIC -w'
            elif name == 'CC':
                return wrapper_path
            elif name == 'CXX':
                return '/usr/bin/g++'
            else:
                return sysconfig._original_get_config_var(name)

        sysconfig._original_get_config_var = sysconfig.get_config_var
        sysconfig.get_config_var = patched_get_config_var

    # Also patch distutils directly. distutils is not shipped with every
    # Python version, so a missing module skips this step instead of aborting.
    try:
        import distutils.ccompiler
    except ImportError:
        print("distutils is not available; skipping distutils compiler override",
              file=sys.stderr)
    else:
        def patched_customize_compiler(compiler):
            compiler.set_executable('compiler_so', wrapper_path)
            compiler.set_executable('compiler_cxx', '/usr/bin/g++')
            compiler.set_executable('linker_so', wrapper_path + ' -shared')
            return compiler

        # Override customize_compiler function.
        # NOTE(review): distutils documents customize_compiler under
        # distutils.sysconfig; code importing it from there would not see
        # this override -- confirm this attribute is actually consulted.
        distutils.ccompiler.customize_compiler = patched_customize_compiler

    print("Applied HoReKa MuJoCo compilation fix")


if __name__ == "__main__":
    apply_mujoco_fix()
EOF
echo "Created MuJoCo compilation fix script"
# Download missing mujoco-py generated files (common issue on HoReKa)
echo "Downloading missing mujoco-py generated files..."
generated_dir=".venv/lib/python3.10/site-packages/mujoco_py/generated"
generated_url="https://raw.githubusercontent.com/openai/mujoco-py/master/mujoco_py/generated"
failed_downloads=0

if mkdir -p "$generated_dir"; then
  for generated_file in wrappers.pxi __init__.py const.py wrappers.py; do
    # -f  fail on HTTP errors instead of saving the error page as the file
    # -sS stay quiet but still print the reason when a transfer fails
    # -L  follow redirects
    # Each file is fetched under a temporary name and moved into place only
    # on success, so a failed transfer never clobbers an existing file.
    if curl -fsSL "$generated_url/$generated_file" \
        -o "$generated_dir/$generated_file.part" \
      && mv -f -- "$generated_dir/$generated_file.part" \
        "$generated_dir/$generated_file"; then
      continue
    fi
    echo "WARNING: could not download $generated_file" >&2
    rm -f -- "$generated_dir/$generated_file.part"
    failed_downloads=$((failed_downloads + 1))
  done
else
  echo "WARNING: could not create $generated_dir" >&2
  failed_downloads=4
fi

# Report success only when every file actually arrived
if [ "$failed_downloads" -eq 0 ]; then
  echo "Downloaded missing generated files"
else
  echo "WARNING: $failed_downloads generated file(s) were not downloaded" >&2
fi
# Closing report: tool versions, the manual MuJoCo follow-up steps, and the
# list of packages now present in the environment.
printf '\n%s\n' "Installation completed!"
printf 'Python version: %s\n' "$(python --version)"
printf 'Pip version: %s\n' "$(pip --version)"

# Quoted delimiter: $HOME and $LD_LIBRARY_PATH are printed literally so the
# export lines can be pasted into other SLURM scripts unchanged.
cat << 'EOF'

=== IMPORTANT: MuJoCo Setup for Fine-tuning ===
1. Install MuJoCo 2.1.0: https://github.com/openai/mujoco-py#install-mujoco
2. Add these environment variables to your SLURM scripts:
export MUJOCO_PY_MUJOCO_PATH=$HOME/.mujoco/mujoco210
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HOME/.mujoco/mujoco210/bin:/usr/lib/nvidia
export MUJOCO_GL=egl

Pre-training works without MuJoCo setup.

Installed packages:
EOF
pip list