Skip to content

Commit

Permalink
Fix unit tests environment variables (espressomd#4607)
Browse files Browse the repository at this point in the history
Description of changes:
- properly set unit test environment variables related to sanitizers
- address Open MPI version 4.x singleton mode bug on NUMA architectures
   - disable MCA binding policy on unit tests running in singleton mode on affected NUMA architectures
   - print a warning in pypresso when running in singleton mode on affected NUMA architectures (can be disabled with a CMake option)
  • Loading branch information
kodiakhq[bot] authored and jngrad committed Nov 25, 2022
1 parent 203b050 commit ac2332e
Show file tree
Hide file tree
Showing 6 changed files with 69 additions and 14 deletions.
41 changes: 36 additions & 5 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,10 @@ if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND NOT APPLE)
"Build with memory sanitizer (experimental; requires a memory-sanitized Python interpreter)"
OFF)
endif()
# Opt-out switch for the Open MPI 4.x singleton-mode hint: the pypresso script
# template only prints its warning when this option is ON (it defaults to ON).
option(
ESPRESSO_ADD_OMPI_SINGLETON_WARNING
"Add a runtime warning in the pypresso script for NUMA architectures that aren't supported in singleton mode by Open MPI 4.x"
ON)
# General build toggles; each one defaults to OFF and must be enabled explicitly.
option(WARNINGS_ARE_ERRORS "Treat warnings as errors during compilation" OFF)
option(WITH_CCACHE "Use ccache compiler invocation." OFF)
option(WITH_PROFILER "Enable profiler annotations." OFF)
Expand Down Expand Up @@ -280,11 +284,38 @@ find_package(MPI 3.0 REQUIRED)
find_package(MpiexecBackend)

# OpenMPI checks the number of processes against the number of CPUs
if("${MPIEXEC_BACKEND_NAME}" STREQUAL "OpenMPI" AND "${MPIEXEC_BACKEND_VERSION}"
VERSION_GREATER_EQUAL 2.0.0)
set(MPIEXEC_OVERSUBSCRIBE "-oversubscribe")
else()
set(MPIEXEC_OVERSUBSCRIBE "")
# Default: no oversubscription flag; it is only added for Open MPI >= 2.0.0.
set(MPIEXEC_OVERSUBSCRIBE "")
# Open MPI 4.x has a bug on NUMA archs that prevents running in singleton mode.
# The guard below is switched ON when the detected CPU model name starts with
# the pattern describing the affected architectures.
set(ESPRESSO_MPIEXEC_GUARD_SINGLETON_NUMA OFF)
set(ESPRESSO_CPU_MODEL_NAME_OMPI_SINGLETON_NUMA_PATTERN "AMD (EPYC|Ryzen)")

if("${MPIEXEC_BACKEND_NAME}" STREQUAL "OpenMPI")
  if("${MPIEXEC_BACKEND_VERSION}" VERSION_GREATER_EQUAL 2.0.0)
    set(MPIEXEC_OVERSUBSCRIBE "-oversubscribe")
  endif()
  if("${MPIEXEC_BACKEND_VERSION}" VERSION_GREATER_EQUAL 4.0
     AND "${MPIEXEC_BACKEND_VERSION}" VERSION_LESS 5.0)
    # The CPU model name is detected only once and then stored in the cache;
    # an already defined ESPRESSO_CPU_MODEL_NAME skips the detection.
    if(NOT DEFINED ESPRESSO_CPU_MODEL_NAME)
      if("${CMAKE_SYSTEM_NAME}" STREQUAL "Linux")
        if(EXISTS /proc/cpuinfo)
          file(READ /proc/cpuinfo ESPRESSO_CPU_INFO)
          # Capture the value of a "model name" line. The leading greedy ".*"
          # selects the last such line in the file. When no line matches, fall
          # back to a sentinel instead of caching the whole file content
          # (string(REGEX REPLACE) returns its input unchanged on no match).
          if(ESPRESSO_CPU_INFO MATCHES
             ".*\n[Mm]odel name[ \t]*:[ \t]+([^\n]+)")
            set(ESPRESSO_CPU_MODEL_NAME_STRING "${CMAKE_MATCH_1}")
          else()
            set(ESPRESSO_CPU_MODEL_NAME_STRING "__unknown")
          endif()
        else()
          set(ESPRESSO_CPU_MODEL_NAME_STRING "__unreadable")
        endif()
      else()
        set(ESPRESSO_CPU_MODEL_NAME_STRING "__unaffected")
      endif()
      set(ESPRESSO_CPU_MODEL_NAME "${ESPRESSO_CPU_MODEL_NAME_STRING}"
          CACHE INTERNAL "")
    endif()
    # The sentinel values never match the pattern, so the guard stays OFF
    # whenever the CPU model could not be determined.
    if(ESPRESSO_CPU_MODEL_NAME MATCHES
       "^${ESPRESSO_CPU_MODEL_NAME_OMPI_SINGLETON_NUMA_PATTERN}")
      set(ESPRESSO_MPIEXEC_GUARD_SINGLETON_NUMA ON)
    endif()
  endif()
endif()

# OpenMPI cannot run two jobs in parallel in a Docker container, because the
Expand Down
12 changes: 7 additions & 5 deletions cmake/unit_test.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,14 @@ function(UNIT_TEST)
else()
set(SANITIZERS_HALT_ON_ERROR "halt_on_error=0")
endif()
set(UBSAN_OPTIONS "UBSAN_OPTIONS=suppressions=${CMAKE_SOURCE_DIR}/maintainer/CI/ubsan.supp:${SANITIZERS_HALT_ON_ERROR}:print_stacktrace=1")
set(ASAN_OPTIONS "ASAN_OPTIONS=${SANITIZERS_HALT_ON_ERROR}:detect_leaks=0:allocator_may_return_null=1")
set(MSAN_OPTIONS "MSAN_OPTIONS=${SANITIZERS_HALT_ON_ERROR}")
list(APPEND TEST_ENV_VARIABLES "UBSAN_OPTIONS=suppressions=${CMAKE_SOURCE_DIR}/maintainer/CI/ubsan.supp:${SANITIZERS_HALT_ON_ERROR}:print_stacktrace=1")
list(APPEND TEST_ENV_VARIABLES "ASAN_OPTIONS=${SANITIZERS_HALT_ON_ERROR}:detect_leaks=0:allocator_may_return_null=1")
list(APPEND TEST_ENV_VARIABLES "MSAN_OPTIONS=${SANITIZERS_HALT_ON_ERROR}")
if(NOT TEST_NUM_PROC AND ESPRESSO_MPIEXEC_GUARD_SINGLETON_NUMA AND "${TEST_DEPENDS}" MATCHES "(^|;)([Bb]oost::mpi|MPI::MPI_CXX)($|;)")
list(APPEND TEST_ENV_VARIABLES "OMPI_MCA_hwloc_base_binding_policy=none")
endif()
set_tests_properties(
${TEST_NAME} PROPERTIES ENVIRONMENT
"${UBSAN_OPTIONS} ${ASAN_OPTIONS} ${MSAN_OPTIONS}")
${TEST_NAME} PROPERTIES ENVIRONMENT "${TEST_ENV_VARIABLES}")

add_dependencies(check_unit_tests ${TEST_NAME})
endfunction(UNIT_TEST)
14 changes: 14 additions & 0 deletions doc/sphinx/installation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,15 @@ are required to be able to compile and use |es|:
Other MPI implementations like Intel MPI should also work, although
they are not actively tested in |es| continuous integration.

Open MPI version 4.x is known to not properly support the MCA binding
policy "numa" in singleton mode on a few NUMA architectures.
On affected systems, e.g. AMD Ryzen or AMD EPYC, Open MPI halts with
a fatal error when setting the processor affinity in ``MPI_Init``.
This issue can be resolved by setting the environment variable
``OMPI_MCA_hwloc_base_binding_policy`` to a value other than "numa",
such as "l3cache" to bind to a NUMA shared memory block, or to
"none" to disable binding (can cause performance loss).

Python
|es|'s main user interface relies on Python 3.

Expand Down Expand Up @@ -720,6 +729,11 @@ The following options are available:
* ``WITH_VALGRIND_INSTRUMENTATION``: Build with valgrind instrumentation
markers

* ``ESPRESSO_ADD_OMPI_SINGLETON_WARNING``: Add a runtime warning in the
pypresso and ipypresso scripts that is triggered in singleton mode
with Open MPI version 4.x on unsupported NUMA environments
(see :term:`MPI installation requirements <MPI>` for details).

When the value in the :file:`CMakeLists.txt` file is set to ON, the
corresponding option is created; if the value of the option is set to OFF,
the corresponding option is not created. These options can also be modified
Expand Down
3 changes: 1 addition & 2 deletions doc/sphinx/running.rst
Original file line number Diff line number Diff line change
Expand Up @@ -245,8 +245,7 @@ Parallel computing

Many algorithms in |es| are designed to work with multiple MPI ranks.
However, not all algorithms benefit from MPI parallelization equally.
Several algorithms only use MPI rank 0 (e.g. :ref:`Reaction methods`), while
a small subset simply don't support MPI (e.g. :ref:`Dipolar direct sum`).
Several algorithms only use MPI rank 0 (e.g. :ref:`Reaction methods`).
|es| should work with most MPI implementations on the market;
see the :term:`MPI installation requirements <MPI>` for details.

Expand Down
9 changes: 9 additions & 0 deletions src/python/pypresso.cmakein
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,15 @@ else
fi
export PYTHONPATH

# Open MPI 4.x cannot run in singleton mode on some NUMA systems.
# Print a hint only when all of the following hold, checked in this order:
# the warning option and the NUMA guard were both ON at configure time,
# OMPI_COMM_WORLD_SIZE is unset or empty (presumably: not started via mpiexec),
# the binding policy is explicitly "numa", and /proc/cpuinfo exists and lists
# a CPU model matching the configured pattern.
if [ "@ESPRESSO_ADD_OMPI_SINGLETON_WARNING@" = "ON" ] \
  && [ "@ESPRESSO_MPIEXEC_GUARD_SINGLETON_NUMA@" = "ON" ] \
  && [ -z "${OMPI_COMM_WORLD_SIZE}" ] \
  && [ "${OMPI_MCA_hwloc_base_binding_policy}" = "numa" ] \
  && [ -f /proc/cpuinfo ] \
  && grep -q -P "^[Mm]odel name[ \t]*:[ \t]+@ESPRESSO_CPU_MODEL_NAME_OMPI_SINGLETON_NUMA_PATTERN@( |$)" /proc/cpuinfo; then
  echo "warning: if Open MPI fails to set processor affinity, set environment variable OMPI_MCA_hwloc_base_binding_policy to \"none\" or \"l3cache\""
fi

if [ "@CMAKE_CXX_COMPILER_ID@" != "GNU" ] && [ "@WITH_ASAN@" = "ON" ]; then
asan_lib=$("@CMAKE_CXX_COMPILER@" /dev/null -### -o /dev/null -fsanitize=address 2>&1 | grep -o '[" ][^" ]*libclang_rt.asan[^" ]*[^s][" ]' | sed 's/[" ]//g' | sed 's/\.a$/.so/g')
export DYLD_INSERT_LIBRARIES="$asan_lib"
Expand Down
4 changes: 2 additions & 2 deletions src/script_interface/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,9 @@ unit_test(NAME ParallelExceptionHandler_test SRC
unit_test(NAME packed_variant_test SRC packed_variant_test.cpp DEPENDS
Espresso::script_interface)
unit_test(NAME ObjectList_test SRC ObjectList_test.cpp DEPENDS
Espresso::script_interface Espresso::core)
Espresso::script_interface Espresso::core Boost::mpi)
unit_test(NAME ObjectMap_test SRC ObjectMap_test.cpp DEPENDS
Espresso::script_interface Espresso::core)
Espresso::script_interface Espresso::core Boost::mpi)
unit_test(NAME serialization_mpi_guard_test SRC
serialization_mpi_guard_test.cpp DEPENDS Espresso::script_interface
Boost::mpi MPI::MPI_CXX NUM_PROC 2)
Expand Down

0 comments on commit ac2332e

Please sign in to comment.