Skip to content

Commit

Permalink
Fix MPI on Ubuntu 18.04 with CUDA
Browse files Browse the repository at this point in the history
OpenMPI 2.1.1 has a broken vader BTL, requiring us to disable its single-copy mode
  • Loading branch information
mkuron committed Sep 18, 2018
1 parent c57103d commit cf1fe2a
Showing 1 changed file with 86 additions and 0 deletions.
86 changes: 86 additions & 0 deletions src/core/communication.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,88 @@ int mpi_check_runtime_errors(void);
* procedures
**********************************************/

#if defined(OPEN_MPI) && defined(MPI_T_ERR_NOT_INITIALIZED)
/*! Workaround for "Read -1, expected XXXXXXX, errno = 14" that sometimes
appears when CUDA is used. This is a bug in OpenMPI 2.0-2.1.2 and 3.0.0
according to
https://www.mail-archive.com/users@lists.open-mpi.org/msg32357.html,
so we set btl_vader_single_copy_mechanism = none.
*/
static void openmpi_fix_vader() {
if (OMPI_MAJOR_VERSION < 2 || OMPI_MAJOR_VERSION > 3)
return;
if (OMPI_MAJOR_VERSION == 2 && OMPI_MINOR_VERSION == 1 &&
OMPI_RELEASE_VERSION >= 3)
return;
if (OMPI_MAJOR_VERSION == 3 &&
(OMPI_MINOR_VERSION > 0 || OMPI_RELEASE_VERSION > 0))
return;

const char *varname = "btl_vader_single_copy_mechanism";
const char *varval = "none";
int varvali = -1;

// initialize the MPI_T interface
int provided;
if (MPI_T_init_thread(MPI_THREAD_SINGLE, &provided) != MPI_SUCCESS)
return; // interface not available, so we can't do anything

// get the variable ID
int cvar;
if (MPI_T_cvar_get_index(varname, &cvar) != MPI_SUCCESS)
return; // only one rank is used or vader is disabled

// get a handle to the variable
MPI_T_cvar_handle cvar_handle;
int count;
if (MPI_T_cvar_handle_alloc(cvar, NULL, &cvar_handle, &count) !=
MPI_SUCCESS || count != 1) {
std::cerr << "Failed to allocate handle to " << varname << std::endl;
return;
}

// get the value index inside the enum
MPI_T_enum enumtype;
MPI_T_cvar_get_info(cvar, NULL, NULL, NULL, NULL, &enumtype, NULL, NULL, NULL,
NULL);
for (int i = 0; true; ++i) {
char name[10];
int name_len = sizeof(name);
int newval = -1;
if (MPI_T_enum_get_item(enumtype, i, &newval, name, &name_len) !=
MPI_SUCCESS)
break;
if (std::string(name) == varval)
varvali = newval;
}

// check whether we found a match inside the enum
if (varvali < 0) {
std::cerr << "Failed to find " << varval << " for " << varname << std::endl;
return;
}

// set the variable
if (MPI_T_cvar_write(cvar_handle, &varvali) != MPI_SUCCESS) {
std::cerr << "Failed to set " << varname << " to " << varval << " ("
<< varvali << ")" << std::endl;
}

// check whether we successfully set the variable
int varvali2 = -1;
if (MPI_T_cvar_read(cvar_handle, &varvali2) != MPI_SUCCESS ||
varvali != varvali2) {
std::cerr << "Value of " << varname << " does not match intended " << varval
<< std::endl;
return;
}

// clean up
MPI_T_cvar_handle_free(&cvar_handle);
MPI_T_finalize();
}
#endif

void mpi_init() {
#ifdef OPEN_MPI
void *handle = 0;
Expand Down Expand Up @@ -264,6 +346,10 @@ void mpi_init() {
Utils::make_unique<boost::mpi::environment>(argc, argv);
#endif

#if defined(OPEN_MPI) && defined(MPI_T_ERR_NOT_INITIALIZED)
openmpi_fix_vader();
#endif

MPI_Comm_size(MPI_COMM_WORLD, &n_nodes);
MPI_Dims_create(n_nodes, 3, node_grid);

Expand Down

0 comments on commit cf1fe2a

Please sign in to comment.