Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix Subfiling VFD IOC assignment bug #3456

Merged
merged 1 commit into from
Sep 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions release_docs/RELEASE.txt
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,19 @@ Bug Fixes since HDF5-1.14.0 release
===================================
Library
-------
- Fixed a bug with the way the Subfiling VFD assigns I/O concentrators

During a file open operation, the Subfiling VFD determines the topology
of the application and uses that to select a subset of MPI ranks that
I/O will be forwarded to, called I/O concentrators. The code for this
had previously assumed that the parallel job launcher application (e.g.,
mpirun, srun, etc.) would distribute MPI ranks sequentially among a node
until all processors on that node have been assigned before going on to
the next node. When the launcher application mapped MPI ranks to nodes
in a different fashion, such as round-robin, this could cause the Subfiling
VFD to incorrectly map MPI ranks as I/O concentrators, leading to missing
subfiles.

- Fixed performance regression with some compound type conversions

In-place type conversion was introduced for most use cases in 1.14.2.
Expand Down
5 changes: 0 additions & 5 deletions src/H5FDsubfiling/H5FDsubfiling_priv.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,6 @@
#include "H5subfiling_common.h"
#include "H5subfiling_err.h"

/*
* Some definitions for debugging the Subfiling VFD
*/
/* #define H5FD_SUBFILING_DEBUG */

#define DRIVER_INFO_MESSAGE_MAX_INFO 65536
#define DRIVER_INFO_MESSAGE_MAX_LENGTH 65552 /* MAX_INFO + sizeof(info_header_t) */

Expand Down
66 changes: 52 additions & 14 deletions src/H5FDsubfiling/H5subfiling_common.c
Original file line number Diff line number Diff line change
Expand Up @@ -55,16 +55,16 @@ static herr_t H5_free_subfiling_topology(sf_topology_t *topology);
static herr_t init_subfiling(const char *base_filename, uint64_t file_id,
H5FD_subfiling_params_t *subfiling_config, int file_acc_flags, MPI_Comm comm,
int64_t *context_id_out);
static herr_t init_app_topology(H5FD_subfiling_params_t *subfiling_config, MPI_Comm comm, MPI_Comm node_comm,
sf_topology_t **app_topology_out);
static herr_t init_app_topology(int64_t sf_context_id, H5FD_subfiling_params_t *subfiling_config,
MPI_Comm comm, MPI_Comm node_comm, sf_topology_t **app_topology_out);
static herr_t get_ioc_selection_criteria_from_env(H5FD_subfiling_ioc_select_t *ioc_selection_type,
char **ioc_sel_info_str);
static herr_t find_cached_topology_info(MPI_Comm comm, H5FD_subfiling_params_t *subf_config,
long iocs_per_node, sf_topology_t **app_topology);
static herr_t init_app_layout(sf_topology_t *app_topology, MPI_Comm comm, MPI_Comm node_comm);
static herr_t gather_topology_info(app_layout_t *app_layout, MPI_Comm comm, MPI_Comm intra_comm);
static int compare_layout_nodelocal(const void *layout1, const void *layout2);
static herr_t identify_ioc_ranks(sf_topology_t *app_topology, int rank_stride);
static herr_t identify_ioc_ranks(int64_t sf_context_id, sf_topology_t *app_topology, int rank_stride);
static herr_t init_subfiling_context(subfiling_context_t *sf_context, const char *base_filename,
uint64_t file_id, H5FD_subfiling_params_t *subfiling_config,
sf_topology_t *app_topology, MPI_Comm file_comm);
Expand Down Expand Up @@ -886,7 +886,7 @@ init_subfiling(const char *base_filename, uint64_t file_id, H5FD_subfiling_param
* Setup the application topology information, including the computed
* number and distribution map of the set of I/O concentrators
*/
if (init_app_topology(subfiling_config, comm, node_comm, &app_topology) < 0)
if (init_app_topology(context_id, subfiling_config, comm, node_comm, &app_topology) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "couldn't initialize application topology");

new_context->sf_context_id = context_id;
Expand Down Expand Up @@ -938,8 +938,8 @@ init_subfiling(const char *base_filename, uint64_t file_id, H5FD_subfiling_param
*-------------------------------------------------------------------------
*/
static herr_t
init_app_topology(H5FD_subfiling_params_t *subfiling_config, MPI_Comm comm, MPI_Comm node_comm,
sf_topology_t **app_topology_out)
init_app_topology(int64_t sf_context_id, H5FD_subfiling_params_t *subfiling_config, MPI_Comm comm,
MPI_Comm node_comm, sf_topology_t **app_topology_out)
{
H5FD_subfiling_ioc_select_t ioc_selection_type;
sf_topology_t *app_topology = NULL;
Expand Down Expand Up @@ -1142,7 +1142,7 @@ init_app_topology(H5FD_subfiling_params_t *subfiling_config, MPI_Comm comm, MPI_
* Determine which ranks are I/O concentrator ranks, based on the
* given IOC selection strategy and MPI information.
*/
if (identify_ioc_ranks(app_topology, rank_multiple) < 0)
if (identify_ioc_ranks(sf_context_id, app_topology, rank_multiple) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL,
"couldn't determine which MPI ranks are I/O concentrators");
}
Expand Down Expand Up @@ -1600,7 +1600,7 @@ compare_layout_nodelocal(const void *layout1, const void *layout2)
*-------------------------------------------------------------------------
*/
static herr_t
identify_ioc_ranks(sf_topology_t *app_topology, int rank_stride)
identify_ioc_ranks(int64_t sf_context_id, sf_topology_t *app_topology, int rank_stride)
{
app_layout_t *app_layout = NULL;
int *io_concentrators = NULL;
Expand All @@ -1613,6 +1613,11 @@ identify_ioc_ranks(sf_topology_t *app_topology, int rank_stride)
assert(app_topology->app_layout);
assert(app_topology->app_layout->layout);
assert(app_topology->app_layout->node_count > 0);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we add an assert for the maxval of node_count? It was out-of-range in the issue.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reasonable value we could use as a max for node count?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think it can be greater than the number of ranks.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added

assert(app_topology->app_layout->node_count <= app_topology->app_layout->world_size);

#ifndef H5_SUBFILING_DEBUG
(void)sf_context_id;
#endif

app_layout = app_topology->app_layout;

Expand All @@ -1629,36 +1634,52 @@ identify_ioc_ranks(sf_topology_t *app_topology, int rank_stride)
case SELECT_IOC_ONE_PER_NODE: {
int total_ioc_count = 0;
int iocs_per_node = 1;
int last_lead_rank;

if (app_topology->n_io_concentrators > app_layout->node_count)
iocs_per_node = app_topology->n_io_concentrators / app_layout->node_count;

assert(app_layout->node_ranks);
/*
* NOTE: The below code assumes that the app_layout->layout
* array was sorted according to the node_lead_rank field,
* such that entries for MPI ranks on the same node will occur
* together in the array.
*/

for (size_t i = 0; i < (size_t)app_layout->node_count; i++) {
int node_index = app_layout->node_ranks[i];
int local_size = app_layout->layout[node_index].node_local_size;
last_lead_rank = app_layout->layout[0].node_lead_rank;
for (size_t i = 0, layout_idx = 0; i < (size_t)app_layout->node_count; i++) {
int local_size = app_layout->layout[layout_idx].node_local_size;

/* Assign first I/O concentrator from this node */
assert(total_ioc_count < app_topology->n_io_concentrators);
io_concentrators[total_ioc_count] = app_layout->layout[node_index++].rank;
io_concentrators[total_ioc_count] = app_layout->layout[layout_idx++].rank;

if (app_layout->world_rank == io_concentrators[total_ioc_count]) {
assert(!app_topology->rank_is_ioc);

app_topology->ioc_idx = total_ioc_count;
app_topology->rank_is_ioc = TRUE;
}

total_ioc_count++;

/* Assign any additional I/O concentrators from this node */
for (size_t j = 1; j < (size_t)iocs_per_node; j++) {
if (total_ioc_count >= max_iocs)
break;
if (j >= (size_t)local_size)
break;

/* Sanity check to make sure this rank is on the same node */
assert(app_layout->layout[layout_idx].node_lead_rank ==
app_layout->layout[layout_idx - 1].node_lead_rank);

assert(total_ioc_count < app_topology->n_io_concentrators);
io_concentrators[total_ioc_count] = app_layout->layout[node_index++].rank;
io_concentrators[total_ioc_count] = app_layout->layout[layout_idx++].rank;

if (app_layout->world_rank == io_concentrators[total_ioc_count]) {
assert(!app_topology->rank_is_ioc);

app_topology->ioc_idx = total_ioc_count;
app_topology->rank_is_ioc = TRUE;
}
Expand All @@ -1668,8 +1689,25 @@ identify_ioc_ranks(sf_topology_t *app_topology, int rank_stride)

if (total_ioc_count >= max_iocs)
break;

/* Find the block of layout structures for the next node */
while ((layout_idx < (size_t)app_layout->world_size) &&
(last_lead_rank == app_layout->layout[layout_idx].node_lead_rank))
layout_idx++;

if (layout_idx >= (size_t)app_layout->world_size)
break;

last_lead_rank = app_layout->layout[layout_idx].node_lead_rank;
}

#ifdef H5_SUBFILING_DEBUG
if (app_topology->n_io_concentrators != total_ioc_count)
H5_subfiling_log(sf_context_id,
"%s: **WARN** Number of I/O concentrators adjusted from %d to %d", __func__,
app_topology->n_io_concentrators, total_ioc_count);
#endif

/* Set final number of I/O concentrators after adjustments */
app_topology->n_io_concentrators = total_ioc_count;

Expand Down
19 changes: 19 additions & 0 deletions src/H5FDsubfiling/H5subfiling_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,25 @@ typedef enum io_ops {
* will be broadcast to all MPI ranks and will
* provide a basis for determining which MPI ranks
* will host an I/O concentrator.
*
* - rank
* The MPI rank value for this processor
*
* - node_local_rank
* The MPI rank value for this processor in an MPI
* communicator that only involves MPI ranks on the
* same node as this processor
*
* - node_local_size
* The number of MPI ranks on the same node as this
* processor, including this processor itself
*
* - node_lead_rank
* The lowest MPI rank value for processors on the
* same node as this processor (possibly the MPI
* rank value for this processor); Denotes a "lead"
* MPI rank for certain operations.
*
*/
typedef struct {
int rank;
Expand Down