Parallel rank0 deadlock fixes (#1183)
* Fix several places where rank 0 can skip past collective MPI operations on failure

* Committing clang-format changes

Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com>
jhendersonHDF and github-actions[bot] authored Jan 22, 2022
1 parent d45124d commit 99d3962
Showing 9 changed files with 180 additions and 71 deletions.
12 changes: 12 additions & 0 deletions release_docs/RELEASE.txt
@@ -1083,6 +1083,18 @@ Bug Fixes since HDF5-1.12.0 release

(DER - 2021/11/23, HDFFV-11286)

- Fixed several potential MPI deadlocks in library failure conditions

In the parallel library, there were several places where MPI rank 0
could end up skipping past collective MPI operations when a failure
occurred in rank 0-specific processing. This would lead to deadlocks
where rank 0 finished an operation while the other ranks waited in the
collective operation. These places have been rewritten so that rank 0
pushes an error, tries to clean up after the failure, and then continues
to participate in the collective operation to the best of its ability.

(JTH - 2021/11/09)

- Fixed an issue with collective metadata reads being permanently disabled
after a dataset chunk lookup operation. This would usually cause a
mismatched MPI_Bcast and MPI_ERR_TRUNCATE issue in the library for
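The pattern this commit applies in each of the diffs below boils down to: rank 0 must record its local failure and still enter the collective call that the other ranks are already waiting in. In the library code this is done by swapping HGOTO_ERROR (push an error and jump to the function's done: label) for HDONE_ERROR (push an error and fall through). The following is a minimal standalone MPI sketch of the same idea in plain C, illustrative only and not HDF5 library code; the helper build_list_on_rank0 is hypothetical. On failure, rank 0 records the error and broadcasts a harmless sentinel instead of returning early, so the matching MPI_Bcast on the other ranks completes.

/* Minimal standalone sketch (not HDF5 library code): rank 0 records a local
 * failure but still participates in the collective MPI_Bcast so that the
 * other ranks do not deadlock waiting for it.
 */
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical rank-0-only step that may fail (e.g. building a list).
 * Returns 0 on success, -1 on failure.
 */
static int
build_list_on_rank0(int *count)
{
    *count = 4; /* pretend we produced 4 entries */
    return 0;   /* change to -1 to exercise the failure path */
}

int
main(int argc, char **argv)
{
    int rank, count = 0, local_err = 0;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (rank == 0) {
        if (build_list_on_rank0(&count) < 0) {
            /* Record the error and fall through; returning early here is
             * what would leave the other ranks stuck in MPI_Bcast below.
             */
            local_err = 1;
            count     = 0; /* broadcast a harmless sentinel instead */
        }
    }

    /* All ranks participate, even if rank 0 failed above */
    MPI_Bcast(&count, 1, MPI_INT, 0, MPI_COMM_WORLD);

    if (rank == 0 && local_err)
        fprintf(stderr, "rank 0: step failed, broadcast sentinel count\n");
    else
        printf("rank %d received count = %d\n", rank, count);

    MPI_Finalize();
    return local_err ? EXIT_FAILURE : EXIT_SUCCESS;
}

Built with an MPI C compiler (e.g. mpicc) and run on two or more ranks, every rank exits cleanly even when the rank-0-only step fails, which is the behavior the changes below restore in the library's failure paths.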
11 changes: 8 additions & 3 deletions src/H5AC.c
@@ -1636,9 +1636,14 @@ H5AC_unprotect(H5F_t *f, const H5AC_class_t *type, haddr_t addr, void *thing, un
if (H5AC__log_dirtied_entry((H5AC_info_t *)thing) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTUNPROTECT, FAIL, "can't log dirtied entry")

if (deleted && aux_ptr->mpi_rank == 0)
if (H5AC__log_deleted_entry((H5AC_info_t *)thing) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTUNPROTECT, FAIL, "H5AC__log_deleted_entry() failed")
if (deleted && aux_ptr->mpi_rank == 0) {
if (H5AC__log_deleted_entry((H5AC_info_t *)thing) < 0) {
/* If we fail to log the deleted entry, push an error but still
* participate in a possible sync point ahead
*/
HDONE_ERROR(H5E_CACHE, H5E_CANTUNPROTECT, FAIL, "H5AC__log_deleted_entry() failed")
}
}
} /* end if */
#endif /* H5_HAVE_PARALLEL */

97 changes: 61 additions & 36 deletions src/H5ACmpio.c
@@ -304,8 +304,10 @@ H5AC__broadcast_candidate_list(H5AC_t *cache_ptr, unsigned *num_entries_ptr, had
* are used to receiving from process 0, and also load it
* into a buffer for transmission.
*/
if (H5AC__copy_candidate_list_to_buffer(cache_ptr, &chk_num_entries, &haddr_buf_ptr) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't construct candidate buffer.")
if (H5AC__copy_candidate_list_to_buffer(cache_ptr, &chk_num_entries, &haddr_buf_ptr) < 0) {
/* Push an error, but still participate in following MPI_Bcast */
HDONE_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't construct candidate buffer.")
}
HDassert(chk_num_entries == num_entries);
HDassert(haddr_buf_ptr != NULL);

@@ -428,18 +430,23 @@ H5AC__broadcast_clean_list(H5AC_t *cache_ptr)

/* allocate a buffer to store the list of entry base addresses in */
buf_size = sizeof(haddr_t) * num_entries;
if (NULL == (addr_buf_ptr = (haddr_t *)H5MM_malloc(buf_size)))
HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "memory allocation failed for addr buffer")

/* Set up user data for callback */
udata.aux_ptr = aux_ptr;
udata.addr_buf_ptr = addr_buf_ptr;
udata.u = 0;

/* Free all the clean list entries, building the address list in the callback */
/* (Callback also removes the matching entries from the dirtied list) */
if (H5SL_free(aux_ptr->c_slist_ptr, H5AC__broadcast_clean_list_cb, &udata) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTFREE, FAIL, "Can't build address list for clean entries")
if (NULL == (addr_buf_ptr = (haddr_t *)H5MM_malloc(buf_size))) {
/* Push an error, but still participate in following MPI_Bcast */
HDONE_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "memory allocation failed for addr buffer")
}
else {
/* Set up user data for callback */
udata.aux_ptr = aux_ptr;
udata.addr_buf_ptr = addr_buf_ptr;
udata.u = 0;

/* Free all the clean list entries, building the address list in the callback */
/* (Callback also removes the matching entries from the dirtied list) */
if (H5SL_free(aux_ptr->c_slist_ptr, H5AC__broadcast_clean_list_cb, &udata) < 0) {
/* Push an error, but still participate in following MPI_Bcast */
HDONE_ERROR(H5E_CACHE, H5E_CANTFREE, FAIL, "Can't build address list for clean entries")
}
}

/* Now broadcast the list of cleaned entries */
if (MPI_SUCCESS !=
@@ -1448,8 +1455,10 @@ H5AC__receive_haddr_list(MPI_Comm mpi_comm, unsigned *num_entries_ptr, haddr_t *

/* allocate buffers to store the list of entry base addresses in */
buf_size = sizeof(haddr_t) * num_entries;
if (NULL == (haddr_buf_ptr = (haddr_t *)H5MM_malloc(buf_size)))
HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "memory allocation failed for haddr buffer")
if (NULL == (haddr_buf_ptr = (haddr_t *)H5MM_malloc(buf_size))) {
/* Push an error, but still participate in following MPI_Bcast */
HDONE_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "memory allocation failed for haddr buffer")
}

/* Now receive the list of candidate entries */
if (MPI_SUCCESS !=
@@ -1800,10 +1809,14 @@ H5AC__rsp__dist_md_write__flush_to_min_clean(H5F_t *f)

if (evictions_enabled) {
/* construct candidate list -- process 0 only */
if (aux_ptr->mpi_rank == 0)
if (aux_ptr->mpi_rank == 0) {
/* If constructing candidate list fails, push an error but still participate
* in collective operations during following candidate list propagation
*/
if (H5AC__construct_candidate_list(cache_ptr, aux_ptr, H5AC_SYNC_POINT_OP__FLUSH_TO_MIN_CLEAN) <
0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't construct candidate list.")
HDONE_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't construct candidate list.")
}

/* propagate and apply candidate list -- all processes */
if (H5AC__propagate_and_apply_candidate_list(f) < 0)
@@ -1899,15 +1912,21 @@ H5AC__rsp__p0_only__flush(H5F_t *f)
aux_ptr->write_permitted = FALSE;

/* Check for error on the write operation */
if (result < 0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't flush.")

/* this code exists primarily for the test bed -- it allows us to
* enforce POSIX semantics on the server that pretends to be a
* file system in our parallel tests.
*/
if (aux_ptr->write_done)
(aux_ptr->write_done)();
if (result < 0) {
/* If write operation fails, push an error but still participate
* in collective operations during following cache entry
* propagation
*/
HDONE_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't flush.")
}
else {
/* this code exists primarily for the test bed -- it allows us to
* enforce POSIX semantics on the server that pretends to be a
* file system in our parallel tests.
*/
if (aux_ptr->write_done)
(aux_ptr->write_done)();
}
} /* end if */

/* Propagate cleaned entries to other ranks. */
@@ -2019,15 +2038,21 @@ H5AC__rsp__p0_only__flush_to_min_clean(H5F_t *f)
aux_ptr->write_permitted = FALSE;

/* Check for error on the write operation */
if (result < 0)
HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "H5C_flush_to_min_clean() failed.")

/* this call exists primarily for the test code -- it is used
* to enforce POSIX semantics on the process used to simulate
* reads and writes in t_cache.c.
*/
if (aux_ptr->write_done)
(aux_ptr->write_done)();
if (result < 0) {
/* If write operation fails, push an error but still participate
* in collective operations during following cache entry
* propagation
*/
HDONE_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "H5C_flush_to_min_clean() failed.")
}
else {
/* this call exists primarily for the test code -- it is used
* to enforce POSIX semantics on the process used to simulate
* reads and writes in t_cache.c.
*/
if (aux_ptr->write_done)
(aux_ptr->write_done)();
}
} /* end if */

if (H5AC__propagate_flushed_and_still_clean_entries_list(f) < 0)
42 changes: 35 additions & 7 deletions src/H5C.c
@@ -2307,9 +2307,14 @@ H5C_protect(H5F_t *f, const H5C_class_t *type, haddr_t addr, void *udata, unsign
H5MM_memcpy(((uint8_t *)entry_ptr->image_ptr) + entry_ptr->size, H5C_IMAGE_SANITY_VALUE,
H5C_IMAGE_EXTRA_SPACE);
#endif /* H5C_DO_MEMORY_SANITY_CHECKS */
if (0 == mpi_rank)
if (H5C__generate_image(f, cache_ptr, entry_ptr) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTGET, NULL, "can't generate entry's image")
if (0 == mpi_rank) {
if (H5C__generate_image(f, cache_ptr, entry_ptr) < 0) {
/* If image generation fails, push an error but
* still participate in the following MPI_Bcast
*/
HDONE_ERROR(H5E_CACHE, H5E_CANTGET, NULL, "can't generate entry's image")
}
}
} /* end if */
HDassert(entry_ptr->image_ptr);

@@ -7182,8 +7187,20 @@ H5C__load_entry(H5F_t *f,
#ifdef H5_HAVE_PARALLEL
if (!coll_access || 0 == mpi_rank) {
#endif /* H5_HAVE_PARALLEL */
if (H5F_block_read(f, type->mem_type, addr, len, image) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_READERROR, NULL, "Can't read image*")

if (H5F_block_read(f, type->mem_type, addr, len, image) < 0) {

#ifdef H5_HAVE_PARALLEL
if (coll_access) {
/* Push an error, but still participate in following MPI_Bcast */
HDmemset(image, 0, len);
HDONE_ERROR(H5E_CACHE, H5E_READERROR, NULL, "Can't read image*")
}
else
#endif
HGOTO_ERROR(H5E_CACHE, H5E_READERROR, NULL, "Can't read image*")
}

#ifdef H5_HAVE_PARALLEL
} /* end if */
/* if the collective metadata read optimization is turned on,
@@ -7230,8 +7247,19 @@
* loaded thing, go get the on-disk image again (the extra portion).
*/
if (H5F_block_read(f, type->mem_type, addr + len, actual_len - len, image + len) <
0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTLOAD, NULL, "can't read image")
0) {

#ifdef H5_HAVE_PARALLEL
if (coll_access) {
/* Push an error, but still participate in following MPI_Bcast */
HDmemset(image + len, 0, actual_len - len);
HDONE_ERROR(H5E_CACHE, H5E_CANTLOAD, NULL, "can't read image")
}
else
#endif
HGOTO_ERROR(H5E_CACHE, H5E_CANTLOAD, NULL, "can't read image")
}

#ifdef H5_HAVE_PARALLEL
}
/* If the collective metadata read optimization is turned on,
8 changes: 2 additions & 6 deletions src/H5CX.c
@@ -1397,9 +1397,7 @@ H5CX_set_apl(hid_t *acspl_id, const H5P_libclass_t *libclass,

/* If parallel is enabled and the file driver used is the MPI-IO
* VFD, issue an MPI barrier for easier debugging if the API function
* calling this is supposed to be called collectively. Note that this
* happens only when the environment variable H5_COLL_BARRIER is set
* to non 0.
* calling this is supposed to be called collectively.
*/
if (H5_coll_api_sanity_check_g) {
MPI_Comm mpi_comm; /* File communicator */
@@ -1456,9 +1454,7 @@ H5CX_set_loc(hid_t

/* If parallel is enabled and the file driver used is the MPI-IO
* VFD, issue an MPI barrier for easier debugging if the API function
* calling this is supposed to be called collectively. Note that this
* happens only when the environment variable H5_COLL_BARRIER is set
* to non 0.
* calling this is supposed to be called collectively.
*/
if (H5_coll_api_sanity_check_g) {
MPI_Comm mpi_comm; /* File communicator */
3 changes: 3 additions & 0 deletions src/H5Cimage.c
@@ -997,6 +997,9 @@ H5C__read_cache_image(H5F_t *f, H5C_t *cache_ptr)
#endif /* H5_HAVE_PARALLEL */
/* Read the buffer (if serial access, or rank 0 of parallel access) */
/* NOTE: if this block read is being performed on rank 0 only, throwing
* an error here will cause other ranks to hang in the following MPI_Bcast.
*/
if (H5F_block_read(f, H5FD_MEM_SUPER, cache_ptr->image_addr, cache_ptr->image_len,
cache_ptr->image_buffer) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_READERROR, FAIL, "Can't read metadata cache image block")
13 changes: 10 additions & 3 deletions src/H5Dcontig.c
@@ -278,9 +278,16 @@ H5D__contig_fill(const H5D_io_info_t *io_info)
if (using_mpi) {
/* Write the chunks out from only one process */
/* !! Use the internal "independent" DXPL!! -QAK */
if (H5_PAR_META_WRITE == mpi_rank)
if (H5D__contig_write_one(&ioinfo, offset, size) < 0)
HGOTO_ERROR(H5E_DATASET, H5E_CANTINIT, FAIL, "unable to write fill value to dataset")
if (H5_PAR_META_WRITE == mpi_rank) {
if (H5D__contig_write_one(&ioinfo, offset, size) < 0) {
/* If writing fails, push an error and stop writing, but
* still participate in following MPI_Barrier.
*/
blocks_written = TRUE;
HDONE_ERROR(H5E_DATASET, H5E_CANTINIT, FAIL, "unable to write fill value to dataset")
break;
}
}

/* Indicate that blocks are being written */
blocks_written = TRUE;
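The H5D__contig_fill change above guards an MPI_Barrier rather than an MPI_Bcast: per the added comment, the writing rank stops writing on failure but must still reach the barrier the other ranks are waiting in. Below is a similar standalone sketch, again illustrative only and not HDF5 code; write_block is a hypothetical stand-in for the fill-value write.

/* Standalone sketch (not HDF5 code): the single writer rank stops writing
 * after a failure, but still reaches the MPI_Barrier that the other ranks
 * are blocked in, avoiding a deadlock.
 */
#include <mpi.h>
#include <stdio.h>

/* Hypothetical per-block write; returns 0 on success, -1 on failure */
static int
write_block(int block)
{
    return (block == 2) ? -1 : 0; /* simulate a failure on the third block */
}

int
main(int argc, char **argv)
{
    int rank, b, failed = 0;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (rank == 0) { /* only one rank performs the writes */
        for (b = 0; b < 4; b++) {
            if (write_block(b) < 0) {
                failed = 1; /* record the error ...             */
                break;      /* ... stop writing, but keep going */
            }
        }
        if (failed)
            fprintf(stderr, "rank 0: write failed, still entering barrier\n");
    }

    /* Every rank must reach this point, including rank 0 after a failure,
     * or the job deadlocks.
     */
    MPI_Barrier(MPI_COMM_WORLD);

    MPI_Finalize();
    return failed;
}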
12 changes: 10 additions & 2 deletions src/H5Dmpio.c
@@ -2360,8 +2360,16 @@ H5D__sort_chunk(H5D_io_info_t *io_info, const H5D_chunk_map_t *fm,
HGOTO_ERROR(H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi rank")

if (mpi_rank == 0) {
if (H5D__chunk_addrmap(io_info, total_chunk_addr_array) < 0)
HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address")
if (H5D__chunk_addrmap(io_info, total_chunk_addr_array) < 0) {
size_t u;

/* Clear total chunk address array */
for (u = 0; u < (size_t)fm->layout->u.chunk.nchunks; u++)
total_chunk_addr_array[u] = HADDR_UNDEF;

/* Push error, but still participate in following MPI_Bcast */
HDONE_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address")
}
} /* end if */

/* Broadcasting the MPI_IO option info. and chunk address info. */