diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml index 163353e4834..a7d34dc5240 100644 --- a/.github/workflows/codespell.yml +++ b/.github/workflows/codespell.yml @@ -12,4 +12,4 @@ jobs: - uses: codespell-project/actions-codespell@master with: skip: ./bin/trace,./hl/tools/h5watch/h5watch.c,./tools/test/h5jam/tellub.c - ignore_words_list: isnt,inout,nd,parms,parm,ba,offsetP,ser,ois,had,fiter,fo,clude,refere,minnum,offsetp,creat,ans:,eiter,lastr,ans,isn't,ifset,sur,trun,dne,tthe,hda,filname,te,htmp,minnum + ignore_words_list: isnt,inout,nd,parms,parm,ba,offsetP,ser,ois,had,fiter,fo,clude,refere,minnum,offsetp,creat,ans:,eiter,lastr,ans,isn't,ifset,sur,trun,dne,tthe,hda,filname,te,htmp,minnum,ake,gord,numer diff --git a/CMakeLists.txt b/CMakeLists.txt index fedce447f38..f76792fe003 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -738,6 +738,16 @@ if (H5_HAVE_PARALLEL) endif () endif () +# see other find_package examples in /usr/share/cmake/Modules +option (HDF5_BUILD_SUBFILING "Build Parallel HDF5 Subfiling" OFF) +if (HDF5_BUILD_SUBFILING AND HDF5_ENABLE_PARALLEL) + message (STATUS "Setting up to use Mercury components") + include_directories(${HDF5_SRC_DIR}/mercury/include) + set (WITH_SUBFILING 1) + set (H5_HAVE_MERCURY_H 1) + set (CMAKE_REQUIRED_INCLUDES "${HDF5_SRC_DIR}/mercury/include") +endif () + #option (DEFAULT_API_VERSION "Enable v1.14 API (v16, v18, v110, v112, v114)" "v114") set (DEFAULT_API_VERSION "v114" CACHE STRING "Enable v1.14 API (v16, v18, v110, v112, v114)") set_property (CACHE DEFAULT_API_VERSION PROPERTY STRINGS v16 v18 v110 v112 v114) diff --git a/MANIFEST b/MANIFEST index d45056280d7..a52357af2cc 100644 --- a/MANIFEST +++ b/MANIFEST @@ -810,6 +810,9 @@ ./src/H5FDhdfs.c ./src/H5FDhdfs.h ./src/H5FDint.c +./src/H5FDioc.c +./src/H5FDioc.h +./src/H5FDioc_threads.c ./src/H5FDlog.c ./src/H5FDlog.h ./src/H5FDmirror.c @@ -836,6 +839,11 @@ ./src/H5FDsplitter.c ./src/H5FDsplitter.h ./src/H5FDstdio.c ./src/H5FDstdio.h +./src/H5FDsubfile_int.c +./src/H5FDsubfile_mpi.c +./src/H5FDsubfiling.c +./src/H5FDsubfiling.h +./src/H5FDsubfiling_priv.h ./src/H5FDtest.c ./src/H5FDwindows.c @@ -1540,6 +1548,7 @@ ./testpar/t_pshutdown.c ./testpar/t_prestart.c ./testpar/t_span_tree.c +./testpar/t_subfiling_vfd.c ./testpar/t_vfd.c ./testpar/t_init_term.c ./testpar/t_2Gio.c @@ -3008,6 +3017,9 @@ ./utils/mirror_vfd/mirror_server_stop.c ./utils/mirror_vfd/mirror_writer.c +# Subfiling VFD utilities +./utils/subfiling_vfd/h5fuse.sh + # test utilities ./utils/test/Makefile.am ./utils/test/swmr_check_compat_vfd.c diff --git a/bin/trace b/bin/trace index 169719cf272..a74c9101164 100755 --- a/bin/trace +++ b/bin/trace @@ -88,6 +88,7 @@ $Source = ""; "H5_index_t" => "Ii", "H5I_iterate_func_t" => "II", "H5_iter_order_t" => "Io", + "ioc_selection_t" => "IO", "H5I_future_realize_func_t" => "IR", "int" => "Is", "int32_t" => "Is", @@ -188,6 +189,7 @@ $Source = ""; "H5Z_filter_t" => "Zf", "H5Z_filter_func_t" => "ZF", "ssize_t" => "Zs", + # Types below must be defined here, as they appear in function arguments, # but they are not yet supported in the H5_trace_args() routine yet. If # they are used as an actual parameter type (and not just as a pointer to diff --git a/c++/src/H5FaccProp.cpp b/c++/src/H5FaccProp.cpp index a79ada3d559..1edf4ad5a25 100644 --- a/c++/src/H5FaccProp.cpp +++ b/c++/src/H5FaccProp.cpp @@ -156,7 +156,7 @@ FileAccPropList::getDriver() const // Function: FileAccPropList::setDriver ///\brief Set file driver for this property list.
///\param new_driver_id - IN: File driver -///\param new_driver_info - IN: Struct containing the driver-specific properites +///\param new_driver_info - IN: Struct containing the driver-specific properties ///\exception H5::PropListIException ///\par Description /// For information, please refer to the H5Pset_driver API in diff --git a/config/sanitizer/LICENSE b/config/sanitizer/LICENSE index 895657b9a96..5a6dc669989 100644 --- a/config/sanitizer/LICENSE +++ b/config/sanitizer/LICENSE @@ -124,7 +124,7 @@ may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise compiles with + reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, diff --git a/configure.ac b/configure.ac index 4cf329c2f13..9552d3ce1be 100644 --- a/configure.ac +++ b/configure.ac @@ -1584,6 +1584,68 @@ case "X-$withval" in esac +## ---------------------------------------------------------------------- +## Is mercury (an external RPC framework library) requested? It has a +## header file `mercury.h' and a library `-lmercury_util' and their +## install locations might be specified with the `--with-mercury' +## command-line switch. The value is an include path and/or a library path. +## If the library path is specified then it must be preceded by a comma. +## +AC_SUBST([HAVE_MERCURY]) + +## Default is not present +HAVE_MERCURY=no + +AC_ARG_WITH([mercury], + [AS_HELP_STRING([--with-mercury=DIR], + [Use the mercury library [default=no]])],, + [withval=no]) + +case "X-$withval" in + X-yes) + HAVE_MERCURY="yes" + if test -z "$HAVE_MERCURY" -a -n "$HDF5_CONFIG_ABORT"; then + AC_MSG_ERROR([couldn't find mercury library]) + fi + echo "HAVE_MERCURY flag = $HAVE_MERCURY" + ;; + X-|X-no|X-none) + HAVE_MERCURY="no" + AC_MSG_CHECKING([for mercury library]) + AC_MSG_RESULT([suppressed]) + ;; + *) + HAVE_MERCURY="yes" + case "$withval" in + *,*) + mercury_inc="`echo $withval |cut -f1 -d,`" + mercury_lib="`echo $withval |cut -f2 -d, -s`" + ;; + *) + if test -n "$withval"; then + mercury_inc="$withval/include" + mercury_lib="$withval/lib" + fi + ;; + esac + + echo "checking include path: $mercury_inc" + saved_CPPFLAGS="$CPPFLAGS" + saved_AM_CPPFLAGS="$AM_CPPFLAGS" + saved_LDFLAGS="$LDFLAGS" + saved_AM_LDFLAGS="$AM_LDFLAGS" + + if test -n "$mercury_inc"; then + CPPFLAGS="$CPPFLAGS -I$mercury_inc" + AM_CPPFLAGS="$AM_CPPFLAGS -I$mercury_inc" + fi + + AC_CHECK_HEADERS([mercury.h],,[CPPFLAGS="$saved_CPPFLAGS"; AM_CPPFLAGS="$saved_AM_CPPFLAGS"] [unset HAVE_MERCURY]) + ;; +esac + +AM_CONDITIONAL([HAVE_MERCURY_CONDITIONAL], [test "X$HAVE_MERCURY" = "Xyes"]) + ## ---------------------------------------------------------------------- ## Make the external filters list available to *.in files ## At this point it's unset (no external filters by default) but it @@ -3043,6 +3105,7 @@ if test -n "$PARALLEL"; then fi ## ---------------------------------------------------------------------- + ## Build parallel tools if parallel tools, parallel, and build tools options ## are all enabled.
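## ## As an illustration only (hypothetical paths, not part of this patch), the mercury and subfiling options added in this patch might be combined in a parallel build as: ## CC=mpicc ./configure --enable-parallel --enable-subfiling-vfd \ ## --with-mercury=/opt/mercury/include,/opt/mercury/lib ## The comma form of --with-mercury gives the include path and then the library ## path, per the help text above; --with-mercury=DIR alone implies DIR/include ## and DIR/lib.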
## @@ -3200,6 +3263,34 @@ else AC_MSG_RESULT([no]) fi +## ---------------------------------------------------------------------- +## Check if Subfiling I/O driver is enabled by --enable-subfiling-vfd +## +AC_SUBST([SUBFILING_VFD]) + +## Default is no subfiling VFD +SUBFILING_VFD=no + +AC_MSG_CHECKING([if the subfiling I/O virtual file driver (VFD) is enabled]) + +AC_ARG_ENABLE([subfiling-vfd], + [AS_HELP_STRING([--enable-subfiling-vfd], + [Build the subfiling I/O virtual file driver (VFD). + [default=no]])], + [SUBFILING_VFD=$enableval], [SUBFILING_VFD=no]) + +if test "X$SUBFILING_VFD" = "Xyes"; then + AC_MSG_RESULT([yes]) + AC_DEFINE([HAVE_SUBFILING], [1], + [Define if the subfiling I/O virtual file driver (VFD) should be compiled]) +else + AC_MSG_RESULT([no]) +fi + +## Subfiling VFD files are not built if not required. +AM_CONDITIONAL([SUBFILING_VFD_CONDITIONAL], [test "X$SUBFILING_VFD" = "Xyes"]) + + ## ---------------------------------------------------------------------- ## Check if Direct I/O driver is enabled by --enable-direct-vfd ## diff --git a/fortran/test/tH5P.F90 b/fortran/test/tH5P.F90 index 75e4e72f5ed..d664dd73845 100644 --- a/fortran/test/tH5P.F90 +++ b/fortran/test/tH5P.F90 @@ -541,7 +541,7 @@ SUBROUTINE test_chunk_cache(cleanup, total_error) CALL H5Dclose_f(dsid, error) CALL H5Oopen_f(fid, "dset", dsid, error, dapl1) - ! Retrieve dapl from dataset, verfiy cache values are the same as on dapl1 + ! Retrieve dapl from dataset, verify cache values are the same as on dapl1 ! ! Note we rely on the knowledge that H5Pget_chunk_cache retrieves these ! values directly from the dataset structure, and not from a copy of the @@ -563,7 +563,7 @@ SUBROUTINE test_chunk_cache(cleanup, total_error) CALL H5Oopen_f(fid, "dset", dsid, error) CALL check("H5Oopen_f", error, total_error) - ! Retrieve dapl from dataset, verfiy cache values are the same as on fapl_local + ! Retrieve dapl from dataset, verify cache values are the same as on fapl_local CALL H5Dget_access_plist_f(dsid, dapl2, error) CALL check("H5Dget_access_plist_f", error, total_error) diff --git a/release_docs/HISTORY-1_0-1_8_0_rc3.txt b/release_docs/HISTORY-1_0-1_8_0_rc3.txt index f54ba6431b0..3669f4d9561 100644 --- a/release_docs/HISTORY-1_0-1_8_0_rc3.txt +++ b/release_docs/HISTORY-1_0-1_8_0_rc3.txt @@ -1245,7 +1245,7 @@ Known Problems causes failures in several HDF5 library tests. * For HPUX 11.23 many tools tests failed for 64-bit version when linked to the shared libraries (tested for 1.8.0-beta2) -* For SNL, Red Storm: only paralle HDF5 is supported. The serial tests pass +* For SNL, Red Storm: only parallel HDF5 is supported. The serial tests pass and the parallel tests also pass with lots of non-fatal error messages. * For LLNL, uP: both serial and parallel pass. Zeus: serial passes but parallel fails with a known proglem in MPI. ubgl: serial passes but diff --git a/release_docs/HISTORY-1_8_0-1_10_0.txt b/release_docs/HISTORY-1_8_0-1_10_0.txt index 575d070b718..7b84fbcf428 100644 --- a/release_docs/HISTORY-1_8_0-1_10_0.txt +++ b/release_docs/HISTORY-1_8_0-1_10_0.txt @@ -1581,7 +1581,7 @@ Known Problems causes failures in several HDF5 library tests. * For HPUX 11.23 many tools tests failed for 64-bit version when linked to the shared libraries (tested for 1.8.0-beta2) -* For SNL, Red Storm: only paralle HDF5 is supported. The serial tests pass +* For SNL, Red Storm: only parallel HDF5 is supported. The serial tests pass and the parallel tests also pass with lots of non-fatal error messages.
* on SUN 5.10 C++ test fails in the "Testing Shared Datatypes with Attributes" test * configuring with --enable-debug=all produces compiler errors on most diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index da99dc62a7e..296ca65cb18 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -227,6 +227,39 @@ set (H5FA_HDRS ) IDE_GENERATED_PROPERTIES ("H5FA" "${H5FA_HDRS}" "${H5FA_SOURCES}" ) +set (MERCURY_UTIL_SOURCES + ${HDF5_SRC_DIR}/mercury/src/util/mercury_atomic_queue.c + ${HDF5_SRC_DIR}/mercury/src/util/mercury_dlog.c + ${HDF5_SRC_DIR}/mercury/src/util/mercury_event.c + ${HDF5_SRC_DIR}/mercury/src/util/mercury_hash_table.c + ${HDF5_SRC_DIR}/mercury/src/util/mercury_log.c + ${HDF5_SRC_DIR}/mercury/src/util/mercury_mem.c + ${HDF5_SRC_DIR}/mercury/src/util/mercury_mem_pool.c + ${HDF5_SRC_DIR}/mercury/src/util/mercury_poll.c + ${HDF5_SRC_DIR}/mercury/src/util/mercury_request.c + ${HDF5_SRC_DIR}/mercury/src/util/mercury_thread.c + ${HDF5_SRC_DIR}/mercury/src/util/mercury_thread_condition.c + ${HDF5_SRC_DIR}/mercury/src/util/mercury_thread_pool.c + ${HDF5_SRC_DIR}/mercury/src/util/mercury_thread_mutex.c + ${HDF5_SRC_DIR}/mercury/src/util/mercury_thread_rwlock.c + ${HDF5_SRC_DIR}/mercury/src/util/mercury_thread_spin.c + ${HDF5_SRC_DIR}/mercury/src/util/mercury_util.c +) + +set (H5FD_SUBFILING_SOURCES + ${HDF5_SRC_DIR}/H5FDioc.c + ${HDF5_SRC_DIR}/H5FDioc_threads.c + ${HDF5_SRC_DIR}/H5FDsubfiling.c + ${HDF5_SRC_DIR}/H5FDsubfile_int.c + ${HDF5_SRC_DIR}/H5FDsubfile_mpi.c + ${MERCURY_UTIL_SOURCES} +) + +set (H5FD_SUBFILING_HDRS + ${HDF5_SRC_DIR}/H5FDioc.h + ${HDF5_SRC_DIR}/H5FDsubfiling.h + ${HDF5_SRC_DIR}/H5FDsubfile_int.h +) set (H5FD_SOURCES ${HDF5_SRC_DIR}/H5FD.c @@ -764,6 +797,14 @@ set (H5_MODULE_HEADERS ${HDF5_SRC_DIR}/H5Zmodule.h ) +set (subfile_SOURCES ) +set (subfile_HDRS ) +if (WITH_SUBFILING) + message (STATUS "Appending to common_SRCS ${H5FD_SUBFILING_SOURCES}") + list(APPEND subfile_SOURCES ${H5FD_SUBFILING_SOURCES}) + list(APPEND subfile_HDRS ${H5FD_SUBFILING_HDRS}) +endif () + set (common_SRCS ${H5_SOURCES} ${H5A_SOURCES} @@ -810,8 +851,11 @@ set (common_SRCS ${H5VM_SOURCES} ${H5WB_SOURCES} ${H5Z_SOURCES} + ${subfile_SOURCES} ) + + set (H5_PUBLIC_HEADERS ${H5_HDRS} ${H5A_HDRS} @@ -848,6 +892,7 @@ set (H5_PUBLIC_HEADERS ${H5TS_HDRS} ${H5VL_HDRS} ${H5Z_HDRS} + ${subfile_HDRS} ) set (H5_PRIVATE_HEADERS diff --git a/src/H5B2internal.c b/src/H5B2internal.c index c00f555dfd4..a8192df7768 100644 --- a/src/H5B2internal.c +++ b/src/H5B2internal.c @@ -17,7 +17,7 @@ * Dec 01 2016 * Quincey Koziol * - * Purpose: Routines for managing v2 B-tree internal ndoes. + * Purpose: Routines for managing v2 B-tree internal nodes. * *------------------------------------------------------------------------- */ diff --git a/src/H5B2leaf.c b/src/H5B2leaf.c index 20ace84051b..f48cf5b522c 100644 --- a/src/H5B2leaf.c +++ b/src/H5B2leaf.c @@ -17,7 +17,7 @@ * Dec 01 2016 * Quincey Koziol * - * Purpose: Routines for managing v2 B-tree leaf ndoes. + * Purpose: Routines for managing v2 B-tree leaf nodes.
* *------------------------------------------------------------------------- */ diff --git a/src/H5ES.c b/src/H5ES.c index 9abaa545bda..ad42000de04 100644 --- a/src/H5ES.c +++ b/src/H5ES.c @@ -269,7 +269,7 @@ H5ESget_requests(hid_t es_id, H5_iter_order_t order, hid_t *connector_ids, void herr_t ret_value = SUCCEED; /* Return value */ FUNC_ENTER_API(FAIL) - H5TRACE5("e", "iIo*i**xx", es_id, order, connector_ids, requests, count); + H5TRACE6("e", "iIo*i**xzx", es_id, order, connector_ids, requests, array_len, count); /* Check arguments */ if (NULL == (es = H5I_object_verify(es_id, H5I_EVENTSET))) diff --git a/src/H5FDioc.c b/src/H5FDioc.c new file mode 100644 index 00000000000..13553f4efa6 --- /dev/null +++ b/src/H5FDioc.c @@ -0,0 +1,1483 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Copyright by The HDF Group. * + * All rights reserved. * + * * + * This file is part of HDF5. The full HDF5 copyright notice, including * + * terms governing use, modification, and redistribution, is contained in * + * the COPYING file, which can be found at the root of the source code * + * distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases. * + * If you do not have access to either file, you may request a copy from * + * help@hdfgroup.org. * + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +/* + * Purpose: The IOC VFD implements a file driver which relays all the + * VFD calls to an underlying VFD, and sends all the write calls to + * another underlying VFD. Maintains two files simultaneously. + */ + +/* This source code file is part of the H5FD driver module */ +#include "H5FDdrvr_module.h" + +#include "H5FDpublic.h" /* Basic H5FD definitions */ +#include "H5Eprivate.h" /* Error handling */ +#include "H5FDprivate.h" /* File drivers */ +#include "H5FDioc.h" /* IOC file driver */ +#include "H5FLprivate.h" /* Free Lists */ +#include "H5Fprivate.h" /* File access */ +#include "H5Iprivate.h" /* IDs */ +#include "H5MMprivate.h" /* Memory management */ +#include "H5Pprivate.h" /* Property lists */ +#include "H5private.h" /* Generic Functions */ + +#if 1 /* JRM */ /* For now, H5FDsubfiling_priv.h needs mercury. Since the code that needs it will \ * move to its own header, just hack it for now. \ */ +#include "mercury_thread.h" +#include "mercury_thread_mutex.h" +#include "mercury_thread_pool.h" +#endif /* JRM */ + +#include "H5FDsubfiling_priv.h" + +/* The driver identification number, initialized at runtime */ +static hid_t H5FD_IOC_g = 0; +#if 0 /* JRM */ /* delete if all goes well */ +extern volatile int sf_shutdown_flag; +#endif /* JRM */ + +/* + * These macros check for overflow of various quantities. These macros + * assume that HDoff_t is signed and haddr_t and size_t are unsigned. + * + * ADDR_OVERFLOW: Checks whether a file address of type `haddr_t' + * is too large to be represented by the second argument + * of the file seek function. + * + * SIZE_OVERFLOW: Checks whether a buffer size of type `hsize_t' is too + * large to be represented by the `size_t' type. + * + * REGION_OVERFLOW: Checks whether an address and size pair describe data + * which can be addressed entirely by the second + * argument of the file seek function.
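+ * + * For example (an illustrative note, assuming a 64-bit HDoff_t): MAXADDR + * below evaluates to 2^63 - 1, so ADDR_OVERFLOW(A) fires for any address + * at or above 2^63 (one that cannot be represented as a non-negative + * HDoff_t), and REGION_OVERFLOW(A, Z) additionally fires when (A + Z) + * wraps around and would appear to the seek function as a negative offset.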
+ */ +#define MAXADDR (((haddr_t)1 << (8 * sizeof(HDoff_t) - 1)) - 1) +#define ADDR_OVERFLOW(A) (HADDR_UNDEF == (A) || ((A) & ~(haddr_t)MAXADDR)) +#define SIZE_OVERFLOW(Z) ((Z) & ~(hsize_t)MAXADDR) +#define REGION_OVERFLOW(A, Z) \ + (ADDR_OVERFLOW(A) || SIZE_OVERFLOW(Z) || HADDR_UNDEF == (A) + (Z) || (HDoff_t)((A) + (Z)) < (HDoff_t)(A)) + +#define H5FD_IOC_DEBUG_OP_CALLS 0 /* debugging print toggle; 0 disables */ + +#if H5FD_IOC_DEBUG_OP_CALLS +#define H5FD_IOC_LOG_CALL(name) \ + do { \ + HDprintf("called %s()\n", (name)); \ + HDfflush(stdout); \ + } while (0) +#else +#define H5FD_IOC_LOG_CALL(name) /* no-op */ +#endif /* H5FD_IOC_DEBUG_OP_CALLS */ + +/* Public functions that are referenced here but defined elsewhere */ +extern herr_t H5FD__write_vector_internal(hid_t h5_fid, hssize_t count, haddr_t addrs[], size_t sizes[], + const void *bufs[] /* data_in */); +extern herr_t H5FD__read_vector_internal(hid_t h5_fid, hssize_t count, haddr_t addrs[], size_t sizes[], + void *bufs[] /* data_out */); +extern int H5FD__close_subfiles(int64_t context_id); +extern int H5FD__open_subfiles(void *_config_info, uint64_t h5_file_id, int flags); +extern hid_t fid_map_to_context(hid_t sf_fid); +extern subfiling_context_t *get__subfiling_object(int64_t context_id); + +/* Private functions */ +/* Prototypes */ +static herr_t H5FD__ioc_term(void); +static hsize_t H5FD__ioc_sb_size(H5FD_t *_file); +static herr_t H5FD__ioc_sb_encode(H5FD_t *_file, char *name /*out*/, unsigned char *buf /*out*/); +static herr_t H5FD__ioc_sb_decode(H5FD_t *_file, const char *name, const unsigned char *buf); +static void * H5FD__ioc_fapl_get(H5FD_t *_file); +static void * H5FD__ioc_fapl_copy(const void *_old_fa); +static herr_t H5FD__ioc_fapl_free(void *_fapl); +static H5FD_t *H5FD__ioc_open(const char *name, unsigned flags, hid_t fapl_id, haddr_t maxaddr); +static herr_t H5FD__ioc_close(H5FD_t *_file); +static int H5FD__ioc_cmp(const H5FD_t *_f1, const H5FD_t *_f2); +static herr_t H5FD__ioc_query(const H5FD_t *_file, unsigned long *flags /* out */); +static herr_t H5FD__ioc_get_type_map(const H5FD_t *_file, H5FD_mem_t *type_map); +static haddr_t H5FD__ioc_alloc(H5FD_t *file, H5FD_mem_t type, hid_t dxpl_id, hsize_t size); +static herr_t H5FD__ioc_free(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, hsize_t size); +static haddr_t H5FD__ioc_get_eoa(const H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type); +static herr_t H5FD__ioc_set_eoa(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, haddr_t addr); +static haddr_t H5FD__ioc_get_eof(const H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type); +static herr_t H5FD__ioc_get_handle(H5FD_t *_file, hid_t H5_ATTR_UNUSED fapl, void **file_handle); +static herr_t H5FD__ioc_read(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, size_t size, + void *buf); +static herr_t H5FD__ioc_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, size_t size, + const void *buf); +static herr_t H5FD__ioc_read_vector(H5FD_t *file, hid_t dxpl_id, uint32_t count, H5FD_mem_t types[], + haddr_t addrs[], size_t sizes[], void *bufs[] /* out */); +static herr_t H5FD__ioc_write_vector(H5FD_t *file, hid_t dxpl_id, uint32_t count, H5FD_mem_t types[], + haddr_t addrs[], size_t sizes[], const void *bufs[] /* in */); +static herr_t H5FD__ioc_flush(H5FD_t *_file, hid_t dxpl_id, hbool_t closing); +static herr_t H5FD__ioc_truncate(H5FD_t *_file, hid_t dxpl_id, hbool_t closing); +static herr_t H5FD__ioc_lock(H5FD_t *_file, hbool_t rw); +static herr_t H5FD__ioc_unlock(H5FD_t *_file); +/* +static
herr_t H5FD__ioc_ctl(H5FD_t *file, uint64_t op_code, uint64_t flags, + const void *input, void **result); +*/ + +static const H5FD_class_t H5FD_ioc_g = { + H5FD_IOC_VALUE, /* value */ + "ioc", /* name */ + MAXADDR, /* maxaddr */ + H5F_CLOSE_WEAK, /* fc_degree */ + H5FD__ioc_term, /* terminate */ + H5FD__ioc_sb_size, /* sb_size */ + H5FD__ioc_sb_encode, /* sb_encode */ + H5FD__ioc_sb_decode, /* sb_decode */ + sizeof(H5FD_ioc_config_t), /* fapl_size */ + H5FD__ioc_fapl_get, /* fapl_get */ + H5FD__ioc_fapl_copy, /* fapl_copy */ + H5FD__ioc_fapl_free, /* fapl_free */ + 0, /* dxpl_size */ + NULL, /* dxpl_copy */ + NULL, /* dxpl_free */ + H5FD__ioc_open, /* open */ + H5FD__ioc_close, /* close */ + H5FD__ioc_cmp, /* cmp */ + H5FD__ioc_query, /* query */ + H5FD__ioc_get_type_map, /* get_type_map */ + H5FD__ioc_alloc, /* alloc */ + H5FD__ioc_free, /* free */ + H5FD__ioc_get_eoa, /* get_eoa */ + H5FD__ioc_set_eoa, /* set_eoa */ + H5FD__ioc_get_eof, /* get_eof */ + H5FD__ioc_get_handle, /* get_handle */ + H5FD__ioc_read, /* read */ + H5FD__ioc_write, /* write */ + H5FD__ioc_read_vector, /* read_vector */ + H5FD__ioc_write_vector, /* write_vector */ + NULL, /* read_selection */ + NULL, /* write_selection */ + H5FD__ioc_flush, /* flush */ + H5FD__ioc_truncate, /* truncate */ + H5FD__ioc_lock, /* lock */ + H5FD__ioc_unlock, /* unlock */ + NULL, /* del */ + NULL, /* ctl */ + H5FD_FLMAP_DICHOTOMY /* fl_map */ +}; + +/* Declare a free list to manage the H5FD_ioc_t struct */ +H5FL_DEFINE_STATIC(H5FD_ioc_t); + +/* Declare a free list to manage the H5FD_ioc_config_t struct */ +H5FL_DEFINE_STATIC(H5FD_ioc_config_t); + +/*------------------------------------------------------------------------- + * Function: H5FD__init_package + * + * Purpose: Initializes any interface-specific data or routines. + * + * Return: SUCCEED/FAIL + *------------------------------------------------------------------------- + */ +static herr_t +H5FD__init_package(void) +{ + herr_t ret_value = SUCCEED; + FUNC_ENTER_NOAPI(FAIL) + + H5FD_IOC_LOG_CALL(FUNC); + +#if 1 /* JRM */ + if (H5I_VFL != H5I_get_type(H5FD_IOC_g)) + H5FD_IOC_g = H5FD_register(&H5FD_ioc_g, sizeof(H5FD_class_t), FALSE); +#else /* JRM */ + if (H5I_VFL != H5I_get_type(H5FD_IOC_g)) { + HDfprintf(stdout, "H5FD_ioc_init(): calling H5FD_register()\n"); + H5FD_IOC_g = H5FD_register(&H5FD_ioc_g, sizeof(H5FD_class_t), FALSE); + } +#endif /* JRM */ + +#if 0 /* JRM */ + HDfprintf(stdout, "H5FD_ioc_init() IOC registered. id = %lld \n", (int64_t)H5FD_IOC_g); +#endif /* JRM */ + + if (H5I_INVALID_HID == H5FD_IOC_g) + HGOTO_ERROR(H5E_ID, H5E_CANTREGISTER, FAIL, "unable to register file driver ID") + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* H5FD__init_package() */ + +/*------------------------------------------------------------------------- + * Function: H5FD_ioc_init + * + * Purpose: Initialize the ioc driver by registering it with the + * library. + * + * Return: Success: The driver ID for the ioc driver.
+ * Failure: Negative + *------------------------------------------------------------------------- + */ +hid_t +H5FD_ioc_init(void) +{ + hid_t ret_value = H5I_INVALID_HID; + + FUNC_ENTER_NOAPI(FAIL) + + H5FD_IOC_LOG_CALL(FUNC); + + if (H5I_VFL != H5I_get_type(H5FD_IOC_g)) + H5FD_IOC_g = H5FDregister(&H5FD_ioc_g); + + ret_value = H5FD_IOC_g; + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5FD_ioc_init() */ + +#if 0 /* JRM */ /* delete if all goes well */ +/*------------------------------------------------------------------------- + * Function: H5FD_ioc_set_shutdown_flag + * + * Purpose: IO Concentrator threads are told to terminate their service + * loop and exit by setting 'shutdown_flag' to a non-zero + * value. + * + * Return: None + * + *------------------------------------------------------------------------- + */ +void +H5FD_ioc_set_shutdown_flag(int flag) +{ + sf_shutdown_flag = flag; + if (H5FD_IOC_g > 0) + usleep(100); + return; +} /* end H5FD_ioc_set_shutdown_flag() */ +#endif /* JRM */ + +/*--------------------------------------------------------------------------- + * Function: H5FD__ioc_term + * + * Purpose: Shut down the ioc VFD. + * + * Returns: SUCCEED (Can't fail) + *--------------------------------------------------------------------------- + */ +static herr_t +H5FD__ioc_term(void) +{ + FUNC_ENTER_NOAPI_NOINIT_NOERR + // FUNC_ENTER_STATIC_NOERR + +#if 0 /* JRM */ + HDfprintf(stdout, "Entering H5FD__ioc_term().\n"); +#endif /* JRM */ + + H5FD_IOC_LOG_CALL(FUNC); + + /* Reset VFL ID */ + H5FD_IOC_g = 0; + +#if 0 /* JRM */ + HDfprintf(stdout, "Exiting H5FD__ioc_term().\n"); +#endif /* JRM */ + + FUNC_LEAVE_NOAPI(SUCCEED) +} /* end H5FD__ioc_term() */ + +/*------------------------------------------------------------------------- + * Function: H5FD__copy_plist + * + * Purpose: Sanity-wrapped H5P_copy_plist() for each channel. + * Utility function for operation in multiple locations. + * + * Return: 0 on success, -1 on error. + *------------------------------------------------------------------------- + */ +static int +H5FD__copy_plist(hid_t fapl_id, hid_t *id_out_ptr) +{ + int ret_value = 0; + H5P_genplist_t *plist_ptr = NULL; + + FUNC_ENTER_STATIC + + H5FD_IOC_LOG_CALL(FUNC); + + HDassert(id_out_ptr != NULL); + + if (FALSE == H5P_isa_class(fapl_id, H5P_FILE_ACCESS)) + HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, -1, "not a file access property list"); + + plist_ptr = (H5P_genplist_t *)H5I_object(fapl_id); + if (NULL == plist_ptr) + HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, -1, "unable to get property list"); + + *id_out_ptr = H5P_copy_plist(plist_ptr, FALSE); + if (H5I_INVALID_HID == *id_out_ptr) + HGOTO_ERROR(H5E_VFL, H5E_BADTYPE, -1, "unable to copy file access property list"); + +done: + FUNC_LEAVE_NOAPI(ret_value); +} /* end H5FD__copy_plist() */ + +/*------------------------------------------------------------------------- + * Function: H5Pset_fapl_ioc + * + * Purpose: Sets the file access property list to use the + * ioc driver. 
+ * + * Return: SUCCEED/FAIL + *------------------------------------------------------------------------- + */ +herr_t +H5Pset_fapl_ioc(hid_t fapl_id, H5FD_ioc_config_t *vfd_config) +{ + H5FD_ioc_config_t *info = NULL; + H5P_genplist_t * plist_ptr = NULL; + herr_t ret_value = SUCCEED; + + FUNC_ENTER_API(FAIL) + H5TRACE2("e", "i*!", fapl_id, vfd_config); + + H5FD_IOC_LOG_CALL(FUNC); + + if (H5FD_IOC_FAPL_T_MAGIC != vfd_config->common.magic) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid configuration (magic number mismatch)") + if (H5FD_CURR_IOC_FAPL_T_VERSION != vfd_config->common.version) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid config (version number mismatch)") + if (NULL == (plist_ptr = (H5P_genplist_t *)H5I_object(fapl_id))) + HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a valid property list") + + info = H5FL_CALLOC(H5FD_ioc_config_t); + if (NULL == info) + HGOTO_ERROR(H5E_VFL, H5E_CANTALLOC, FAIL, "unable to allocate file access property list struct") + + HDmemcpy(info, vfd_config, sizeof(H5FD_ioc_config_t)); + info->common.ioc_fapl_id = fapl_id; + ret_value = H5P_set_driver(plist_ptr, H5FD_IOC, info, NULL); + +done: + if (info) + info = H5FL_FREE(H5FD_ioc_config_t, info); + + FUNC_LEAVE_API(ret_value) +} /* end H5Pset_fapl_ioc() */ + +/*------------------------------------------------------------------------- + * Function: fapl_get_ioc_defaults + * + * Purpose: This is called by H5Pget_fapl_ioc when called with no + * established configuration info. This simply fills in + * the basics. This avoids the necessity of having + * the user write code to initialize the config structure. + * + * Return: SUCCEED/FAIL + *------------------------------------------------------------------------- + */ +static herr_t +fapl_get_ioc_defaults(H5FD_ioc_config_t *fa) +{ + herr_t ret_value = SUCCEED; + + fa->common.magic = H5FD_IOC_FAPL_T_MAGIC; + fa->common.version = H5FD_CURR_IOC_FAPL_T_VERSION; + fa->common.ioc_fapl_id = H5P_DEFAULT; + fa->common.stripe_count = 0; + fa->common.stripe_depth = H5FD_DEFAULT_STRIPE_DEPTH; + fa->common.ioc_selection = SELECT_IOC_ONE_PER_NODE; + + /* Specific to this IO Concentrator */ + fa->thread_pool_count = H5FD_IOC_THREAD_POOL_SIZE; + return (ret_value); +} /* end fapl_get_ioc_defaults() */ + +/*------------------------------------------------------------------------- + * Function: H5Pget_fapl_ioc + * + * Purpose: Returns information about the ioc file access property + * list through the structure config_out. + * + * Will fail if config_out is received without pre-set valid + * magic and version information.
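+ * + * As an illustrative sketch (not code from this patch), a caller might + * prepare config_out like so: + * + * H5FD_ioc_config_t cfg; + * memset(&cfg, 0, sizeof(cfg)); + * cfg.common.magic = H5FD_IOC_FAPL_T_MAGIC; + * cfg.common.version = H5FD_CURR_IOC_FAPL_T_VERSION; + * if (H5Pget_fapl_ioc(fapl_id, &cfg) < 0) + * (handle the error) + * + * On success, cfg holds either the previously-set IOC properties or the + * defaults filled in by fapl_get_ioc_defaults() above.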
+ * + * Return: SUCCEED/FAIL + *------------------------------------------------------------------------- + */ +herr_t +H5Pget_fapl_ioc(hid_t fapl_id, H5FD_ioc_config_t *config_out) +{ + const H5FD_ioc_config_t *config_ptr = NULL; + H5P_genplist_t * plist_ptr = NULL; + herr_t ret_value = SUCCEED; + + FUNC_ENTER_API(FAIL) + H5TRACE2("e", "i*!", fapl_id, config_out); + + H5FD_IOC_LOG_CALL(FUNC); + + /* Check arguments */ + if (config_out == NULL) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "config_out is NULL") + + plist_ptr = H5P_object_verify(fapl_id, H5P_FILE_ACCESS); + if (plist_ptr == NULL) { + HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access list") + } + + config_ptr = (const H5FD_ioc_config_t *)H5P_peek_driver_info(plist_ptr); + if (config_ptr == NULL) { + HDmemset(config_out, 0, sizeof(H5FD_ioc_config_t)); + ret_value = fapl_get_ioc_defaults(config_out); + } + else { + /* Copy the subfiling fapl data out */ + HDmemcpy(config_out, config_ptr, sizeof(H5FD_ioc_config_t)); + + /* Copy the driver info value */ + if (H5FD__copy_plist(config_ptr->common.ioc_fapl_id, &(config_out->common.ioc_fapl_id)) < 0) + HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "can't copy IOC FAPL"); + } + +done: + FUNC_LEAVE_API(ret_value) +} /* end H5Pget_fapl_ioc() */ + +/*------------------------------------------------------------------------- + * Function: H5FD__ioc_flush + * + * Purpose: Flushes all data to disk for the underlying file. + * + * Return: SUCCEED/FAIL + *------------------------------------------------------------------------- + */ +static herr_t +H5FD__ioc_flush(H5FD_t *_file, hid_t dxpl_id, hbool_t closing) +{ + H5FD_ioc_t *file = (H5FD_ioc_t *)_file; + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_STATIC + + H5FD_IOC_LOG_CALL(FUNC); + + /* Public API for dxpl "context" */ + if (H5FDflush(file->ioc_file, dxpl_id, closing) < 0) + HGOTO_ERROR(H5E_VFL, H5E_CANTFLUSH, FAIL, "unable to flush R/W file") + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5FD__ioc_flush() */ + +/*------------------------------------------------------------------------- + * Function: H5FD__ioc_read + * + * Purpose: Reads SIZE bytes of data from the R/W channel, beginning at + * address ADDR into buffer BUF according to data transfer + * properties in DXPL_ID. + * + * Return: Success: SUCCEED + * The read result is written into the BUF buffer + * which should be allocated by the caller. + * Failure: FAIL + * The contents of BUF are undefined.
+ *------------------------------------------------------------------------- + */ +static herr_t +H5FD__ioc_read(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, + size_t size, void *buf) +{ + H5FD_ioc_t *file = (H5FD_ioc_t *)_file; + herr_t ret_value = SUCCEED; + + FUNC_ENTER_STATIC + + H5FD_IOC_LOG_CALL(FUNC); + + HDassert(file && file->pub.cls); + HDassert(buf); + + /* Check for overflow conditions */ + if (!H5F_addr_defined(addr)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "addr undefined, addr = %llu", (unsigned long long)addr) + if (REGION_OVERFLOW(addr, size)) + HGOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, FAIL, "addr overflow, addr = %llu", (unsigned long long)addr) + + /* Public API for dxpl "context" */ + if (H5FDread(file->ioc_file, type, dxpl_id, addr, size, buf) < 0) + HGOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL, "Reading from R/W channel failed") + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5FD__ioc_read() */ + +/*------------------------------------------------------------------------- + * Function: H5FD__ioc_write + * + * Purpose: Writes SIZE bytes of data to R/W and W/O channels, beginning + * at address ADDR from buffer BUF according to data transfer + * properties in DXPL_ID. + * + * Return: SUCCEED/FAIL + *------------------------------------------------------------------------- + */ +static herr_t +H5FD__ioc_write(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t dxpl_id, haddr_t addr, size_t size, + const void *buf) +{ + H5FD_ioc_t * file_ptr = (H5FD_ioc_t *)_file; + H5P_genplist_t *plist_ptr = NULL; + herr_t ret_value = SUCCEED; + hid_t h5_fid; + + FUNC_ENTER_STATIC + + if (NULL == (plist_ptr = (H5P_genplist_t *)H5I_object(dxpl_id))) + HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a property list") + + addr += _file->base_addr; + h5_fid = (hid_t)file_ptr->inode; + ret_value = H5FD__write_vector_internal(h5_fid, 1, &addr, &size, &buf); + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5FD__ioc_write() */ + +static herr_t +H5FD__ioc_read_vector(H5FD_t *_file, hid_t dxpl_id, uint32_t count, H5FD_mem_t types[], haddr_t addrs[], + size_t sizes[], void *bufs[] /* out */) +{ + H5FD_ioc_t *file_ptr = (H5FD_ioc_t *)_file; + herr_t ret_value = SUCCEED; /* Return value */ + hid_t h5_fid; + + FUNC_ENTER_STATIC + + /* Check arguments */ + if (!file_ptr) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "file pointer cannot be NULL") + + if ((!types) && (count > 0)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "types parameter can't be NULL if count is positive") + + if ((!addrs) && (count > 0)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "addrs parameter can't be NULL if count is positive") + + if ((!sizes) && (count > 0)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "sizes parameter can't be NULL if count is positive") + + if ((!bufs) && (count > 0)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "bufs parameter can't be NULL if count is positive") + + /* Get the default dataset transfer property list if the user didn't provide + * one */ + if (H5P_DEFAULT == dxpl_id) { + dxpl_id = H5P_DATASET_XFER_DEFAULT; + } + else { + if (TRUE != H5P_isa_class(dxpl_id, H5P_DATASET_XFER)) + HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a data transfer property list") + } + + h5_fid = (hid_t)file_ptr->inode; + ret_value = H5FD__read_vector_internal(h5_fid, count, addrs, sizes, bufs); + +done: + FUNC_LEAVE_NOAPI(ret_value) +} + +static herr_t +H5FD__ioc_write_vector(H5FD_t *_file, hid_t dxpl_id, uint32_t count, H5FD_mem_t types[], haddr_t addrs[], + size_t sizes[],
const void *bufs[] /* in */) +{ + H5FD_ioc_t *file = (H5FD_ioc_t *)_file; + herr_t ret_value = SUCCEED; /* Return value */ + hid_t h5_fid; + + FUNC_ENTER_STATIC + + /* Check arguments */ + if (!file) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "file pointer cannot be NULL") + + if ((!types) && (count > 0)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "types parameter can't be NULL if count is positive") + + if ((!addrs) && (count > 0)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "addrs parameter can't be NULL if count is positive") + + if ((!sizes) && (count > 0)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "sizes parameter can't be NULL if count is positive") + + if ((!bufs) && (count > 0)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "bufs parameter can't be NULL if count is positive") + + /* Get the default dataset transfer property list if the user didn't provide + * one */ + if (H5P_DEFAULT == dxpl_id) { + dxpl_id = H5P_DATASET_XFER_DEFAULT; + } + else { + if (TRUE != H5P_isa_class(dxpl_id, H5P_DATASET_XFER)) + HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a data transfer property list") + } + h5_fid = (hid_t)file->inode; + ret_value = H5FD__write_vector_internal(h5_fid, count, addrs, sizes, bufs); + +done: + + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5FD__ioc_write_vector() */ + +/*------------------------------------------------------------------------- + * Function: H5FD__ioc_fapl_get + * + * Purpose: Returns a file access property list which indicates how the + * specified file is being accessed. The return list could be + * used to access another file the same way. + * + * Return: Success: Ptr to new file access property list with all + * members copied from the file struct. + * Failure: NULL + *------------------------------------------------------------------------- + */ +static void * +H5FD__ioc_fapl_get(H5FD_t *_file) +{ + H5FD_ioc_t *file = (H5FD_ioc_t *)_file; + void * ret_value = NULL; + + FUNC_ENTER_STATIC_NOERR + + H5FD_IOC_LOG_CALL(FUNC); + + ret_value = H5FD__ioc_fapl_copy(&(file->fa)); + + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5FD__ioc_fapl_get() */ + +/*------------------------------------------------------------------------- + * Function: H5FD__ioc_fapl_copy + * + * Purpose: Copies the file access properties. + * + * Return: Success: Pointer to a new property list info structure.
+ * Failure: NULL + *------------------------------------------------------------------------- + */ +static void * +H5FD__ioc_fapl_copy(const void *_old_fa) +{ + const H5FD_ioc_config_t *old_fa_ptr = (const H5FD_ioc_config_t *)_old_fa; + H5FD_ioc_config_t * new_fa_ptr = NULL; + void * ret_value = NULL; + + FUNC_ENTER_STATIC + + H5FD_IOC_LOG_CALL(FUNC); + + HDassert(old_fa_ptr); + + new_fa_ptr = H5FL_CALLOC(H5FD_ioc_config_t); + if (NULL == new_fa_ptr) + HGOTO_ERROR(H5E_VFL, H5E_CANTALLOC, NULL, "unable to allocate ioc FAPL") + + HDmemcpy(new_fa_ptr, old_fa_ptr, sizeof(H5FD_ioc_config_t)); + HDstrncpy(new_fa_ptr->common.file_path, old_fa_ptr->common.file_path, H5FD_IOC_PATH_MAX); + + /* Copy the FAPL */ + if (H5FD__copy_plist(old_fa_ptr->common.ioc_fapl_id, &(new_fa_ptr->common.ioc_fapl_id)) < 0) + HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, NULL, "can't copy the IOC FAPL"); + + ret_value = (void *)new_fa_ptr; + +done: + if (NULL == ret_value) + if (new_fa_ptr) + new_fa_ptr = H5FL_FREE(H5FD_ioc_config_t, new_fa_ptr); + + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5FD__ioc_fapl_copy() */ + +/*-------------------------------------------------------------------------- + * Function: H5FD__ioc_fapl_free + * + * Purpose: Releases the file access lists + * + * Return: SUCCEED/FAIL + *-------------------------------------------------------------------------- + */ +static herr_t +H5FD__ioc_fapl_free(void *_fapl) +{ + H5FD_ioc_config_t *fapl = (H5FD_ioc_config_t *)_fapl; + herr_t ret_value = SUCCEED; + + FUNC_ENTER_STATIC + + H5FD_IOC_LOG_CALL(FUNC); + + /* Check arguments */ + HDassert(fapl); + + if (H5I_dec_ref(fapl->common.ioc_fapl_id) < 0) + HGOTO_ERROR(H5E_VFL, H5E_CANTDEC, FAIL, "can't close IOC FAPL ID") + + /* Free the property list */ + fapl = H5FL_FREE(H5FD_ioc_config_t, fapl); + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5FD__ioc_fapl_free() */ + +/*------------------------------------------------------------------------- + * Function: H5FD__ioc_open + * + * Purpose: Creates and/or opens a file as an HDF5 file. + * + * Return: Success: A pointer to a new file data structure. The + * public fields will be initialized by the + * caller, which is always H5FD_open().
+ * Failure: NULL +------------------------------------------------------------------------- + */ +static H5FD_t * +H5FD__ioc_open(const char *name, unsigned flags, hid_t ioc_fapl_id, haddr_t maxaddr) +{ + H5FD_ioc_t * file_ptr = NULL; /* Ioc VFD info */ + const H5FD_ioc_config_t *fapl_ptr = NULL; /* Driver-specific property list */ + H5FD_class_t * driver = NULL; /* VFD for file */ + H5FD_driver_prop_t driver_prop; /* Property for driver ID & info */ + H5P_genplist_t * plist_ptr = NULL; + H5FD_t * ret_value = NULL; + int l_error = 0, g_error = 0, mpi_enabled = 0; + int mpi_code; /* MPI return code */ + + FUNC_ENTER_STATIC + + H5FD_IOC_LOG_CALL(FUNC); + +#if 0 /* JRM */ /* delete this eventually */ + HDfprintf(stdout, "\n\nH5FD__ioc_open: entering.\n\n"); + HDfflush(stdout); +#endif /* JRM */ /* delete this eventually */ + + /* Check arguments */ + if (!name || !*name) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, NULL, "invalid file name") + if (0 == maxaddr || HADDR_UNDEF == maxaddr) + HGOTO_ERROR(H5E_ARGS, H5E_BADRANGE, NULL, "bogus maxaddr") + if (ADDR_OVERFLOW(maxaddr)) + HGOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, NULL, "bogus maxaddr") + if ((H5P_FILE_ACCESS_DEFAULT == ioc_fapl_id) || (H5FD_IOC != H5Pget_driver(ioc_fapl_id))) + /* presupposes that H5P_FILE_ACCESS_DEFAULT is not an ioc */ + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, NULL, "driver is not ioc") + + /* We should validate that the application has been initialized + * with MPI_Init_thread and that the library supports + * MPI_THREAD_MULTIPLE + */ + if (MPI_Initialized(&mpi_enabled) == MPI_SUCCESS) { + int mpi_provides = 0; + MPI_Query_thread(&mpi_provides); + if (mpi_provides != MPI_THREAD_MULTIPLE) { + HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, NULL, "Subfiling requires the use of MPI_THREAD_MULTIPLE") + } + } + + file_ptr = (H5FD_ioc_t *)H5FL_CALLOC(H5FD_ioc_t); + if (NULL == file_ptr) + HGOTO_ERROR(H5E_VFL, H5E_CANTALLOC, NULL, "unable to allocate file struct") + + /* Get some basic MPI information */ + MPI_Comm_size(MPI_COMM_WORLD, &file_ptr->mpi_size); + MPI_Comm_rank(MPI_COMM_WORLD, &file_ptr->mpi_rank); + + /* Get the driver-specific file access properties */ + plist_ptr = (H5P_genplist_t *)H5I_object(ioc_fapl_id); + if (NULL == plist_ptr) + HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, NULL, "not a file access property list") + + fapl_ptr = (const H5FD_ioc_config_t *)H5P_peek_driver_info(plist_ptr); + if (NULL == fapl_ptr) + HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, NULL, "unable to get VFL driver info") + + /* Fill in the file config values */ + HDmemcpy(&file_ptr->fa, fapl_ptr, sizeof(H5FD_ioc_config_t)); + + /* Extend the config info with file_path and file_dir */ + if (HDrealpath(name, file_ptr->fa.common.file_path) != NULL) { + char *path = HDstrdup(file_ptr->fa.common.file_path); + char *directory = dirname(path); + HDstrcpy(file_ptr->fa.common.file_dir, directory); + HDfree(path); + } + + /* Copy the ioc FAPL.
*/ + if (H5FD__copy_plist(fapl_ptr->common.ioc_fapl_id, &(file_ptr->fa.common.ioc_fapl_id)) < 0) + HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, NULL, "can't copy IOC FAPL"); + + /* Check the "native" driver (sec2 or mpio) */ + plist_ptr = (H5P_genplist_t *)H5I_object(fapl_ptr->common.ioc_fapl_id); + + if (H5P_peek(plist_ptr, H5F_ACS_FILE_DRV_NAME, &driver_prop) < 0) + HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, NULL, "can't get driver ID & info") + if (NULL == (driver = (H5FD_class_t *)H5I_object(driver_prop.driver_id))) + HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, NULL, "invalid driver ID in file access property list") + + if (HDstrncmp(driver->name, "sec2", 4) == 0) { + uint64_t inode_id = (uint64_t)-1; + int ioc_flags = O_RDWR; + + /* Translate the HDF5 file open flags into standard POSIX open flags */ + if (flags & H5F_ACC_TRUNC) + ioc_flags |= O_TRUNC; + if (flags & H5F_ACC_CREAT) + ioc_flags |= O_CREAT; + + /* sec2 open the file */ + file_ptr->ioc_file = + H5FD_open(file_ptr->fa.common.file_path, flags, fapl_ptr->common.ioc_fapl_id, HADDR_UNDEF); + if (file_ptr->ioc_file) { + h5_stat_t sb; + H5FD_sec2_t *hdf_file = (H5FD_sec2_t *)file_ptr->ioc_file; + if (HDfstat(hdf_file->fd, &sb) < 0) + HSYS_GOTO_ERROR(H5E_FILE, H5E_BADFILE, NULL, "unable to fstat file") + /* Get the inode info and copy the open file descriptor + * The latter is used to pass to the subfiling code to use + * as an alternative to opening a new subfiling file, e.g. nnn_0_of_N.h5 + */ + file_ptr->inode = inode_id = sb.st_ino; + } + else { + /* The two-step file opening approach may be + * the root cause for the sec2 open to return a NULL. + * It is prudent then, to collectively fail (early) in this case. + */ + l_error = 1; + } + MPI_Allreduce(&l_error, &g_error, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); + if (g_error) { + if (file_ptr->ioc_file) + H5FD_close(file_ptr->ioc_file); + HGOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, NULL, "unable to open file = %s\n", name) + } + + /* See: H5FDsubfile_int.c: returns error count! */ + if (H5FD__open_subfiles((void *)&file_ptr->fa, inode_id, ioc_flags) > 0) + HGOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, NULL, "unable to open subfiling files = %s\n", name) + + else if (file_ptr->inode > 0) { /* No errors opening the subfiles */ + subfiling_context_t *sf_context = get__subfiling_object(file_ptr->fa.common.context_id); + if (sf_context && sf_context->topology->rank_is_ioc) { + if (initialize_ioc_threads(sf_context) < 0) { + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, NULL, "Unable to initialize IOC threads") + } + } + } + } + else { + HDputs("We only support sec2 file opens at the moment."); + HGOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, NULL, "unable to open file = %s\n", name) + } + + ret_value = (H5FD_t *)file_ptr; + +done: + if (NULL == ret_value) { + if (file_ptr) { + if (file_ptr->ioc_file) + H5FD_close(file_ptr->ioc_file); + H5FL_FREE(H5FD_ioc_t, file_ptr); + } + } /* end if error */ +#if 1 /* JRM */ + /* run a barrier just before exit. The objective is to + * ensure that the IOCs are fully up and running before + * we proceed. Note that this barrier is not sufficient + * by itself -- we also need code in initialize_ioc_threads() + * to wait until the main IOC thread has finished its + * initialization. + */ + /* TODO: don't use MPI_COMM_WORLD here -- use communicator supplied in the open instead */ + /* Addendum: Consider creating a copy of the supplied communicator for exclusive use by + * the VFD. I can't say that this is necessary, but it is a plausible cause + * of the hangs observed with subfiling.
-- JRM + */ + +#if 0 /* JRM */ /* remove eventually */ + HDfprintf(stdout, "\nH5FD__ioc_open: entering terminal barrier.\n"); + HDfflush(stdout); +#endif /* JRM */ /* remove eventually */ + + if ((mpi_code = MPI_Barrier(MPI_COMM_WORLD)) != MPI_SUCCESS) { + HMPI_DONE_ERROR(NULL, "Barrier failed", mpi_code) + } +#endif /* JRM */ +#if 0 /* JRM */ + HDfprintf(stdout, "\n\nH5FD__ioc_open: exiting.\n\n"); + HDfflush(stdout); +#endif /* JRM */ + + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5FD__ioc_open() */ + +/*------------------------------------------------------------------------- + * Function: H5FD__ioc_close + * + * Purpose: Closes files + * + * Return: Success: SUCCEED + * Failure: FAIL, file not closed. + *------------------------------------------------------------------------- + */ +static herr_t +H5FD__ioc_close(H5FD_t *_file) +{ + H5FD_ioc_t *file = (H5FD_ioc_t *)_file; + herr_t ret_value = SUCCEED; + // subfiling_context_t *sf_context = NULL; + + FUNC_ENTER_STATIC + + H5FD_IOC_LOG_CALL(FUNC); + + /* Sanity check */ + HDassert(file); +#ifdef VERBOSE + sf_context = (subfiling_context_t *)get__subfiling_object(file->fa.common.context_id); + if (sf_context->topology->rank_is_ioc) + printf("[%s %d] fd=%d\n", __func__, file->mpi_rank, sf_context->sf_fid); + else + printf("[%s %d] fd=*\n", __func__, file->mpi_rank); + fflush(stdout); +#endif + + if (H5I_dec_ref(file->fa.common.ioc_fapl_id) < 0) + HGOTO_ERROR(H5E_VFL, H5E_CANTDEC, FAIL, "can't close IOC FAPL") + + /* Call the sec2 close */ + if (file->ioc_file) { + if (H5FD_close(file->ioc_file) == FAIL) + HGOTO_ERROR(H5E_VFL, H5E_CANTCLOSEFILE, FAIL, "unable to close HDF5 file") + } + + /* See: H5FDsubfile_int.c */ + if (H5FD__close_subfiles(file->fa.common.context_id) < 0) + HGOTO_ERROR(H5E_VFL, H5E_CANTCLOSEFILE, FAIL, "unable to close subfiling file(s)") + + /* dup'ed in the H5FD__ioc_open function (see above) */ + HDclose(file->hdf_fd_dup); + /* Release the file info */ + file = H5FL_FREE(H5FD_ioc_t, file); + file = NULL; + +done: + + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5FD__ioc_close() */ + +/*------------------------------------------------------------------------- + * Function: H5FD__ioc_get_eoa + * + * Purpose: Returns the end-of-address marker for the file. The EOA + * marker is the first address past the last byte allocated in + * the format address space. + * + * Return: Success: The end-of-address-marker + * + * Failure: HADDR_UNDEF + *------------------------------------------------------------------------- + */ +static haddr_t +H5FD__ioc_get_eoa(const H5FD_t *_file, H5FD_mem_t type) +{ + const H5FD_ioc_t *file = (const H5FD_ioc_t *)_file; + haddr_t ret_value = HADDR_UNDEF; + + FUNC_ENTER_STATIC + + H5FD_IOC_LOG_CALL(FUNC); + + /* Sanity check */ + HDassert(file); + HDassert(file->ioc_file); + + if ((ret_value = H5FD_get_eoa(file->ioc_file, type)) == HADDR_UNDEF) + HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, HADDR_UNDEF, "unable to get eoa") + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5FD__ioc_get_eoa */ + +/*------------------------------------------------------------------------- + * Function: H5FD__ioc_set_eoa + * + * Purpose: Set the end-of-address marker for the file. This function is + * called shortly after an existing HDF5 file is opened in order + * to tell the driver where the end of the HDF5 data is located.
+ * + * Return: SUCCEED/FAIL + *------------------------------------------------------------------------- + */ +static herr_t +H5FD__ioc_set_eoa(H5FD_t *_file, H5FD_mem_t type, haddr_t addr) +{ + H5FD_ioc_t *file = (H5FD_ioc_t *)_file; + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_STATIC + + H5FD_IOC_LOG_CALL(FUNC) + + /* Sanity check */ + HDassert(file); + HDassert(file->ioc_file); + + if (H5FD_set_eoa(file->ioc_file, type, addr) < 0) + HGOTO_ERROR(H5E_VFL, H5E_CANTSET, FAIL, "H5FDset_eoa failed for R/W file") + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5FD__ioc_set_eoa() */ + +/*------------------------------------------------------------------------- + * Function: H5FD__ioc_get_eof + * + * Purpose: Returns the end-of-file marker for the file. The EOF + * marker is the first address past the last byte stored in + * the file. + * + * Return: Success: The end-of-file marker + * + * Failure: HADDR_UNDEF + *------------------------------------------------------------------------- + */ +static haddr_t +H5FD__ioc_get_eof(const H5FD_t *_file, H5FD_mem_t type) +{ + const H5FD_ioc_t * file = (const H5FD_ioc_t *)_file; + haddr_t ret_value = HADDR_UNDEF; /* Return value */ + subfiling_context_t *sf_context = NULL; + + FUNC_ENTER_STATIC + + H5FD_IOC_LOG_CALL(FUNC); + + /* Sanity check */ + HDassert(file); + HDassert(file->ioc_file); + + sf_context = get__subfiling_object(file->fa.common.context_id); + if (sf_context) { + ret_value = (haddr_t)sf_context->sf_eof; + goto done; + } + + if (HADDR_UNDEF == (ret_value = H5FD_get_eof(file->ioc_file, type))) + HGOTO_ERROR(H5E_VFL, H5E_CANTGET, HADDR_UNDEF, "unable to get eof") + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5FD__ioc_get_eof */ + +/*------------------------------------------------------------------------- + * Function: H5FD__ioc_truncate + * + * Purpose: Notify driver to truncate the file back to the allocated size. + * + * Return: SUCCEED/FAIL + *------------------------------------------------------------------------- + */ +static herr_t +H5FD__ioc_truncate(H5FD_t *_file, hid_t dxpl_id, hbool_t closing) +{ + H5FD_ioc_t *file = (H5FD_ioc_t *)_file; + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_STATIC + + H5FD_IOC_LOG_CALL(FUNC); + + HDassert(file); + HDassert(file->ioc_file); + + if (H5FDtruncate(file->ioc_file, dxpl_id, closing) < 0) + HGOTO_ERROR(H5E_VFL, H5E_CANTUPDATE, FAIL, "unable to truncate R/W file") +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5FD__ioc_truncate */ + +/*------------------------------------------------------------------------- + * Function: H5FD__ioc_sb_size + * + * Purpose: Obtains the number of bytes required to store the driver file + * access data in the HDF5 superblock. + * + * Return: Success: Number of bytes required. + * + * Failure: 0 if an error occurs or if the driver has no + * data to store in the superblock.
+ * + * NOTE: no public API for H5FD_sb_size, it needs to be added + *------------------------------------------------------------------------- + */ +static hsize_t +H5FD__ioc_sb_size(H5FD_t *_file) +{ + H5FD_ioc_t *file = (H5FD_ioc_t *)_file; + hsize_t ret_value = 0; + + FUNC_ENTER_STATIC_NOERR + + H5FD_IOC_LOG_CALL(FUNC); + + /* Sanity check */ + HDassert(file); + HDassert(file->ioc_file); + + if (file->ioc_file) + ret_value = H5FD_sb_size(file->ioc_file); + + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5FD__ioc_sb_size */ + +/*------------------------------------------------------------------------- + * Function: H5FD__ioc_sb_encode + * + * Purpose: Encode driver-specific data into the output arguments. + * + * Return: SUCCEED/FAIL + *------------------------------------------------------------------------- + */ +static herr_t +H5FD__ioc_sb_encode(H5FD_t *_file, char *name /*out*/, unsigned char *buf /*out*/) +{ + H5FD_ioc_t *file = (H5FD_ioc_t *)_file; + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_STATIC + + H5FD_IOC_LOG_CALL(FUNC); + + /* Sanity check */ + HDassert(file); + HDassert(file->ioc_file); + + if (file->ioc_file && H5FD_sb_encode(file->ioc_file, name, buf) < 0) + HGOTO_ERROR(H5E_VFL, H5E_CANTENCODE, FAIL, "unable to encode the superblock in R/W file") + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5FD__ioc_sb_encode */ + +/*------------------------------------------------------------------------- + * Function: H5FD__ioc_sb_decode + * + * Purpose: Decodes the driver information block. + * + * Return: SUCCEED/FAIL + * + * NOTE: no public API for H5FD_sb_load, need to add + *------------------------------------------------------------------------- + */ +static herr_t +H5FD__ioc_sb_decode(H5FD_t *_file, const char *name, const unsigned char *buf) +{ + H5FD_ioc_t *file = (H5FD_ioc_t *)_file; + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_STATIC + + H5FD_IOC_LOG_CALL(FUNC); + + /* Sanity check */ + HDassert(file); + HDassert(file->ioc_file); + + if (H5FD_sb_load(file->ioc_file, name, buf) < 0) + HGOTO_ERROR(H5E_VFL, H5E_CANTDECODE, FAIL, "unable to decode the superblock in R/W file") + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5FD__ioc_sb_decode */ + +/*------------------------------------------------------------------------- + * Function: H5FD__ioc_cmp + * + * Purpose: Compare the keys of two files. + * + * Return: Success: A value like strcmp() + * Failure: Must never fail + *------------------------------------------------------------------------- + */ +static int +H5FD__ioc_cmp(const H5FD_t *_f1, const H5FD_t *_f2) +{ + const H5FD_ioc_t *f1 = (const H5FD_ioc_t *)_f1; + const H5FD_ioc_t *f2 = (const H5FD_ioc_t *)_f2; + int ret_value = 0; /* Return value */ + + FUNC_ENTER_STATIC_NOERR + + H5FD_IOC_LOG_CALL(FUNC); + + HDassert(f1); + HDassert(f2); + + ret_value = H5FD_cmp(f1->ioc_file, f2->ioc_file); + + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5FD__ioc_cmp */ + +/*-------------------------------------------------------------------------- + * Function: H5FD__ioc_get_handle + * + * Purpose: Returns a pointer to the file handle of low-level virtual + * file driver.
+ * + * Return: SUCCEED/FAIL + *-------------------------------------------------------------------------- + */ +static herr_t +H5FD__ioc_get_handle(H5FD_t *_file, hid_t H5_ATTR_UNUSED fapl, void **file_handle) +{ + H5FD_ioc_t *file = (H5FD_ioc_t *)_file; + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_STATIC + + H5FD_IOC_LOG_CALL(FUNC); + + /* Check arguments */ + HDassert(file); + HDassert(file->ioc_file); + HDassert(file_handle); + + if (H5FD_get_vfd_handle(file->ioc_file, file->fa.common.ioc_fapl_id, file_handle) < 0) + HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "unable to get handle of R/W file") + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5FD__ioc_get_handle */ + +/*-------------------------------------------------------------------------- + * Function: H5FD__ioc_lock + * + * Purpose: Sets a file lock. + * + * Return: SUCCEED/FAIL + *-------------------------------------------------------------------------- + */ +static herr_t +H5FD__ioc_lock(H5FD_t *_file, hbool_t H5_ATTR_UNUSED rw) +{ + H5FD_ioc_t *file = (H5FD_ioc_t *)_file; /* VFD file struct */ + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_STATIC + + H5FD_IOC_LOG_CALL(FUNC); + + HDassert(file); + HDassert(file->ioc_file); + +#if 1 + if (HDflock(file->hdf_fd_dup, LOCK_SH) < 0) { + perror("flock"); + HGOTO_ERROR(H5E_VFL, H5E_CANTLOCKFILE, FAIL, "unable to lock R/W file") + } +#else + /* Place the lock on each file */ + if (H5FD_lock(file->ioc_file, rw) < 0) + HGOTO_ERROR(H5E_VFL, H5E_CANTLOCKFILE, FAIL, "unable to lock R/W file") +#endif + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5FD__ioc_lock */ + +/*-------------------------------------------------------------------------- + * Function: H5FD__ioc_unlock + * + * Purpose: Removes a file lock. + * + * Return: SUCCEED/FAIL + *-------------------------------------------------------------------------- + */ +static herr_t +H5FD__ioc_unlock(H5FD_t *_file) +{ + H5FD_ioc_t *file = (H5FD_ioc_t *)_file; /* VFD file struct */ + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_STATIC + + H5FD_IOC_LOG_CALL(FUNC); + + /* Check arguments */ + HDassert(file); +#if 1 + if (HDflock(file->hdf_fd_dup, LOCK_UN) < 0) { + perror("flock"); + HGOTO_ERROR(H5E_VFL, H5E_CANTUNLOCKFILE, FAIL, "unable to unlock R/W file") + } +#else + if (file->ioc_file != NULL) + if (H5FD_unlock(file->ioc_file) < 0) + HGOTO_ERROR(H5E_VFL, H5E_CANTUNLOCKFILE, FAIL, "unable to unlock W/O file") +#endif + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5FD__ioc_unlock */ + +/*------------------------------------------------------------------------- + * Function: H5FD__ioc_query + * + * Purpose: Set the flags that this VFL driver is capable of supporting. + * (listed in H5FDpublic.h) + * + * Return: SUCCEED/FAIL + *------------------------------------------------------------------------- + */ +static herr_t +H5FD__ioc_query(const H5FD_t *_file, unsigned long *flags /* out */) +{ + const H5FD_ioc_t *file_ptr = (const H5FD_ioc_t *)_file; + herr_t ret_value = SUCCEED; + + FUNC_ENTER_STATIC + + H5FD_IOC_LOG_CALL(FUNC); + + if (file_ptr == NULL) { + if (flags) + *flags = 0; + } + else if (file_ptr->ioc_file) { + if (H5FDquery(file_ptr->ioc_file, flags) < 0) + HGOTO_ERROR(H5E_VFL, H5E_CANTLOCK, FAIL, "unable to query R/W file"); + } + else { + /* There is no file. Because this is a pure passthrough VFD, + * it has no features of its own.
+         */
+        if (flags)
+            *flags = 0;
+    }
+
+done:
+    FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5FD__ioc_query() */
+
+/*-------------------------------------------------------------------------
+ * Function:    H5FD__ioc_alloc
+ *
+ * Purpose:     Allocate file memory.
+ *
+ * Return:      Address of allocated space (HADDR_UNDEF if error).
+ *-------------------------------------------------------------------------
+ */
+static haddr_t
+H5FD__ioc_alloc(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, hsize_t size)
+{
+    H5FD_ioc_t *file      = (H5FD_ioc_t *)_file; /* VFD file struct */
+    haddr_t     ret_value = HADDR_UNDEF;         /* Return value */
+
+    FUNC_ENTER_STATIC
+
+    H5FD_IOC_LOG_CALL(FUNC);
+
+    /* Check arguments */
+    HDassert(file);
+    HDassert(file->ioc_file);
+
+    /* Allocate the memory in the underlying R/W file and return its address */
+    if ((ret_value = H5FDalloc(file->ioc_file, type, dxpl_id, size)) == HADDR_UNDEF)
+        HGOTO_ERROR(H5E_VFL, H5E_CANTINIT, HADDR_UNDEF, "unable to allocate for R/W file")
+
+done:
+    FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5FD__ioc_alloc() */
+
+/*-------------------------------------------------------------------------
+ * Function:    H5FD__ioc_get_type_map
+ *
+ * Purpose:     Retrieve the memory type mapping for this file
+ *
+ * Return:      SUCCEED/FAIL
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5FD__ioc_get_type_map(const H5FD_t *_file, H5FD_mem_t *type_map)
+{
+    const H5FD_ioc_t *file      = (const H5FD_ioc_t *)_file;
+    herr_t            ret_value = SUCCEED;
+
+    FUNC_ENTER_STATIC
+
+    H5FD_IOC_LOG_CALL(FUNC);
+
+    /* Check arguments */
+    HDassert(file);
+    HDassert(file->ioc_file);
+
+    /* Retrieve memory type mapping for R/W channel only */
+    if (H5FD_get_fs_type_map(file->ioc_file, type_map) < 0)
+        HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "unable to get type map for R/W file")
+
+done:
+    FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5FD__ioc_get_type_map() */
+
+/*-------------------------------------------------------------------------
+ * Function:    H5FD__ioc_free
+ *
+ * Purpose:     Free previously allocated file memory.
+ *
+ * Return:      SUCCEED/FAIL
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5FD__ioc_free(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, hsize_t size)
+{
+    H5FD_ioc_t *file      = (H5FD_ioc_t *)_file; /* VFD file struct */
+    herr_t      ret_value = SUCCEED;             /* Return value */
+
+    FUNC_ENTER_STATIC
+
+    H5FD_IOC_LOG_CALL(FUNC);
+
+    /* Check arguments */
+    HDassert(file);
+    HDassert(file->ioc_file);
+
+    if (H5FDfree(file->ioc_file, type, dxpl_id, addr, size) < 0)
+        HGOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "unable to free for R/W file")
+
+done:
+    FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5FD__ioc_free() */
+
+void
+H5FD_ioc_wait_thread_main(void)
+{
+    return;
+}
+
+void
+H5FD_ioc_finalize_threads(void)
+{
+    return;
+}
diff --git a/src/H5FDioc.h b/src/H5FDioc.h
new file mode 100644
index 00000000000..f9f32eb4d47
--- /dev/null
+++ b/src/H5FDioc.h
@@ -0,0 +1,143 @@
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ * Copyright by The HDF Group.                                               *
+ * All rights reserved.                                                      *
+ *                                                                           *
+ * This file is part of HDF5.  The full HDF5 copyright notice, including     *
+ * terms governing use, modification, and redistribution, is contained in    *
+ * the COPYING file, which can be found at the root of the source code       *
+ * distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases.
* + * If you do not have access to either file, you may request a copy from * + * help@hdfgroup.org. * + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +/* + * Purpose: The public header file for the "io concentrator" driver. + * This provides a similar functionality to that of the subfiling driver + * but introduces the necessary file access functionality via a multi- + * threading MPI service + */ + +#ifndef H5FDioc_H +#define H5FDioc_H + +#define H5FD_IOC (H5FD_ioc_init()) +#define H5FD_IOC_VALUE H5_VFD_IOC + +#ifndef H5FD_IOC_FAPL_T_MAGIC +#define H5FD_CURR_IOC_FAPL_T_VERSION 1 +#define H5FD_IOC_FAPL_T_MAGIC 0xFED21331 +#endif + +/* Maximum length of a filename/path string in the Write-Only channel, + * including the NULL-terminator. + */ +#define H5FD_IOC_PATH_MAX 4096 +#define H5FD_IOC_THREAD_POOL_SIZE 4 + +/* + * Define the various constants to allow different allocations + * of subfile ranks. The choices are self explanatory, starting + * with the default of one IO Concentrator (IOC) per node and + * lastly, defining a fixed number. + */ +typedef enum { + SELECT_IOC_ONE_PER_NODE = 0, /* Default */ + SELECT_IOC_EVERY_NTH_RANK, /* Starting at rank 0, select-next += N */ + SELECT_IOC_WITH_CONFIG, /* NOT IMPLEMENTED: Read-from-file */ + SELECT_IOC_TOTAL, /* Starting at rank 0, mpi_size / total */ + ioc_selection_options /* (Uses same selection as every Nth rank) */ +} ioc_selection_t; + +/* + * In addition to the common configuration fields, we can have + * VFD specific fields. Here's one for the IO Concentrator VFD. + * + * thread_pool_count (int32_t) + * Indicate the number of helper threads that we want for + * creating a thread pool + * + * ---------------------------------------------------------------------------- + */ +#define H5FD_SUBFILING_PATH_MAX 4096 + +typedef struct config_common_t { + uint32_t magic; /* set to H5FD_SUBFILING_FAPL_T_MAGIC */ + uint32_t version; /* set to H5FD_CURR_SUBFILING_FAPL_T_VERSION */ + int32_t stripe_count; /* How many io concentrators */ + int64_t stripe_depth; /* Max # of bytes in contiguous IO to an IOC */ + ioc_selection_t ioc_selection; /* Method to select IO Concentrators */ + hid_t ioc_fapl_id; /* The hid_t value of the stacked VFD */ + int64_t context_id; /* The value used to lookup an IOC context */ + char file_dir[H5FD_SUBFILING_PATH_MAX + 1]; /* Directory where we find files */ + char file_path[H5FD_SUBFILING_PATH_MAX + 1]; /* The user defined filename */ +} config_common_t; + +typedef struct H5FD_ioc_config_t { + config_common_t common; + int32_t thread_pool_count; +} H5FD_ioc_config_t; + +/* The information of this ioc */ +typedef struct H5FD_ioc_t { + H5FD_t pub; /* public stuff, must be first */ + int fd; /* the filesystem file descriptor */ + + H5FD_ioc_config_t fa; /* driver-specific file access properties */ + int mpi_rank; + int mpi_size; + H5FD_t * ioc_file; /* native HDF5 file pointer (sec2) */ + +#ifndef H5_HAVE_WIN32_API + /* On most systems the combination of device and i-node number uniquely + * identify a file. Note that Cygwin, MinGW and other Windows POSIX + * environments have the stat function (which fakes inodes) + * and will use the 'device + inodes' scheme as opposed to the + * Windows code further below. + */ + dev_t device; /* file device number */ + ino_t inode; /* file i-node number */ +#else + /* Files in windows are uniquely identified by the volume serial + * number and the file index (both low and high parts). 
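+     * (This is the same file-identification scheme used by the sec2 VFD.)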
+ * + * There are caveats where these numbers can change, especially + * on FAT file systems. On NTFS, however, a file should keep + * those numbers the same until renamed or deleted (though you + * can use ReplaceFile() on NTFS to keep the numbers the same + * while renaming). + * + * See the MSDN "BY_HANDLE_FILE_INFORMATION Structure" entry for + * more information. + * + * http://msdn.microsoft.com/en-us/library/aa363788(v=VS.85).aspx + */ + DWORD nFileIndexLow; + DWORD nFileIndexHigh; + DWORD dwVolumeSerialNumber; + + HANDLE hFile; /* Native windows file handle */ +#endif /* H5_HAVE_WIN32_API */ + int hdf_fd_dup; +} H5FD_ioc_t; + +#ifdef __cplusplus +extern "C" { +#endif +H5_DLL hid_t H5FD_ioc_init(void); +H5_DLL herr_t H5Pset_fapl_ioc(hid_t fapl_id, H5FD_ioc_config_t *config_ptr); +H5_DLL herr_t H5Pget_fapl_ioc(hid_t fapl_id, H5FD_ioc_config_t *config_ptr); +H5_DLL void H5FD_ioc_set_shutdown_flag(int flag); +H5_DLL void H5FD_ioc_wait_thread_main(void); +H5_DLL void H5FD_ioc_finalize_threads(void); +H5_DLL int initialize_ioc_threads(void *_sf_context); +H5_DLL int tpool_add_work(void *work); +H5_DLL void begin_thread_exclusive(void); +H5_DLL void end_thread_exclusive(void); +H5_DLL void ioc__wait_for_serialize(void *msg); +H5_DLL void ioc__release_dependency(int qid); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/H5FDioc_threads.c b/src/H5FDioc_threads.c new file mode 100644 index 00000000000..a8b4f9cfe3e --- /dev/null +++ b/src/H5FDioc_threads.c @@ -0,0 +1,1208 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Copyright by The HDF Group. * + * Copyright by the Board of Trustees of the University of Illinois. * + * All rights reserved. * + * * + * This file is part of HDF5. The full HDF5 copyright notice, including * + * terms governing use, modification, and redistribution, is contained in * + * the COPYING file, which can be found at the root of the source code * + * distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases. * + * If you do not have access to either file, you may request a copy from * + * help@hdfgroup.org. * + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +#include "H5FDsubfiling.h" +#include "mercury_thread.h" +#include "mercury_thread_mutex.h" +#include "mercury_thread_pool.h" + +/* + * NOTES: + * Rather than re-create the code for creating and managing a thread pool, + * I'm utilizing a reasonably well tested implementation from the mercury + * project. At some point, we should revisit this decision or possibly + * directly link against the mercury library. This would make sense if + * we move away from using MPI as the messaging infrastructure and instead + * use mercury for that purpose... 
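+ *
+ * As a rough sketch (names abbreviated), the pattern borrowed from
+ * mercury is simply:
+ *
+ *     hg_thread_pool_t *pool = NULL;
+ *     hg_thread_pool_init(n_threads, &pool);   /* create helper threads  */
+ *     work.func = handle_work_request;         /* per-request callback   */
+ *     work.args = request;
+ *     hg_thread_pool_post(pool, &work);        /* queue for a helper     */
+ *     ...
+ *     hg_thread_pool_destroy(pool);            /* join and free          */
+ *
+ * which is the sequence used by initialize_ioc_threads(), the dispatch
+ * functions, and finalize_ioc_threads() below.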
+ */ + +static hg_thread_mutex_t ioc_mutex = PTHREAD_MUTEX_INITIALIZER; +static hg_thread_mutex_t ioc_thread_mutex = PTHREAD_MUTEX_INITIALIZER; +static hg_thread_mutex_t ioc_serialize_mutex = PTHREAD_MUTEX_INITIALIZER; +static hg_thread_pool_t *ioc_thread_pool = NULL; +static hg_thread_t ioc_thread; + +#ifndef HG_TEST_NUM_THREADS_DEFAULT +#define HG_TEST_NUM_THREADS_DEFAULT 4 +#endif + +extern int ioc_main(int64_t context_id); + +static int pool_concurrent_max = 0; +static struct hg_thread_work *pool_request = NULL; + +/* Prototypes */ +void __attribute__((destructor)) finalize_ioc_threads(void); +int wait_for_thread_main(void); +bool tpool_is_empty(void); + +#if 1 /* JRM */ + +extern H5FD_ioc_io_queue_t io_queue_g; + +#endif /* JRM */ + +/*------------------------------------------------------------------------- + * Function: local ioc_thread_main + * + * Purpose: An IO Concentrator instance is initialized with the + * specified subfiling context. + * + * Return: The IO concentrator thread executes as long as the HDF5 + * file associated with this context is open. At file close, + * the thread will return from 'ioc_main' and the thread + * exit status will be checked by the main program. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. + * + *------------------------------------------------------------------------- + */ +static HG_THREAD_RETURN_TYPE +ioc_thread_main(void *arg) +{ + int64_t context_id = *(int64_t *)arg; + hg_thread_ret_t thread_ret = (hg_thread_ret_t)0; + + /* Pass along the subfiling_context_t */ + ioc_main(context_id); + + HDfree(arg); + return thread_ret; +} + +/*------------------------------------------------------------------------- + * Function: initialize_ioc_threads + * + * Purpose: The principal entry point to initialize the execution + * context for an IO Concentrator (IOC). The main thread + * is responsible for receiving IO requests from each + * HDF5 "client" and distributing those to helper threads + * for actual processing. We initialize a fixed number + * of helper threads by creating a thread_pool. + * + * Return: SUCCESS (0) or FAIL (-1) if any errors are detected + * for the multi-threaded initialization. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. + * + *------------------------------------------------------------------------- + */ +int +initialize_ioc_threads(void *_sf_context) +{ + int status; + int file_open_count; + subfiling_context_t *sf_context = _sf_context; + unsigned int thread_pool_count = HG_TEST_NUM_THREADS_DEFAULT; + int64_t * context_id = (int64_t *)HDmalloc(sizeof(int64_t)); + int world_size = sf_context->topology->app_layout->world_size; + size_t alloc_size = ((size_t)world_size * sizeof(struct hg_thread_work)); + char * envValue; + double t_start = 0.0, t_end = 0.0; + +#if 0 /* JRM */ /* delete this evenutually */ + HDprintf("\nworld_size = %d\n", world_size); +#endif /* JRM */ + +#if 1 /* JRM */ /* try doubling the size of the pool_request array */ + world_size *= 4; + alloc_size *= 4; +#endif /* JRM */ + + assert(context_id != NULL); + + file_open_count = atomic_load(&sf_file_open_count); + atomic_fetch_add(&sf_file_open_count, 1); + + if (file_open_count > 0) + return 0; + + t_start = MPI_Wtime(); + + /* Initialize the main IOC thread input argument. + * Each IOC request will utilize this context_id which is + * consistent across all MPI ranks, to ensure that requests + * involving reference counting are correctly using the + * correct file contexts. 
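+     * (The context_id is heap-allocated because the IOC main thread may
+     * outlive this function's stack frame; ioc_thread_main() frees it
+     * when the thread exits.)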
+ */ + context_id[0] = sf_context->sf_context_id; + + if (pool_request == NULL) { + if ((pool_request = (struct hg_thread_work *)malloc(alloc_size)) == NULL) { + perror("malloc error"); + return -1; + } + else + pool_concurrent_max = world_size; + } + + memset(pool_request, 0, alloc_size); + + /* Initialize a couple of mutex variables that are used + * during IO concentrator operations to serialize + * access to key objects, e.g. reference counting. + */ + status = hg_thread_mutex_init(&ioc_mutex); + if (status) { + puts("hg_thread_mutex_init failed"); + goto err_exit; + } + status = hg_thread_mutex_init(&ioc_thread_mutex); + if (status) { + puts("hg_thread_mutex_init failed"); + goto err_exit; + } + +#if 1 /* JRM */ /* needed for new dispatch code */ + + status = hg_thread_mutex_init(&(io_queue_g.q_mutex)); + if (status) { + puts("hg_thread_mutex_init failed for io_queue_g.q_mutex"); + goto err_exit; + } + +#endif /* JRM */ + + /* Allow experimentation with the number of helper threads */ + if ((envValue = getenv("IOC_THREAD_POOL_COUNT")) != NULL) { + int value_check = atoi(envValue); + if (value_check > 0) { + thread_pool_count = (unsigned int)value_check; + } + } + + /* Initialize a thread pool for the IO Concentrator to use */ + status = hg_thread_pool_init(thread_pool_count, &ioc_thread_pool); + if (status) { + puts("hg_thread_pool_init failed"); + goto err_exit; + } + + /* Arguments to hg_thread_create are: + * 1. A pointer to reference the created thread. + * 2. User function pointer for the new thread to execute. + * 3. Pointer to the input argument that gets passed along to the user + * function. + */ + atomic_init(&sf_ioc_ready, 0); + status = hg_thread_create(&ioc_thread, ioc_thread_main, (void *)context_id); + if (status) { + puts("hg_thread_create failed"); + goto err_exit; + } + else { /* wait until ioc_main() reports that it is ready */ + while (atomic_load(&sf_ioc_ready) != 1) { + + usleep(20); + } + } + +#ifndef NDEBUG + t_end = MPI_Wtime(); + if (sf_verbose_flag) { + if (sf_context->topology->subfile_rank == 0) { + HDprintf("%s: time = %lf seconds\n", __func__, (t_end - t_start)); + HDfflush(stdout); + } + } +#endif + return 0; + +err_exit: + return -1; +} + +/*------------------------------------------------------------------------- + * Function: finalize_ioc_threads + * + * Purpose: Normally we shouldn't have any IOC threads running by the + * program exits. If we do, this destructor function gets + * called to cleanup + * + * Return: None + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. + * + *------------------------------------------------------------------------- + */ +void __attribute__((destructor)) finalize_ioc_threads(void) +{ + if (ioc_thread_pool != NULL) { + hg_thread_pool_destroy(ioc_thread_pool); + ioc_thread_pool = NULL; + } +} + +static const char * +translate_opcode(io_op_t op) +{ + switch (op) { + case READ_OP: + return "READ_OP"; + break; + case WRITE_OP: + return "WRITE_OP"; + break; + case OPEN_OP: + return "OPEN_OP"; + break; + case CLOSE_OP: + return "CLOSE_OP"; + break; + case TRUNC_OP: + return "TRUNC_OP"; + break; + case GET_EOF_OP: + return "GET_EOF_OP"; + break; + case FINI_OP: + return "FINI_OP"; + break; + case LOGGING_OP: + return "LOGGING_OP"; + break; + } + return "unknown"; +} +/*------------------------------------------------------------------------- + * Function: local: handle_work_request + * + * Purpose: Handle a work request from the thread pool work queue. 
+ * We dispatch the specific function as indicated by the + * TAG that has been added to the work request by the + * IOC main thread (which is just a copy of the MPI tag + * associated with the RPC message) and provide the subfiling + * context associated with the HDF5 file. + * + * Any status associated with the function processing is + * returned directly to the client via ACK or NACK messages. + * + * Return: (none) Doesn't fail. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. + * + *------------------------------------------------------------------------- + */ +#if 0 /* JRM */ /* Original version -- expects sf_work_request_t * as its argument */ +static HG_THREAD_RETURN_TYPE +handle_work_request(void *arg) +{ +#if 1 /* JRM */ + int curr_io_ops_pending; +#endif /* JRM */ + int status = 0; + hg_thread_ret_t ret = 0; + sf_work_request_t * msg = (sf_work_request_t *)arg; + int64_t file_context_id = msg->header[2]; + subfiling_context_t *sf_context = NULL; + + sf_context = get__subfiling_object(file_context_id); + assert(sf_context != NULL); + +#if 0 /* JRM */ + HDfprintf(stdout, "\nhandle_work_request: context_id = %lld, msg->tag = %d\n", (long long)(file_context_id), (int)(msg->tag)); + HDfflush(stdout); +#endif /* JRM */ + + atomic_fetch_add(&sf_work_pending, 1); // atomic + msg->in_progress = 1; + switch (msg->tag) { + case WRITE_INDEP: + status = queue_write_indep(msg, msg->subfile_rank, msg->source, sf_context->sf_data_comm); + break; + + case READ_INDEP: + if (msg->serialize) + ioc__wait_for_serialize(arg); // wait for dependency + status = queue_read_indep(msg, msg->subfile_rank, msg->source, sf_context->sf_data_comm); + break; + + default: + HDprintf("[ioc(%d)] received message tag(%x)from rank %d\n", msg->subfile_rank, msg->tag, + msg->source); + status = -1; + break; + } + fflush(stdout); + +#if 1 /* JRM */ + curr_io_ops_pending = atomic_fetch_sub(&sf_io_ops_pending, 1); + HDassert(curr_io_ops_pending > 0); +#endif /* JRM */ + + atomic_fetch_sub(&sf_work_pending, 1); // atomic + msg->in_progress = 0; + if (msg->dependents) { + ioc__release_dependency(msg->depend_id); + msg->dependents = 0; + } + if (status < 0) { + HDprintf("[ioc(%d) %s]: request(%s) filename=%s from " + "rank(%d), size=%ld, offset=%ld FAILED\n", + msg->subfile_rank, __func__, translate_opcode((io_op_t)msg->tag), sf_context->sf_filename, + msg->source, msg->header[0], msg->header[1]); + + fflush(stdout); + } + return ret; +} + +#else /* JRM */ /* Modified version -- expects H5FD_ioc_io_queue_entry_t * as its argument */ + +static HG_THREAD_RETURN_TYPE +handle_work_request(void *arg) +{ +#if 1 /* JRM */ + int curr_io_ops_pending; +#endif /* JRM */ + int status = 0; + hg_thread_ret_t ret = 0; + H5FD_ioc_io_queue_entry_t *q_entry_ptr = (H5FD_ioc_io_queue_entry_t *)arg; + sf_work_request_t * msg = &(q_entry_ptr->wk_req); + int64_t file_context_id = msg->header[2]; + subfiling_context_t * sf_context = NULL; + + HDassert(q_entry_ptr); + HDassert(q_entry_ptr->magic == H5FD_IOC__IO_Q_ENTRY_MAGIC); + HDassert(q_entry_ptr->in_progress); + + sf_context = get__subfiling_object(file_context_id); + assert(sf_context != NULL); + +#if 1 /* JRM */ + atomic_fetch_add(&sf_work_pending, 1); // atomic +#endif /* JRM */ + msg->in_progress = 1; +#if 0 /* JRM */ + HDfprintf(stdout, "\n\nhandle_work_request: beginning execution of request %d. 
op = %d, offset/len = %lld/%lld.\n", + q_entry_ptr->counter, (msg->tag), (long long)(msg->header[1]), (long long)(msg->header[0])); + HDfflush(stdout); +#endif /* JRM */ + switch (msg->tag) { + case WRITE_INDEP: + status = queue_write_indep(msg, msg->subfile_rank, msg->source, sf_context->sf_data_comm, + q_entry_ptr->counter); + break; + + case READ_INDEP: + status = queue_read_indep(msg, msg->subfile_rank, msg->source, sf_context->sf_data_comm); + break; + + case TRUNC_OP: + status = sf_truncate(sf_context->sf_fid, q_entry_ptr->wk_req.header[0], + sf_context->topology->subfile_rank); + break; + + case GET_EOF_OP: + /* Use of data comm to return EOF to the requesting rank seems a bit odd, but follow existing + * convention for now. + */ + status = report_sf_eof(msg, msg->subfile_rank, msg->source, sf_context->sf_data_comm); + break; + + default: + HDprintf("[ioc(%d)] received message tag(%x)from rank %d\n", msg->subfile_rank, msg->tag, + msg->source); + status = -1; + break; + } + fflush(stdout); + + atomic_fetch_sub(&sf_work_pending, 1); // atomic + + if (status < 0) { + HDprintf("[ioc(%d) %s]: request(%s) filename=%s from " + "rank(%d), size=%ld, offset=%ld FAILED\n", + msg->subfile_rank, __func__, translate_opcode((io_op_t)msg->tag), sf_context->sf_filename, + msg->source, msg->header[0], msg->header[1]); + + fflush(stdout); + } + +#if 1 /* JRM */ + curr_io_ops_pending = atomic_load(&sf_io_ops_pending); + if (curr_io_ops_pending <= 0) { + + HDprintf("\n\nhandle_work_request: curr_io_ops_pending = %d, op = %d, offset/len = %lld/%lld.\n\n", + curr_io_ops_pending, (msg->tag), (long long)(msg->header[1]), (long long)(msg->header[0])); + HDfflush(stdout); + } + + HDassert(curr_io_ops_pending > 0); +#endif /* JRM */ + + /* complete the I/O request */ + H5FD_ioc__complete_io_q_entry(q_entry_ptr); + + HDassert(atomic_load(&sf_io_ops_pending) >= 0); + + /* Check the I/O Queue to see if there are any dispatchable entries */ + H5FD_ioc__dispatch_elegible_io_q_entries(); + + return ret; +} + +#endif /* JRM */ /* Modified version -- expects H5FD_ioc_io_queue_entry_t * as its argument */ + +void +ioc__wait_for_serialize(void *_work) +{ + sf_work_request_t *work = (sf_work_request_t *)_work; + volatile int waiting = 1; + while (waiting) { + usleep(5); + hg_thread_mutex_lock(&ioc_serialize_mutex); + waiting = work->serialize; + hg_thread_mutex_unlock(&ioc_serialize_mutex); + } +} + +void +ioc__release_dependency(int qid) +{ + sf_work_request_t *work = (sf_work_request_t *)pool_request[qid].args; + hg_thread_mutex_lock(&ioc_serialize_mutex); + work->serialize = 0; + hg_thread_mutex_unlock(&ioc_serialize_mutex); +} + +static int +check__overlap(void *_work, int current_index, int *conflict_id) +{ + sf_work_request_t *work = (sf_work_request_t *)_work; + sf_work_request_t *next = NULL; + int index, count = 0; + /* Search backward thru the queue of work requests */ + + for (index = current_index; count < pool_concurrent_max; count++, index--) { + if (index == 0) { + index = pool_concurrent_max - 1; + } + if (index == current_index) + return 0; + if ((next = (sf_work_request_t *)(pool_request[index].args)) == NULL) + continue; + /* The queued operation need NOT be running at present... 
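+         * Each request describes the byte interval
+         * [header[1], header[1] + header[0] - 1] (offset and length);
+         * a conflict exists when the new request's interval intersects a
+         * queued request's interval and at least one of the two is a
+         * WRITE_INDEP operation.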
         */
+        else /* if (next->in_progress) */ {
+            if (work->tag == WRITE_INDEP) {
+                /* a WRITE should not overlap with anything else */
+                int64_t n_data_size  = next->header[0];
+                int64_t n_offset     = next->header[1];
+                int64_t n_max_offset = (n_offset + n_data_size) - 1;
+                int64_t w_data_size  = work->header[0];
+                int64_t w_offset     = work->header[1];
+                int64_t w_max_offset = (w_offset + w_data_size) - 1;
+                if ((w_max_offset >= n_offset) && (w_max_offset < n_max_offset)) {
+                    next->dependents = 1;
+                    next->depend_id  = current_index;
+                    work->serialize  = true;
+                    *conflict_id     = index;
+                    return 1;
+                }
+                else if ((w_offset <= n_max_offset) && (w_offset > n_offset)) {
+                    next->dependents = 1;
+                    next->depend_id  = current_index;
+                    work->serialize  = true;
+                    *conflict_id     = index;
+                    return 1;
+                }
+            }
+            /* The work->tag indicates READ, so only check for a conflicting WRITE */
+            else if (next->tag == WRITE_INDEP) {
+                int64_t n_data_size  = next->header[0];
+                int64_t n_offset     = next->header[1];
+                int64_t n_max_offset = (n_offset + n_data_size) - 1;
+                int64_t w_data_size  = work->header[0];
+                int64_t w_offset     = work->header[1];
+                int64_t w_max_offset = (w_offset + w_data_size) - 1;
+                if ((w_max_offset >= n_offset) && (w_max_offset < n_max_offset)) {
+                    next->dependents = 1;
+                    next->depend_id  = current_index;
+                    work->serialize  = true;
+                    *conflict_id     = index;
+                    return 1;
+                }
+                else if ((w_offset <= n_max_offset) && (w_offset > n_offset)) {
+                    next->dependents = 1;
+                    next->depend_id  = current_index;
+                    work->serialize  = true;
+                    *conflict_id     = index;
+                    return 1;
+                }
+            }
+        }
+    }
+    return 0;
+}
+
+/*-------------------------------------------------------------------------
+ * Function:    tpool_add_work
+ *
+ * Purpose:     Initiate the handoff of client request processing to a
+ *              thread in the thread pool. A work request is created and
+ *              added to the thread pool work queue. Once queued, the
+ *              request is picked up and processed by one of the pool's
+ *              helper threads.
+ *
+ * Return:      0 (the work request is always queued)
+ *
+ * Programmer:  Richard Warren
+ *              7/17/2020
+ *
+ * Changes:     Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
+int
+tpool_add_work(void *_work)
+{
+#if 1 /* JRM */
+    int curr_io_ops_pending;
+#endif /* JRM */
+    static int         work_index  = 0;
+    int                conflict_id = -1;
+    sf_work_request_t *work        = (sf_work_request_t *)_work;
+    /* We have yet to start processing this new request... */
+    work->in_progress = 0;
+    hg_thread_mutex_lock(&ioc_mutex);
+    if (check__overlap(_work, work_index, &conflict_id) > 0) {
+#ifdef VERBOSE
+        const char *type = (work->tag == WRITE_INDEP ?
"WRITE" : "READ"); + sf_work_request_t *next = (sf_work_request_t *)(pool_request[conflict_id].args); + printf("%s - (%d) Found conflict: index=%d: work(offset=%ld,length=%ld) conflict(offset=%ld, " + "length=%ld)\n", + type, work_index, conflict_id, work->header[1], work->header[0], next->header[1], + next->header[0]); + fflush(stdout); +#endif + } + + if (work_index == pool_concurrent_max) + work_index = 0; + + pool_request[work_index].func = handle_work_request; + pool_request[work_index].args = work; +#if 1 /* JRM */ + curr_io_ops_pending = atomic_fetch_add(&sf_io_ops_pending, 1); + + HDassert(curr_io_ops_pending >= 0); + + if (curr_io_ops_pending >= pool_concurrent_max) { + + HDfprintf(stderr, "\n\n*** curr_io_ops_pending = %d >= pool_concurrent_max = %d ***\n\n", + curr_io_ops_pending, pool_concurrent_max); + } +#endif /* JRM */ + hg_thread_pool_post(ioc_thread_pool, &pool_request[work_index++]); + hg_thread_mutex_unlock(&ioc_mutex); + return 0; +} + +/*------------------------------------------------------------------------- + * Function: tpool_is_empty + * + * Purpose: Utility function to indicate to the caller whether there + * is any remaining work in the thread pool queue. + * + * Return: TRUE or FALSE to indicate whether the work queue is empty. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. + * + *------------------------------------------------------------------------- + */ +bool +tpool_is_empty(void) +{ + return HG_QUEUE_IS_EMPTY(&ioc_thread_pool->queue); +} + +/*------------------------------------------------------------------------- + * Function: begin_thread_exclusive + * + * Purpose: Mutex lock to restrict access to code or variables. + * + * Return: integer result of mutex_lock request. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. + * + *------------------------------------------------------------------------- + */ +void +begin_thread_exclusive(void) +{ + hg_thread_mutex_lock(&ioc_thread_mutex); +} + +/*------------------------------------------------------------------------- + * Function: end_thread_exclusive + * + * Purpose: Mutex unlock. Should only be called by the current holder + * of the locked mutex. + * + * Return: result of mutex_unlock operation. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. + * + *------------------------------------------------------------------------- + */ +void +end_thread_exclusive(void) +{ + hg_thread_mutex_unlock(&ioc_thread_mutex); +} + +/*------------------------------------------------------------------------- + * Function: wait_for_thread_main + * + * Purpose: Perform a thread_join on the IOC main thread. + * + * Return: SUCCESS (0) or FAIL (-1) if the thread_join + * does not succeed. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. + * + *------------------------------------------------------------------------- + */ +int +wait_for_thread_main(void) +{ + if (hg_thread_join(ioc_thread) != 0) { + return -1; + } + return 0; +} + +/*------------------------------------------------------------------------- + * Function: H5FD_ioc_take_down_thread_pool + * + * Purpose: Destroy the thread pool if it exists. + * + * This function should only be called on shutdown after all + * pending I/O operations have completed. + * + * Return: void + * + * Programmer: JRM -- 10/27/21 + * + * Changes: None. 
+ * + *------------------------------------------------------------------------- + */ +void +H5FD_ioc_take_down_thread_pool(void) +{ + HDassert(0 == atomic_load(&sf_io_ops_pending)); + + if (ioc_thread_pool != NULL) { + hg_thread_pool_destroy(ioc_thread_pool); + ioc_thread_pool = NULL; + } + + return; + +} /* H5FD_ioc_take_down_thread_pool() */ + +#if 1 /* JRM */ /* dispatch code -- move elsewhere? */ + +/*------------------------------------------------------------------------- + * Function: H5FD_ioc__alloc_io_q_entry + * + * Purpose: Allocate and initialize an instance of + * H5FD_ioc_io_queue_entry_t. Return pointer to the new + * instance on success, and NULL on failure. + * + * Return: Pointer to new instance of H5FD_ioc_io_queue_entry_t + * on success, and NULL on failure. + * + * Programmer: JRM -- 11/6/21 + * + * Changes: None. + * + *------------------------------------------------------------------------- + */ +/* TODO: update function when we decide how to handle error reporting in the IOCs */ +H5FD_ioc_io_queue_entry_t * +H5FD_ioc__alloc_io_q_entry(void) +{ + H5FD_ioc_io_queue_entry_t *q_entry_ptr = NULL; + + q_entry_ptr = (H5FD_ioc_io_queue_entry_t *)HDmalloc(sizeof(H5FD_ioc_io_queue_entry_t)); + + if (q_entry_ptr) { + + q_entry_ptr->magic = H5FD_IOC__IO_Q_ENTRY_MAGIC; + q_entry_ptr->next = NULL; + q_entry_ptr->prev = NULL; + q_entry_ptr->in_progress = FALSE; + q_entry_ptr->counter = 0; + + /* will memcpy the wk_req field, so don't bother to initialize */ + /* will initialize thread_wk field before use */ + +#if H5FD_IOC__COLLECT_STATS + q_entry_ptr->q_time = 0; + q_entry_ptr->dispatch_time = 0; +#endif /* H5FD_IOC__COLLECT_STATS */ + } + + return (q_entry_ptr); + +} /* H5FD_ioc__alloc_io_q_entry() */ + +/*------------------------------------------------------------------------- + * Function: H5FD_ioc__complete_io_q_entry + * + * Purpose: Update the IOC I/O Queue for the completion of an I/O + * request. + * + * To do this: + * + * 1) Remove the entry from the I/O Queue + * + * 2) If so configured, update statistics + * + * 3) Discard the instance of H5FD_ioc_io_queue_entry_t. + * + * Return: void. + * + * Programmer: JRM -- 11/7/21 + * + * Changes: None. + * + *------------------------------------------------------------------------- + */ +/* TODO: update function when we decide how to handle error reporting in the IOCs */ +/* TODO: Update for per file I/O Queue */ +void +H5FD_ioc__complete_io_q_entry(H5FD_ioc_io_queue_entry_t *entry_ptr) +{ +#if 0 /* H5FD_IOC__COLLECT_STATS */ + uint64_t queued_time; + uint64_t execution_time; +#endif /* H5FD_IOC__COLLECT_STATS */ + + HDassert(entry_ptr); + HDassert(entry_ptr->magic == H5FD_IOC__IO_Q_ENTRY_MAGIC); + + /* must obtain io_queue_g mutex before deleting and updating stats */ + hg_thread_mutex_lock(&(io_queue_g.q_mutex)); + + HDassert(io_queue_g.magic == H5FD_IOC__IO_Q_MAGIC); + HDassert(io_queue_g.num_pending + io_queue_g.num_in_progress == io_queue_g.q_len); + HDassert(io_queue_g.num_in_progress > 0); + + H5FD_IOC__Q_REMOVE(&io_queue_g, entry_ptr); + + io_queue_g.num_in_progress--; + + HDassert(io_queue_g.num_pending + io_queue_g.num_in_progress == io_queue_g.q_len); + + atomic_fetch_sub(&sf_io_ops_pending, 1); + +#if 0 /* JRM */ + HDfprintf(stdout, + "\n\nH5FD_ioc__complete_io_q_entry: request %d completed. 
op = %d, offset/len = %lld/%lld, q-ed/disp/ops_pend = %d/%d/%d.\n",
+              entry_ptr->counter, (entry_ptr->wk_req.tag), (long long)(entry_ptr->wk_req.header[1]),
+              (long long)(entry_ptr->wk_req.header[0]), io_queue_g.num_pending, io_queue_g.num_in_progress,
+              atomic_load(&sf_io_ops_pending));
+    HDfflush(stdout);
+#endif /* JRM */
+
+    HDassert(io_queue_g.q_len == atomic_load(&sf_io_ops_pending));
+
+#if H5FD_IOC__COLLECT_STATS
+#if 0 /* no place to collect this yet */
+    /* Compute the queued and execution time */
+    queued_time    = entry_ptr->dispatch_time - entry_ptr->q_time;
+    execution_time = H5_now_usec() - entry_ptr->dispatch_time;
+#endif
+
+    io_queue_g.requests_completed++;
+
+    entry_ptr->q_time = H5_now_usec();
+
+#endif /* H5FD_IOC__COLLECT_STATS */
+
+    hg_thread_mutex_unlock(&(io_queue_g.q_mutex));
+
+    HDassert(entry_ptr->wk_req.buffer == NULL);
+
+    H5FD_ioc__free_io_q_entry(entry_ptr);
+
+    entry_ptr = NULL;
+
+    return;
+
+} /* H5FD_ioc__complete_io_q_entry() */
+
+/*-------------------------------------------------------------------------
+ * Function:    H5FD_ioc__dispatch_elegible_io_q_entries
+ *
+ * Purpose:     Scan the IOC I/O Queue for dispatchable entries, and
+ *              dispatch any such entries found.
+ *
+ *              Do this by scanning the I/O queue from head to tail for
+ *              entries that:
+ *
+ *              1) Have not already been dispatched
+ *
+ *              2) Either:
+ *
+ *                 a) do not intersect with any prior entries on the
+ *                    I/O queue, or
+ *
+ *                 b) are read requests, and all intersections are with
+ *                    prior read requests.
+ *
+ *              Dispatch any such entries found.
+ *
+ *              Do this to maintain the POSIX semantics required by
+ *              HDF5.
+ *
+ *              Note that TRUNC_OPs and GET_EOF_OPs are a special case.
+ *              Specifically, no I/O queue entry can be dispatched if
+ *              there is a truncate or get EOF operation between it and
+ *              the head of the queue.  Further, a truncate or get EOF
+ *              request cannot be executed unless it is at the head of
+ *              the queue.
+ *
+ * Return:      void.
+ *
+ * Programmer:  JRM -- 11/7/21
+ *
+ * Changes:     None.
+ *
+ *-------------------------------------------------------------------------
+ */
+/* TODO: update function when we decide how to handle error reporting in the IOCs */
+/* TODO: Update for per file I/O Queue */
+/* TODO: Keep an eye on statistics and optimize this algorithm if necessary.  While it is O(N)
+ *       where N is the number of elements in the I/O Queue if there are no overlaps, it
+ *       can become O(N**2) in the worst case.
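+ *       (Each pending entry scans every prior queue entry for an
+ *       intersection, so a queue of N mutually overlapping requests
+ *       costs on the order of N**2 comparisons.)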
+ */ +void +H5FD_ioc__dispatch_elegible_io_q_entries(void) +{ + hbool_t conflict_detected; + int64_t entry_offset; + int64_t entry_len; + int64_t scan_offset; + int64_t scan_len; + H5FD_ioc_io_queue_entry_t *entry_ptr = NULL; + H5FD_ioc_io_queue_entry_t *scan_ptr = NULL; + + hg_thread_mutex_lock(&(io_queue_g.q_mutex)); + + HDassert(io_queue_g.magic == H5FD_IOC__IO_Q_MAGIC); + + entry_ptr = io_queue_g.q_head; + + /* sanity check on first element in the I/O queue */ + HDassert((entry_ptr == NULL) || (entry_ptr->prev == NULL)); + + while ((entry_ptr) && (io_queue_g.num_pending > 0)) { + + HDassert(entry_ptr->magic == H5FD_IOC__IO_Q_ENTRY_MAGIC); + + if (!entry_ptr->in_progress) { + + entry_offset = entry_ptr->wk_req.header[1]; + entry_len = entry_ptr->wk_req.header[0]; + + conflict_detected = FALSE; + + scan_ptr = entry_ptr->prev; + + HDassert((scan_ptr == NULL) || (scan_ptr->magic == H5FD_IOC__IO_Q_ENTRY_MAGIC)); + + if ((entry_ptr->wk_req.tag == TRUNC_OP) || (entry_ptr->wk_req.tag == GET_EOF_OP)) { + + if (scan_ptr != NULL) { + + /* the TRUNC_OP or GET_EOF_OP is not at the head of the queue, and thus cannot + * be dispatched. Further, no operation can be dispatched if a truncate request + * appears before it in the queue. Thus we have done all we can and will break + * out of the loop. + */ + break; + } + } + + while ((scan_ptr) && (!conflict_detected)) { + + /* check for overlaps */ + scan_offset = scan_ptr->wk_req.header[1]; + scan_len = scan_ptr->wk_req.header[0]; + + /* at present, I/O requests are scalar -- i.e. single blocks specified by offset and length. + * when this changes, this if statement will have to be updated accordingly. + */ + if (!(((scan_offset + scan_len) < entry_offset) || + ((entry_offset + entry_len) < scan_offset))) { + + /* the two request overlap -- unless they are both reads, we have detected a conflict */ + + /* TODO: update this if statement when we add collective I/O */ + if ((entry_ptr->wk_req.tag != READ_INDEP) || (scan_ptr->wk_req.tag != READ_INDEP)) { + + conflict_detected = TRUE; + } + } + + scan_ptr = scan_ptr->prev; + } + + if (!conflict_detected) { /* dispatch I/O request */ + + HDassert(scan_ptr == NULL); + HDassert(!entry_ptr->in_progress); + + entry_ptr->in_progress = TRUE; + + HDassert(io_queue_g.num_pending > 0); + + io_queue_g.num_pending--; + io_queue_g.num_in_progress++; + + HDassert(io_queue_g.num_pending + io_queue_g.num_in_progress == io_queue_g.q_len); + + entry_ptr->thread_wk.func = handle_work_request; + entry_ptr->thread_wk.args = entry_ptr; + +#if H5FD_IOC__COLLECT_STATS + if (io_queue_g.num_in_progress > io_queue_g.max_num_in_progress) { + + io_queue_g.max_num_in_progress = io_queue_g.num_in_progress; + } + + io_queue_g.requests_dispatched++; + +#if 0 /* JRM */ + HDfprintf(stdout, +"\n\nH5FD_ioc__dispatch_elegible_io_q_entries: request %d dispatched. 
op = %d, offset/len = %lld/%lld, q-ed/disp/ops_pend = %d/%d/%d.\n", + entry_ptr->counter, (entry_ptr->wk_req.tag), (long long)(entry_ptr->wk_req.header[1]), + (long long)(entry_ptr->wk_req.header[0]), io_queue_g.num_pending, io_queue_g.num_in_progress, + atomic_load(&sf_io_ops_pending)); + HDfflush(stdout); +#endif /* JRM */ + + entry_ptr->dispatch_time = H5_now_usec(); + +#endif /* H5FD_IOC__COLLECT_STATS */ + + hg_thread_pool_post(ioc_thread_pool, &(entry_ptr->thread_wk)); + } + } + else if ((entry_ptr->wk_req.tag == TRUNC_OP) || (entry_ptr->wk_req.tag == GET_EOF_OP)) { + + /* we have a truncate or get eof operation in progress -- thus no other operations + * can be dispatched until the truncate or get eof operation completes. Just break + * out of the loop. + */ + /* the truncate or get eof operation in progress must be at the head of the queue -- verify this + */ + HDassert(entry_ptr->prev == NULL); + + break; + } + + entry_ptr = entry_ptr->next; + } + + HDassert(io_queue_g.q_len == atomic_load(&sf_io_ops_pending)); + + hg_thread_mutex_unlock(&(io_queue_g.q_mutex)); + +} /* H5FD_ioc__dispatch_elegible_io_q_entries() */ + +/*------------------------------------------------------------------------- + * Function: H5FD_ioc__free_io_q_entry + * + * Purpose: Free the supplied instance of H5FD_ioc_io_queue_entry_t. + * + * Verify that magic field is set to + * H5FD_IOC__IO_Q_ENTRY_MAGIC, and that the next and prev + * fields are NULL. + * + * Return: void. + * + * Programmer: JRM -- 11/6/21 + * + * Changes: None. + * + *------------------------------------------------------------------------- + */ +/* TODO: update function when we decide how to handle error reporting in the IOCs */ +void +H5FD_ioc__free_io_q_entry(H5FD_ioc_io_queue_entry_t *q_entry_ptr) +{ + /* use assertions for error checking, since the following should never fail. */ + + HDassert(q_entry_ptr); + HDassert(q_entry_ptr->magic == H5FD_IOC__IO_Q_ENTRY_MAGIC); + HDassert(q_entry_ptr->next == NULL); + HDassert(q_entry_ptr->prev == NULL); + HDassert(q_entry_ptr->wk_req.buffer == NULL); + + q_entry_ptr->magic = 0; + + HDfree(q_entry_ptr); + + q_entry_ptr = NULL; + + return; + +} /* H5FD_ioc__free_c_io_q_entry() */ + +/*------------------------------------------------------------------------- + * Function: H5FD_ioc__queue_io_q_entry + * + * Purpose: Add an I/O request to the tail of the IOC I/O Queue. + * + * To do this, we must: + * + * 1) allocate a new instance of H5FD_ioc_io_queue_entry_t + * + * 2) Initialize the new instance and copy the supplied + * instance of sf_work_request_t into it. + * + * 3) Append it to the IOC I/O queue. + * + * Note that this does not dispatch the request even if it + * is eligible for immediate dispatch. This is done with + * a call to H5FD_ioc__dispatch_elegible_io_q_entries(). + * + * Return: void. + * + * Programmer: JRM -- 11/7/21 + * + * Changes: None. 
+ * + *------------------------------------------------------------------------- + */ +/* TODO: update function when we decide how to handle error reporting in the IOCs */ +/* TODO: Update for per file I/O Queue */ +void +H5FD_ioc__queue_io_q_entry(sf_work_request_t *wk_req_ptr) +{ + H5FD_ioc_io_queue_entry_t *entry_ptr = NULL; + + HDassert(wk_req_ptr); + HDassert(io_queue_g.magic == H5FD_IOC__IO_Q_MAGIC); + + entry_ptr = H5FD_ioc__alloc_io_q_entry(); + + HDassert(entry_ptr); + HDassert(entry_ptr->magic == H5FD_IOC__IO_Q_ENTRY_MAGIC); + + HDmemcpy((void *)(&(entry_ptr->wk_req)), (const void *)wk_req_ptr, sizeof(sf_work_request_t)); + + /* must obtain io_queue_g mutex before appending */ + hg_thread_mutex_lock(&(io_queue_g.q_mutex)); + + HDassert(io_queue_g.q_len == atomic_load(&sf_io_ops_pending)); + + entry_ptr->counter = io_queue_g.req_counter++; + + io_queue_g.num_pending++; + + H5FD_IOC__Q_APPEND(&io_queue_g, entry_ptr); + + atomic_fetch_add(&sf_io_ops_pending, 1); + +#if 0 /* JRM */ + HDfprintf(stdout, + "\n\nH5FD_ioc__queue_io_q_entry: request %d queued. op = %d, offset/len = %lld/%lld, q-ed/disp/ops_pend = %d/%d/%d.\n", + entry_ptr->counter, (entry_ptr->wk_req.tag), (long long)(entry_ptr->wk_req.header[1]), + (long long)(entry_ptr->wk_req.header[0]), io_queue_g.num_pending, io_queue_g.num_in_progress, + atomic_load(&sf_io_ops_pending)); + HDfflush(stdout); +#endif /* JRM */ + + HDassert(io_queue_g.num_pending + io_queue_g.num_in_progress == io_queue_g.q_len); + +#if H5FD_IOC__COLLECT_STATS + + entry_ptr->q_time = H5_now_usec(); + + if (io_queue_g.q_len > io_queue_g.max_q_len) { + + io_queue_g.max_q_len = io_queue_g.q_len; + } + + if (io_queue_g.num_pending > io_queue_g.max_num_pending) { + + io_queue_g.max_num_pending = io_queue_g.num_pending; + } + + if (entry_ptr->wk_req.tag == READ_INDEP) { + + io_queue_g.ind_read_requests++; + } + else if (entry_ptr->wk_req.tag == WRITE_INDEP) { + + io_queue_g.ind_write_requests++; + } + else if (entry_ptr->wk_req.tag == TRUNC_OP) { + + io_queue_g.truncate_requests++; + } + else if (entry_ptr->wk_req.tag == GET_EOF_OP) { + + io_queue_g.get_eof_requests++; + } + + io_queue_g.requests_queued++; + +#endif /* H5FD_IOC__COLLECT_STATS */ + +#if 0 /* JRM */ + if ( io_queue_g.q_len != atomic_load(&sf_io_ops_pending) ) { + + HDfprintf(stdout, "\n\nH5FD_ioc__queue_io_q_entry: io_queue_g.q_len = %d != %d = atomic_load(&sf_io_ops_pending).\n\n", + io_queue_g.q_len, atomic_load(&sf_io_ops_pending)); + HDfflush(stdout); + } +#endif /* JRM */ + + HDassert(io_queue_g.q_len == atomic_load(&sf_io_ops_pending)); + + hg_thread_mutex_unlock(&(io_queue_g.q_mutex)); + + return; + +} /* H5FD_ioc__queue_io_q_entry() */ + +#endif /* JRM */ /* dispatch code -- move elsewhere? 
*/ diff --git a/src/H5FDmpio.c b/src/H5FDmpio.c index 211da9cf5a2..98340758c83 100644 --- a/src/H5FDmpio.c +++ b/src/H5FDmpio.c @@ -1897,7 +1897,8 @@ H5FD__mpio_read_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t cou if (NULL == (sub_types = (int *)HDmalloc((size_t)count * sizeof(MPI_Datatype)))) HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't alloc sub types array") if (NULL == (sub_types_created = (uint8_t *)HDcalloc((size_t)count, 1))) { - sub_types = H5MM_free(sub_types); + /* MSB sub_types = H5MM_free(sub_types);*/ + H5MM_free(sub_types); HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't alloc sub types created array") } @@ -2076,6 +2077,7 @@ H5FD__mpio_read_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t cou /* Only retrieve bytes read if this rank _actually_ participated in I/O */ if (!rank0_bcast || (rank0_bcast && file->mpi_rank == 0)) { + /* How many bytes were actually read? */ #if MPI_VERSION >= 3 if (MPI_SUCCESS != (mpi_code = MPI_Get_elements_x(&mpi_stat, buf_type, &bytes_read))) @@ -2551,7 +2553,8 @@ H5FD__mpio_write_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t co if (NULL == (sub_types = (int *)HDmalloc((size_t)count * sizeof(MPI_Datatype)))) HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't alloc sub types array") if (NULL == (sub_types_created = (uint8_t *)HDcalloc((size_t)count, 1))) { - sub_types = H5MM_free(sub_types); + /*MSB sub_types = H5MM_free(sub_types);*/ + H5MM_free(sub_types); HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't alloc sub types created array") } diff --git a/src/H5FDpublic.h b/src/H5FDpublic.h index 221f569be0c..745d8aebcaf 100644 --- a/src/H5FDpublic.h +++ b/src/H5FDpublic.h @@ -54,6 +54,8 @@ #ifdef H5_HAVE_ROS3_VFD #define H5_VFD_ROS3 ((H5FD_class_value_t)(11)) #endif +#define H5_VFD_SUBFILING ((H5FD_class_value_t)(12)) +#define H5_VFD_IOC ((H5FD_class_value_t)(13)) /* VFD IDs below this value are reserved for library use. */ #define H5_VFD_RESERVED 256 diff --git a/src/H5FDsubfile_int.c b/src/H5FDsubfile_int.c new file mode 100644 index 00000000000..e6a0206f48b --- /dev/null +++ b/src/H5FDsubfile_int.c @@ -0,0 +1,1956 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Copyright by The HDF Group. * + * Copyright by the Board of Trustees of the University of Illinois. * + * All rights reserved. * + * * + * This file is part of HDF5. The full HDF5 copyright notice, including * + * terms governing use, modification, and redistribution, is contained in * + * the COPYING file, which can be found at the root of the source code * + * distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases. * + * If you do not have access to either file, you may request a copy from * + * help@hdfgroup.org. * + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +/* + * Programmer: Richard Warren + * Wednesday, July 1, 2020 + * + * Purpose: This is part of a parallel subfiling I/O driver. 
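+ *              It maintains the caches of subfiling contexts and
+ *              topologies, the HDF5-file-to-context map, and the
+ *              collective open and close of the subfiles themselves.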
+ * + */ + +#include "H5FDsubfiling.h" + +/***********/ +/* Headers */ +/***********/ +#include "H5CXprivate.h" /* API Contexts */ +#include "H5Dprivate.h" /* Datasets */ +#include "H5Eprivate.h" /* Error handling */ +#include "H5Iprivate.h" /* IDs */ +#include "H5Ipublic.h" /* IDs */ +#include "H5MMprivate.h" /* Memory management */ +#include "H5Pprivate.h" /* Property lists */ +#include "H5private.h" /* Generic Functions */ + +/* +========================================= +Private functions +========================================= +*/ + +/* +-------------------------------------------------------------------------- +sf_context_limit -- How many contexts can be recorded (default = 4) +sf_context_entries -- The number of contexts that are currently recorded. +sf_context_cache -- Storage for contexts +-------------------------------------------------------------------------- +*/ +// static size_t twoGIG_LIMIT = (1 << 30); +static size_t sf_context_limit = 16; +static subfiling_context_t *sf_context_cache = NULL; +static size_t sf_topology_limit = 4; +static sf_topology_t * sf_topology_cache = NULL; +static app_layout_t * sf_app_layout = NULL; + +static file_map_to_context_t *sf_open_file_map = NULL; +static int sf_file_map_size = 0; +#define DEFAULT_MAP_ENTRIES 8 + +/* +--------------------------------------- + Recording subfiling related statistics +--------------------------------------- + */ +static stat_record_t subfiling_stats[TOTAL_STAT_COUNT]; +#define SF_WRITE_OPS (subfiling_stats[WRITE_STAT].op_count) +#define SF_WRITE_TIME (subfiling_stats[WRITE_STAT].total / (double)subfiling_stats[WRITE_STAT].op_count) +#define SF_WRITE_WAIT_TIME (subfiling_stats[WRITE_WAIT].total / (double)subfiling_stats[WRITE_WAIT].op_count) +#define SF_READ_OPS (subfiling_stats[READ_STAT].op_count) +#define SF_READ_TIME (subfiling_stats[READ_STAT].total / (double)subfiling_stats[READ_STAT].op_count) +#define SF_READ_WAIT_TIME (subfiling_stats[READ_WAIT].total / (double)subfiling_stats[READ_WAIT].op_count) +#define SF_QUEUE_DELAYS (subfiling_stats[QUEUE_STAT].total) + +#define SF_ALIGNMENT 8 + +static void +maybe_initialize_statistics(void) +{ + memset(subfiling_stats, 0, sizeof(subfiling_stats)); +} + +static void clear_fid_map_entry(uint64_t sf_fid); + +/* +========================================= +Public functions +========================================= +*/ + +/* +------------------------------------------------------------------------- + Programmer: Richard Warren + Purpose: Return a pointer to the requested storage object. + There are only 2 object types: TOPOLOGY or CONTEXT + structures. An object_id contains the object type + in upper 32 bits and an index value in the lower 32 bits. + Storage for an object is allocated as required. + + Topologies are static, i.e. for any one IO Concentrator + allocation strategy, the results should always be the + same. + FIXME: The one exception to this being the 1 IOC per + N MPI ranks. The value of N can be changed on a per-file + basis, so we need address that at some point. + + Contexts are 1 per open file. If only one file is open + at a time, then we will only use a single context cache + entry. + Errors: returns NULL if input SF_OBJ_TYPE is unrecognized or + a memory allocation error. 
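+
+  Example:  assuming the SF_CONTEXT type tag, the id that names
+            context-cache slot 2 would be constructed as:
+
+              int64_t object_id = ((int64_t)SF_CONTEXT << 32) | 2;
+              subfiling_context_t *ctx = get__subfiling_object(object_id);
+
+            (slot index in the low 32 bits, object type in the high bits)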
+
+  Revision History -- Initial implementation
+-------------------------------------------------------------------------
+*/
+void *
+get__subfiling_object(int64_t object_id)
+{
+    int obj_type = (int)((object_id >> 32) & 0x0FFFF);
+    /* We don't require a large indexing space;
+     * 16 bits should be enough.
+     */
+    size_t index = (object_id & 0x0FFFF);
+    if (obj_type == SF_TOPOLOGY) {
+        /* We will likely only cache a single topology
+         * which is that of the original parallel application.
+         * In that context, we will identify the number of
+         * nodes along with the number of MPI ranks on a node.
+         */
+        if (sf_topology_cache == NULL) {
+            sf_topology_cache = (sf_topology_t *)calloc(sf_topology_limit, sizeof(sf_topology_t));
+            assert(sf_topology_cache != NULL);
+        }
+        if (index < sf_topology_limit) {
+            return (void *)&sf_topology_cache[index];
+        }
+        else {
+            HDputs("Illegal topology object index");
+        }
+    }
+    else if (obj_type == SF_CONTEXT) {
+        /* Contexts provide information principally about
+         * the application and how the data layout is managed
+         * over some number of sub-files.  The important
+         * parameters are the number of subfiles (or in the
+         * context of IOCs, the MPI ranks and counts of the
+         * processes which host an IO Concentrator.  We
+         * also provide a map of IOC rank to MPI rank
+         * to facilitate the communication of IO requests.
+         */
+        if (sf_context_cache == NULL) {
+            sf_context_cache = (subfiling_context_t *)calloc(sf_context_limit, sizeof(subfiling_context_t));
+            assert(sf_context_cache != NULL);
+        }
+        if (index == sf_context_limit) {
+            /* Grow the cache and hand back the newly valid entry.
+             * Note that realloc does not zero the added storage,
+             * so clear the new half before use.
+             */
+            sf_context_limit *= 2;
+            sf_context_cache  = (subfiling_context_t *)realloc(
+                sf_context_cache, sf_context_limit * sizeof(subfiling_context_t));
+            assert(sf_context_cache != NULL);
+            memset(&sf_context_cache[index], 0, (sf_context_limit / 2) * sizeof(subfiling_context_t));
+            return (void *)&sf_context_cache[index];
+        }
+        else {
+            return (void *)&sf_context_cache[index];
+        }
+    }
+    else {
+        printf("get__subfiling_object: UNKNOWN Subfiling object type id = 0x%lx\n", object_id);
+    }
+    return NULL;
+} /* end get__subfiling_object() */
+
+/*-------------------------------------------------------------------------
+ * Function:    UTILITY FUNCTIONS:
+ *              delete_subfiling_context - removes a context entry in the
+ *                                         object cache.  Free communicators
+ *                                         and zero other structure fields.
+ *
+ * Return:      none
+ * Errors:      none
+ *
+ * Programmer:  Richard Warren
+ *
+ * Changes:     Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
+void
+delete_subfiling_context(hid_t context_id)
+{
+    subfiling_context_t *sf_context = get__subfiling_object(context_id);
+    if (sf_context) {
+        if (sf_context->topology->n_io_concentrators > 1) {
+            if (sf_context->sf_group_comm != MPI_COMM_NULL) {
+                MPI_Comm_free(&sf_context->sf_group_comm);
+            }
+            if (sf_context->sf_intercomm != MPI_COMM_NULL) {
+                MPI_Comm_free(&sf_context->sf_intercomm);
+            }
+        }
+        /* free(sf_context); */
+    }
+
+    return;
+}
+
+/*
+======================================================
+Public vars (for subfiling) and functions
+We probably need a function to set and clear this
+======================================================
+*/
+int sf_verbose_flag    = 0;
+int sf_open_file_count = 0;
+
+/*-------------------------------------------------------------------------
+ * Function:    Public/Client set_verbose_flag
+ *
+ * Purpose:     For debugging purposes, we allow a verbose setting that
+ *              directs relevant information to an IOC-specific log file;
+ *              the file is opened when the flag is enabled
+ *              and closed when the verbose setting is disabled.
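+ *              Log files are named "ioc_<subfile_rank>.log" and are
+ *              opened in append mode when more than one file is
+ *              currently open.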
+ * + * Return: None + * Errors: None + * + * Programmer: Richard Warren + * + * Changes: Initial Version/None. + *------------------------------------------------------------------------- + */ +void +set_verbose_flag(int subfile_rank, int new_value) +{ +#ifndef NDEBUG + sf_verbose_flag = (int)(new_value & 0x0FF); + if (sf_verbose_flag) { + char logname[64]; + sprintf(logname, "ioc_%d.log", subfile_rank); + if (sf_open_file_count > 1) + sf_logfile = fopen(logname, "a+"); + else + sf_logfile = fopen(logname, "w+"); + } + else if (sf_logfile) { + fclose(sf_logfile); + sf_logfile = NULL; + } + +#endif + return; +} + +/*------------------------------------------------------------------------- + * Function: record_fid_to_subfile + * + * Purpose: Every opened HDF5 file will have (if utilizing subfiling) + * a subfiling context associated with it. It is important that + * the HDF5 file index is a constant rather than utilizing a + * posix file handle since files can be opened multiple times + * and with each file open, a new file handle will be assigned. + * Note that in such a case, the actual filesystem id will be + * retained. + * + * We utilize that filesystem id (ino_t inode) so that + * irrespective of what process opens a common file, the + * subfiling system will generate a consistent context for this + * file across all parallel ranks. + * + * This function simply records the filesystem handle to + * subfiling context mapping. + * + * Return: SUCCEED or FAIL. + * Errors: FAILs ONLY if storage for the mapping entry cannot + * be allocated. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. + * + *------------------------------------------------------------------------- + */ +static herr_t +record_fid_to_subfile(uint64_t fid, hid_t subfile_context_id, int *next_index) +{ + herr_t status = SUCCEED; + int index; + if (sf_file_map_size == 0) { + int i; + sf_open_file_map = + (file_map_to_context_t *)malloc((size_t)DEFAULT_MAP_ENTRIES * sizeof(file_map_to_context_t)); + if (sf_open_file_map == NULL) { + perror("malloc"); + return FAIL; + } + sf_file_map_size = DEFAULT_MAP_ENTRIES; + for (i = 0; i < sf_file_map_size; i++) { + sf_open_file_map[i].h5_file_id = (uint64_t)H5I_INVALID_HID; + sf_open_file_map[i].sf_context_id = 0; + } + } + for (index = 0; index < sf_file_map_size; index++) { + if (sf_open_file_map[index].h5_file_id == (uint64_t)H5I_INVALID_HID) { + sf_open_file_map[index].h5_file_id = fid; + sf_open_file_map[index].sf_context_id = subfile_context_id; + + if (next_index) { + *next_index = index; + } + return status; + } + } + if (index == sf_file_map_size) { + int i; + sf_open_file_map = + realloc(sf_open_file_map, ((size_t)(sf_file_map_size * 2) * sizeof(file_map_to_context_t))); + if (sf_open_file_map == NULL) { + perror("realloc"); + return FAIL; + } + sf_file_map_size *= 2; + for (i = index; i < sf_file_map_size; i++) { + sf_open_file_map[i].h5_file_id = (uint64_t)H5I_INVALID_HID; + } + + if (next_index) { + *next_index = index; + } + + sf_open_file_map[index].h5_file_id = fid; + sf_open_file_map[index++].sf_context_id = subfile_context_id; + } + return status; +} /* end record_fid_to_subfile() */ + +/*------------------------------------------------------------------------- + * Function: Internal open_subfile_with_context + * + * Purpose: While we cannot know a priori, whether an HDF client will + * need to access data across the entirety of a file, e.g. 
+ *              an individual MPI rank may read or write only small
+ *              segments of the entire file space), this function sends
+ *              a file OPEN_OP to every IO concentrator.
+ *
+ *              Prior to opening any subfiles, H5FDopen will have
+ *              created an HDF5 file with the user-specified naming.
+ *              A path prefix will be selected and is available as
+ *              an input argument.
+ *
+ *              The opened HDF5 file handle will contain device and
+ *              inode values, these being constant for all processes
+ *              opening the shared file. The inode value is utilized
+ *              as a key value and is associated with the sf_context
+ *              which we receive as one of the input arguments.
+ *
+ *              IO Concentrator threads will be initialized on MPI ranks
+ *              which have been identified via application topology
+ *              discovery. The number and mapping of IOC to MPI_rank
+ *              is part of the sf_context->topology structure.
+ *
+ * Return:      Success (0) or Failure (non-zero)
+ * Errors:      If MPI operations fail for some reason.
+ *
+ * Programmer:  Richard Warren
+ *              7/17/2020
+ *
+ * Changes:     Initial Version/None.
+ *-------------------------------------------------------------------------
+ */
+int
+open_subfile_with_context(subfiling_context_t *sf_context, uint64_t fid, int flags)
+{
+    int    ret;
+    int    g_errors = 0;
+    int    l_errors = 0;
+    double start_t  = MPI_Wtime();
+    assert(sf_context != NULL);
+
+#ifdef VERBOSE
+    printf("[%s %d]: context_id=%ld\n", __func__, sf_context->topology->app_layout->world_rank,
+           sf_context->sf_context_id);
+#endif
+
+    /*
+     * Save the HDF5 file id (fid) to subfile context mapping.
+     * There shouldn't be any issue, but check the status and
+     * return if there was a problem.
+     */
+
+    ret = record_fid_to_subfile(fid, sf_context->sf_context_id, NULL);
+    if (ret != SUCCEED) {
+        printf("[%d - %s] Error mapping hdf5 file to a subfiling context\n",
+               sf_context->topology->app_layout->world_rank, __func__);
+        return -1;
+    }
+
+    if (sf_context->topology->rank_is_ioc) {
+        sf_work_request_t msg = {{flags, (int64_t)fid, sf_context->sf_context_id},
+                                 OPEN_OP,
+                                 sf_context->topology->app_layout->world_rank,
+                                 sf_context->topology->subfile_rank,
+                                 sf_context->sf_context_id,
+                                 start_t,
+                                 NULL,
+                                 0,
+                                 0,
+                                 0,
+                                 0};
+
+        if (flags & O_CREAT) {
+            sf_context->sf_fid = -2;
+        }
+
+        l_errors = subfiling_open_file(&msg, sf_context->topology->subfile_rank, flags);
+    }
+
+    g_errors = l_errors;
+
+    MPI_Allreduce(&l_errors, &g_errors, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+    if (g_errors) {
+        printf("[%s %d]: error count = %d l_errors=%d\n", __func__,
+               sf_context->topology->app_layout->world_rank, g_errors, l_errors);
+        fflush(stdout);
+    }
+    return g_errors;
+} /* end open_subfile_with_context() */
+
+/*-------------------------------------------------------------------------
+ * Function:    Internal close__subfiles
+ *
+ * Purpose:     When closing an HDF5 file, we need to close any associated
+ *              subfiles as well. This function cycles through all known
+ *              IO Concentrators to send a file CLOSE_OP command.
+ *
+ *              This function is collective across all MPI ranks which
+ *              have opened the HDF5 file associated with the provided
+ *              sf_context. Once the request has been issued by all
+ *              ranks, the subfile at each IOC will be closed and a
+ *              completion ACK will be received.
+ *
+ *              Once the subfiles are closed, we initiate a teardown of
+ *              the IOC and associated thread_pool threads.
+ *
+ * Return:      Success (0) or Failure (non-zero)
+ * Errors:      If MPI operations fail for some reason.
+ *
+ * Programmer:  Richard Warren
+ *              7/17/2020
+ *
+ * Changes:     Initial Version/None.
+ *-------------------------------------------------------------------------
+ */
+static int
+close__subfiles(subfiling_context_t *sf_context, uint64_t fid)
+{
+    int    global_errors = 0, errors = 0;
+    int    file_open_count;
+    int    subfile_fid = 0;
+    double t0 = 0.0, t1 = 0.0, t2 = 0.0;
+    double t_main_exit = 0.0, t_finalize_threads = 0.0;
+
+    HDassert((sf_context != NULL));
+    t0 = MPI_Wtime();
+
+/* TODO: can't use comm world here -- must use communicator set in the file open */
+/* Addendum: As mentioned earlier, it may be appropriate to copy the supplied
+ * communicator and use the copy here.
+ */
+//#if MPI_VERSION >= 3 && MPI_SUBVERSION >= 1
+#if 0 /* JRM */ /* Just use regular barrier */
+    MPI_Request b_req      = MPI_REQUEST_NULL;
+    int         mpi_status = MPI_Ibarrier(MPI_COMM_WORLD, &b_req);
+    if (mpi_status == MPI_SUCCESS) {
+        int completed = 0;
+        while (!completed) {
+            useconds_t t_delay = 5;
+            usleep(t_delay);
+            mpi_status = MPI_Test(&b_req, &completed, MPI_STATUS_IGNORE);
+            if (mpi_status != MPI_SUCCESS)
+                completed = 1;
+        }
+    }
+#else
+#if 0 /* JRM */ /* delete this eventually */
+    HDfprintf(stdout, "\n\nclose__subfiles: entering initial barrier.\n\n");
+    HDfflush(stdout);
+#endif /* JRM */ /* delete this eventually */
+
+    if (MPI_Barrier(MPI_COMM_WORLD) != MPI_SUCCESS) {
+
+        HDfprintf(stdout, "close__subfiles: entering barrier failed.\n");
+        HDfflush(stdout);
+    }
+#endif
+
+    /* We make the subfile close operation collective.
+     * Otherwise, there may be a race condition between
+     * our closing the subfiles and the user application
+     * moving ahead and possibly re-opening a file.
+     *
+     * If we can, we utilize an async barrier which gives
+     * us the opportunity to reduce the CPU load due to
+     * MPI spinning while waiting for the barrier to
+     * complete. This is especially important if there
+     * is heavy thread utilization due to subfiling
+     * activities, i.e. the thread pool might be
+     * extremely busy servicing IO requests from all
+     * HDF5 application ranks.
+     */
+    /* The map from fid to context can now be cleared */
+    clear_fid_map_entry(fid);
+
+    if (sf_context->topology->rank_is_ioc) {
+        file_open_count = atomic_load(&sf_file_open_count);
+        atomic_fetch_sub(&sf_file_open_count, 1);
+
+        /* If there's only a single file that is
+         * currently open, we can shut down the IO concentrator
+         * as part of the file close.
+         */
+#if 0 /* JRM */ /* delete this if all goes well */
+        if (file_open_count == 1) {
+            /* Shutdown the main IOC thread */
+            H5FD_ioc_set_shutdown_flag(1);
+            /* Allow ioc_main to exit.*/
+            usleep(20);
+
+            t1 = MPI_Wtime();
+            H5FD_ioc_wait_thread_main();
+            t2 = MPI_Wtime();
+            t_main_exit = t2 - t1;
+            t1 = t2;
+            H5FD_ioc_finalize_threads();
+
+            t2 = MPI_Wtime();
+        }
+#else  /* JRM */
+        if (file_open_count == 1) {
+
+            HDassert(0 == atomic_load(&sf_shutdown_flag));
+
+            /* Shutdown the main IOC thread */
+            atomic_init(&sf_shutdown_flag, 1);
+
+            /* Allow ioc_main to exit.*/
+            do {
+
+                usleep(20);
+
+            } while (0 != atomic_load(&sf_shutdown_flag));
+
+            t1 = MPI_Wtime();
+            H5FD_ioc_wait_thread_main();
+            t2 = MPI_Wtime();
+            /* Record the main thread exit time before resetting t1 for
+             * the thread pool teardown measurement below.  (Previously
+             * t1 was overwritten first, which always produced zero.)
+             */
+            t_main_exit = t2 - t1;
+            t1          = t2;
+
+            H5FD_ioc_take_down_thread_pool();
+
+            t2 = MPI_Wtime();
+        }
+
+#endif /* JRM */
+
+        t_finalize_threads = t2 - t1;
+
+        if ((subfile_fid = sf_context->sf_fid) > 0) {
+            if (HDclose(subfile_fid) < 0) {
+                perror("close(subfile_fid)");
+                errors++;
+            }
+            else {
+                sf_context->sf_fid = -1;
+            }
+        }
+
+#ifndef NDEBUG
+        /* FIXME: If we've had multiple files open, our statistics
+         * will be messed up!
+         */
+        if (sf_verbose_flag) {
+            t1 = t2;
+            if (sf_logfile != NULL) {
+                fprintf(sf_logfile, "[%d] main_exit=%lf, finalize_threads=%lf\n", sf_context->sf_group_rank,
+                        t_main_exit, t_finalize_threads);
+                if (SF_WRITE_OPS > 0)
+                    fprintf(sf_logfile,
+                            "[%d] pwrite perf: wrt_ops=%ld wait=%lf pwrite=%lf IOC_shutdown = %lf seconds\n",
+                            sf_context->sf_group_rank, SF_WRITE_OPS, SF_WRITE_WAIT_TIME, SF_WRITE_TIME,
+                            (t1 - t0));
+                if (SF_READ_OPS > 0)
+                    fprintf(sf_logfile,
+                            "[%d] pread perf: read_ops=%ld wait=%lf pread=%lf IOC_shutdown = %lf seconds\n",
+                            sf_context->sf_group_rank, SF_READ_OPS, SF_READ_WAIT_TIME, SF_READ_TIME,
+                            (t1 - t0));
+
+                fprintf(sf_logfile, "[%d] Avg queue time=%lf seconds\n", sf_context->sf_group_rank,
+                        SF_QUEUE_DELAYS / (double)(SF_WRITE_OPS + SF_READ_OPS));
+
+                fflush(sf_logfile);
+
+                fclose(sf_logfile);
+                sf_logfile = NULL;
+            }
+        }
+
+#endif
+    }
+
+    /* Run another barrier to prevent some ranks from running ahead
+     * and opening another file before this file is completely closed
+     * down.
+     *
+     * Note that we shouldn't be using MPI_COMM_WORLD in the barrier
+     * below -- it should either be the communicator the user gave us
+     * when opening the file, or possibly a copy of same.
+     *
+     *                                        JRM -- 11/29/21
+     */
+
+#if 0 /* JRM */ /* delete this eventually */
+    HDfprintf(stdout, "\n\nclose__subfiles: entering closing barrier.\n\n");
+    HDfflush(stdout);
+#endif /* JRM */ /* delete this eventually */
+
+    if (MPI_Barrier(MPI_COMM_WORLD) != MPI_SUCCESS) {
+
+        HDfprintf(stdout, "close__subfiles: exiting barrier failed.\n");
+        HDfflush(stdout);
+    }
+
+    if (sf_context->h5_filename) {
+        free(sf_context->h5_filename);
+        sf_context->h5_filename = NULL;
+    }
+    if (sf_context->subfile_prefix) {
+        free(sf_context->subfile_prefix);
+        sf_context->subfile_prefix = NULL;
+    }
+
+    MPI_Allreduce(&errors, &global_errors, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+
+#ifndef NDEBUG
+    if (sf_verbose_flag) {
+        if (client_log != NULL) {
+            fclose(client_log);
+            client_log = NULL;
+        }
+    }
+#endif
+    return global_errors;
+} /* end close__subfiles() */
+
+#define MIN_RETRIES 10
+/*
+======================================================
+File functions
+
+The pread and pwrite POSIX functions are described as
+being thread-safe.
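+
+Note that, like read and write, pread and pwrite may transfer fewer
+bytes than requested; the helpers below therefore loop until the full
+request has been satisfied or an error (or EOF retry timeout) occurs.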
+======================================================
+*/
+
+int
+sf_read_data(int fd, int64_t file_offset, void *data_buffer, int64_t data_size, int subfile_rank)
+{
+    int        ret     = 0;
+    int        retries = MIN_RETRIES;
+    useconds_t delay   = 100;
+    ssize_t    bytes_read;
+    ssize_t    bytes_remaining = (ssize_t)data_size;
+    char *     this_buffer     = data_buffer;
+
+    while (bytes_remaining) {
+        if ((bytes_read = (ssize_t)pread(fd, this_buffer, (size_t)bytes_remaining, file_offset)) < 0) {
+
+            perror("pread failed!");
+            HDprintf("[ioc(%d) %s] pread(fd, buf, bytes_remaining=%ld, "
+                     "file_offset =%ld)\n",
+                     subfile_rank, __func__, bytes_remaining, file_offset);
+            HDfflush(stdout);
+            return -1;
+        }
+        else if (bytes_read > 0) {
+            /* reset retry params */
+            retries = MIN_RETRIES;
+            delay   = 100;
+            bytes_remaining -= bytes_read;
+#ifdef VERBOSE
+            printf("[ioc(%d) %s]: read %ld bytes, remaining=%ld, file_offset=%ld\n", subfile_rank, __func__,
+                   bytes_read, bytes_remaining, file_offset);
+            fflush(stdout);
+#endif
+            this_buffer += bytes_read;
+            file_offset += bytes_read;
+        }
+        else {
+            if (retries == 0) {
+#ifdef VERBOSE
+                printf("[ioc(%d) %s] TIMEOUT: file_offset=%ld, data_size=%ld\n", subfile_rank, __func__,
+                       file_offset, data_size);
+                printf("[ioc(%d) %s] ERROR! read of 0 bytes == eof!\n", subfile_rank, __func__);
+
+                fflush(stdout);
+#endif
+                return -2;
+            }
+            retries--;
+            usleep(delay);
+            delay *= 2;
+        }
+    }
+    return ret;
+} /* end sf_read_data() */
+
+int
+sf_write_data(int fd, int64_t file_offset, void *data_buffer, int64_t data_size, int subfile_rank)
+{
+    int     ret             = 0;
+    char *  this_data       = (char *)data_buffer;
+    ssize_t bytes_remaining = (ssize_t)data_size;
+    ssize_t written         = 0;
+    while (bytes_remaining) {
+        if ((written = pwrite(fd, this_data, (size_t)bytes_remaining, file_offset)) < 0) {
+            int         saved_errno = errno;
+            struct stat statbuf;
+            perror("pwrite failed!");
+            HDprintf("\nerrno = %d (%s)\n\n", saved_errno, strerror(saved_errno));
+            fstat(fd, &statbuf);
+            HDprintf("[ioc(%d) %s] pwrite(fd, data, bytes_remaining=%ld, "
+                     "file_offset=%ld), fd=%d, st_size=%ld\n",
+                     subfile_rank, __func__, bytes_remaining, file_offset, fd, statbuf.st_size);
+            HDfflush(stdout);
+            return -1;
+        }
+        else {
+            bytes_remaining -= written;
+#ifdef VERBOSE
+            printf("[ioc(%d) %s]: wrote %ld bytes, remaining=%ld, file_offset=%ld\n", subfile_rank, __func__,
+                   written, bytes_remaining, file_offset);
+            fflush(stdout);
+#endif
+            this_data += written;
+            file_offset += written;
+        }
+    }
+    /* We don't usually use this for each file write. We usually do the
+     * file flush as part of the file close operation.
+     */
+#ifdef SUBFILE_REQUIRE_FLUSH
+    fdatasync(fd);
+#endif
+    return ret;
+} /* end sf_write_data() */
+
+int
+sf_truncate(int fd, int64_t length, int subfile_rank)
+{
+    int ret = 0;
+
+    if (HDftruncate(fd, (off_t)length) != 0) {
+
+        HDfprintf(stdout, "ftruncate failed on subfile rank %d. errno = %d (%s)\n", subfile_rank, errno,
+                  strerror(errno));
+        fflush(stdout);
+        ret = -1;
+    }
+
+#ifdef VERBOSE
+    HDprintf("[ioc(%d) %s]: truncated subfile to %lld bytes. ret = %d\n", subfile_rank, __func__,
+             (long long)length, ret);
+    HDfflush(stdout);
+#endif
+
+    return ret;
+} /* end sf_truncate() */
+
+/*
+ * ---------------------------------------------------
+ * Topology discovery related functions for choosing
+ * IO Concentrator (IOC) ranks.
+ * Currently, the default approach for assigning an IOC
+ * is to select the lowest MPI rank on each node.
+ *
+ * The approach collectively generates N tuples
+ * consisting of the MPI rank and hostid.
This
+ * collection is then sorted by hostid and scanned
+ * to identify the IOC ranks.
+ *
+ * As time permits, additional assignment methods will
+ * be implemented, e.g. 1-per-Nranks or via a config
+ * option. Additional selection methodologies can
+ * be included as users get more experience using the
+ * subfiling implementation.
+ * ---------------------------------------------------
+ */
+
+/*-------------------------------------------------------------------------
+ * Function:    compare_hostid
+ *
+ * Purpose:     qsort sorting function.
+ *              Compares tuples of 'layout_t'. The sorting is based on
+ *              the long hostid values.
+ *
+ * Return:      -1, 0, or 1, depending on whether hostid1 is less than,
+ *              equal to, or greater than hostid2 (as qsort requires).
+ *
+ * Programmer:  Richard Warren
+ *              7/17/2020
+ *
+ * Changes:     Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
+static int
+compare_hostid(const void *h1, const void *h2)
+{
+    const layout_t *host1 = (const layout_t *)h1;
+    const layout_t *host2 = (const layout_t *)h2;
+    /* Return a proper three-way comparison; returning only the result
+     * of '>' reports "equal" for the less-than case and is not a valid
+     * qsort comparator.
+     */
+    if (host1->hostid == host2->hostid)
+        return 0;
+    return (host1->hostid > host2->hostid) ? 1 : -1;
+}
+
+/*-------------------------------------------------------------------------
+ * Function:    gather_topology_info
+ *
+ * Purpose:     Collectively generate a sorted collection of hostid+mpi_rank
+ *              tuples. The result is returned in the 'topology' field
+ *              of the sf_topology_t structure.
+ *
+ * Return:      Sorted array of hostid/mpi_rank tuples.
+ * Errors:      MPI_Abort if memory cannot be allocated.
+ *
+ * Programmer:  Richard Warren
+ *              7/17/2020
+ *
+ * Changes:     Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
+static void
+gather_topology_info(sf_topology_t *info)
+{
+    int           sf_world_size;
+    int           sf_world_rank;
+    app_layout_t *app_layout = NULL;
+
+    HDassert(info != NULL);
+    app_layout = info->app_layout;
+    HDassert(app_layout != NULL);
+
+    sf_world_size = app_layout->world_size;
+    sf_world_rank = app_layout->world_rank;
+
+    if (1) {
+        long     hostid = gethostid();
+        layout_t my_hostinfo;
+        if (app_layout->layout == NULL) {
+            app_layout->layout = (layout_t *)calloc((size_t)sf_world_size + 1, sizeof(layout_t));
+            HDassert(app_layout->layout != NULL);
+        }
+
+        app_layout->hostid                = hostid;
+        my_hostinfo.rank                  = sf_world_rank;
+        my_hostinfo.hostid                = hostid;
+        app_layout->layout[sf_world_rank] = my_hostinfo;
+        if (sf_world_size > 1) {
+            if (MPI_Allgather(&my_hostinfo, 2, MPI_LONG, app_layout->layout, 2, MPI_LONG, MPI_COMM_WORLD) ==
+                MPI_SUCCESS) {
+                qsort(app_layout->layout, (size_t)sf_world_size, sizeof(layout_t), compare_hostid);
+            }
+        }
+    }
+} /* end gather_topology_info() */
+
+/*-------------------------------------------------------------------------
+ * Function:    count_nodes
+ *
+ * Purpose:     Initializes the sorted collection of hostid+mpi_rank
+ *              tuples. After initialization, the collection is scanned
+ *              to determine the number of unique hostid entries. This
+ *              value will determine the number of actual IO concentrators
+ *              that are available to the application. A side effect is to
+ *              identify the 'node_index' of the current process.
+ *
+ * Return:      The number of unique hostid's (nodes).
+ * Errors:      MPI_Abort if memory cannot be allocated.
+ *
+ * Programmer:  Richard Warren
+ *              7/17/2020
+ *
+ * Changes:     Initial Version/None.
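+ *
+ *              As an illustrative example, with 8 ranks spread evenly
+ *              over 2 nodes, the sorted layout yields node_ranks =
+ *              {0, 4, 8} (the final entry is the world_size terminator)
+ *              and a return value of 2.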
+ *
+ *-------------------------------------------------------------------------
+ */
+static int
+count_nodes(sf_topology_t *info, int my_rank)
+{
+    int           k, node_count, hostid_index = -1;
+    app_layout_t *app_layout = NULL;
+    long          nextid;
+
+    HDassert(info != NULL);
+    app_layout = info->app_layout;
+
+    if ((node_count = app_layout->node_count) == 0)
+        gather_topology_info(info);
+
+    HDassert(app_layout->node_ranks);
+
+    nextid = app_layout->layout[0].hostid;
+    /* Possibly record my hostid_index */
+    if (app_layout->layout[0].rank == my_rank) {
+        hostid_index = 0;
+    }
+
+    app_layout->node_ranks[0] = 0; /* Add index */
+    node_count                = 1;
+
+    /* Recall that the topology array has been sorted! */
+    for (k = 1; k < app_layout->world_size; k++) {
+        /* Possibly record my hostid_index */
+        if (app_layout->layout[k].rank == my_rank)
+            hostid_index = k;
+        if (app_layout->layout[k].hostid != nextid) {
+            nextid = app_layout->layout[k].hostid;
+            /* Record the index of new hostid */
+            app_layout->node_ranks[node_count++] = k;
+        }
+    }
+
+    /* Mark the end of the node_ranks */
+    app_layout->node_ranks[node_count] = app_layout->world_size;
+    /* Save the index where we first located my hostid */
+    app_layout->node_index = hostid_index;
+    return app_layout->node_count = node_count;
+} /* end count_nodes() */
+
+/*-------------------------------------------------------------------------
+ * Function:    identify_ioc_ranks
+ *
+ * Purpose:     We've already identified the number of unique nodes and
+ *              have a sorted list of layout_t structures. Under normal
+ *              conditions, we only utilize a single IOC per node. Under
+ *              that circumstance, we only need to fill the io_concentrator
+ *              vector from the node_ranks array (which contains the index
+ *              into the layout array of the lowest MPI rank on each node);
+ *              Otherwise, while determining the number of local_peers per
+ *              node, we can also select one or more additional IOCs.
+ *
+ *              As a side effect, we fill the 'io_concentrator' vector
+ *              and set the 'rank_is_ioc' flag to TRUE if our rank is
+ *              identified as owning an IO Concentrator (IOC).
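+ *
+ *              As an illustrative example, with 2 nodes, 4 ranks per
+ *              node, and iocs_per_node == 2, the io_concentrator vector
+ *              would contain ranks {0, 1, 4, 5} (assuming ranks remain
+ *              in ascending order within each node after the sort).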
+ * + *------------------------------------------------------------------------- + */ + +static int +identify_ioc_ranks(int node_count, int iocs_per_node, sf_topology_t *info) +{ + int n; + int total_ioc_count = 0; + app_layout_t *app_layout = NULL; + HDassert(info != NULL); + app_layout = info->app_layout; + + for (n = 0; n < node_count; n++) { + int k; + int node_index = app_layout->node_ranks[n]; + int local_peer_count = app_layout->node_ranks[n + 1] - app_layout->node_ranks[n]; + info->io_concentrator[total_ioc_count++] = (int)(app_layout->layout[node_index++].rank); + + if (app_layout->layout[node_index - 1].rank == app_layout->world_rank) { + info->subfile_rank = total_ioc_count - 1; + info->rank_is_ioc = TRUE; + } + + for (k = 1; k < iocs_per_node; k++) { + if (k < local_peer_count) { + if (app_layout->layout[node_index].rank == app_layout->world_rank) { + info->rank_is_ioc = TRUE; + info->subfile_rank = total_ioc_count; + } + info->io_concentrator[total_ioc_count++] = (int)(app_layout->layout[node_index++].rank); + } + } + } + + info->n_io_concentrators = total_ioc_count; + return total_ioc_count; +} /* end identify_ioc_ranks() */ + +static inline void +assign_ioc_ranks(int *io_concentrator, int ioc_count, int rank_multiple, sf_topology_t *app_topology) +{ + app_layout_t *app_layout = NULL; + /* Validate that the input pointers are not NULL */ + HDassert(io_concentrator); + HDassert(app_topology); + HDassert((app_layout = app_topology->app_layout) != NULL); + /* fill the io_concentrator values based on the application layout */ + if (io_concentrator) { + int k, ioc_next, ioc_index; + for (k = 0, ioc_next = 0; ioc_next < ioc_count; ioc_next++) { + ioc_index = rank_multiple * k++; + io_concentrator[ioc_next] = (int)(app_layout->layout[ioc_index].rank); + if (io_concentrator[ioc_next] == app_layout->world_rank) + app_topology->rank_is_ioc = TRUE; + } + app_topology->n_io_concentrators = ioc_count; + } +} /* end assign_ioc_ranks() */ + +/*------------------------------------------------------------------------- + * Function: fid_map_to_context + * + * Purpose: This is a basic lookup function which returns the subfiling + * context id associated with the specified file->inode. + * + * Return: The Subfiling context ID if it exists. + * Errors: H5I_INVALID_HID if the inode to context map is not found. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. + * + *------------------------------------------------------------------------- + */ +hid_t +fid_map_to_context(uint64_t sf_fid) +{ + if (sf_open_file_map) { + int i; + for (i = 0; i < sf_file_map_size; i++) { + hid_t sf_context_id = sf_open_file_map[i].sf_context_id; + if (sf_open_file_map[i].h5_file_id == sf_fid) { + return sf_context_id; + } + } + } + return H5I_INVALID_HID; +} /* end fid_map_to_context() */ + +/*------------------------------------------------------------------------- + * Function: clear_fid_map_entry + * + * Purpose: Remove the map entry associated with the file->inode. + * This is done at file close. + * + * Return: None + * Errors: Cannot fail. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. 
+ *
+ *-------------------------------------------------------------------------
+ */
+static void
+clear_fid_map_entry(uint64_t sf_fid)
+{
+    if (sf_open_file_map) {
+        int i;
+        for (i = 0; i < sf_file_map_size; i++) {
+            if (sf_open_file_map[i].h5_file_id == sf_fid) {
+                sf_open_file_map[i].h5_file_id    = (uint64_t)H5I_INVALID_HID;
+                sf_open_file_map[i].sf_context_id = 0;
+                return;
+            }
+        }
+    }
+} /* end clear_fid_map_entry() */
+
+/*-------------------------------------------------------------------------
+ * Function:    active_map_entries
+ *
+ * Purpose:     Count the number of entries that have valid h5_file_id
+ *              values.
+ *
+ * Return:      The number of active map entries (can be zero).
+ * Errors:      Cannot fail.
+ *
+ * Programmer:  Richard Warren
+ *              7/17/2020
+ *
+ * Changes:     Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
+int
+active_map_entries(void)
+{
+    int i, map_entries = 0;
+    for (i = 0; i < sf_file_map_size; i++) {
+        if (sf_open_file_map[i].h5_file_id != (uint64_t)H5I_INVALID_HID) {
+            map_entries++;
+        }
+    }
+    return map_entries;
+} /* end active_map_entries() */
+
+/*-------------------------------------------------------------------------
+ * Function:    H5FD__determine_ioc_count
+ *
+ * Purpose:     Once a sorted collection of hostid/mpi_rank tuples has been
+ *              created and the number of unique hostids (nodes) has
+ *              been determined, we may modify this "default" value for
+ *              the number of IO Concentrators for this application.
+ *
+ *              The default of one(1) IO concentrator per node can be
+ *              changed (principally for testing) by an environment
+ *              variable. If H5_IOC_COUNT_PER_NODE is defined, then that
+ *              integer value is utilized as a multiplier to modify the
+ *              set of IO Concentrator ranks.
+ *
+ *              The cached results will be replicated within the
+ *              subfiling_context_t structure and are utilized as a map
+ *              from IO concentrator rank to MPI communicator rank for
+ *              message sends and receives.
+ *
+ * Return:      The number of IO Concentrator ranks. We also cache
+ *              the MPI ranks in the 'io_concentrator' vector variable.
+ *              The length of this vector is cached as 'n_io_concentrators'.
+ * Errors:      MPI_Abort if memory cannot be allocated.
+ *
+ * Programmer:  Richard Warren
+ *              7/17/2020
+ *
+ * Changes:     - Initial Version/None.
+ *              - Updated the API to allow a variety of methods for
+ *                determining the number and MPI ranks that will have
+ *                IO Concentrators. The default approach will define
+ *                a single IOC per node.
+ *
+ *-------------------------------------------------------------------------
+ */
+int
+H5FD__determine_ioc_count(int world_size, int world_rank, ioc_selection_t ioc_select_method,
+                          char *ioc_select_option, sf_topology_t **thisapp)
+{
+    int             ioc_count     = 0;
+    ioc_selection_t ioc_selection = ioc_selection_options;
+    /* Once the application layout is determined,
+     * we should be able to reuse the structure for every
+     * file open.
+     */
+    app_layout_t * app_layout   = sf_app_layout;
+    sf_topology_t *app_topology = NULL;
+
+    HDassert(thisapp != NULL);
+
+    if (thisapp) {
+        int   rank_multiple   = 0;
+        int   iocs_per_node   = 1;
+        char *envValue        = NULL;
+        int * io_concentrator = NULL;
+
+        if ((app_topology = *thisapp) == NULL) {
+            app_topology = (sf_topology_t *)HDmalloc(sizeof(sf_topology_t));
+            HDassert(app_topology != NULL);
+            memset(app_topology, 0, sizeof(sf_topology_t));
+        }
+        if (app_layout == NULL) {
+            /* do a single allocation to encompass the app_layout_t
+             * and all of its elements (layout and node_ranks).
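+             * The app_layout_t header is followed in the same block by
+             * the node_ranks array and then the layout array; see the
+             * pointer fix-ups just below the allocation.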
+             */
+            size_t node_rank_size = sizeof(int) * (size_t)((world_size + 1));
+            size_t layout_size    = sizeof(layout_t) * (size_t)((world_size + 1));
+            size_t alloc_size     = sizeof(app_layout_t) + node_rank_size + layout_size;
+            app_layout            = (app_layout_t *)HDmalloc(alloc_size);
+            HDassert(app_layout != NULL);
+            HDmemset(app_layout, 0, alloc_size);
+            app_layout->node_ranks = (int *)&app_layout[1];
+            app_layout->layout     = (layout_t *)&app_layout->node_ranks[world_size + 2];
+        }
+
+        /* Once the application layout has been filled in, subsequent
+         * file open operations won't need to gather that information
+         * again.
+         */
+        app_topology->app_layout = app_layout;
+        app_layout->world_size   = world_size;
+        app_layout->world_rank   = world_rank;
+        if (app_topology->io_concentrator == NULL) {
+            app_topology->io_concentrator = (int *)HDcalloc((size_t)world_size, sizeof(int));
+        }
+        /* Pick up the (possibly pre-existing) vector so that the
+         * assertion below also holds when the topology is reused.
+         */
+        io_concentrator = app_topology->io_concentrator;
+        HDassert(io_concentrator != NULL);
+        app_topology->selection_type = ioc_selection = ioc_select_method;
+
+        if (ioc_select_method == SELECT_IOC_WITH_CONFIG) {
+            HDputs("SELECT_IOC_WITH_CONFIG: not supported yet...");
+            ioc_select_method = SELECT_IOC_ONE_PER_NODE;
+            goto next;
+        }
+        if (ioc_select_method == SELECT_IOC_TOTAL) {
+            if (ioc_select_option) {
+                int checkValue = atoi(ioc_select_option);
+                if ((checkValue <= 0) || (checkValue >= world_size)) {
+                    ioc_select_method = SELECT_IOC_ONE_PER_NODE;
+                    goto next;
+                }
+
+                ioc_count     = checkValue;
+                rank_multiple = (world_size / checkValue);
+                assign_ioc_ranks(io_concentrator, ioc_count, rank_multiple, app_topology);
+                *thisapp = app_topology;
+            }
+            else {
+                HDputs("Missing option argument!");
+                ioc_select_method = SELECT_IOC_ONE_PER_NODE;
+                goto next;
+            }
+        }
+        if (ioc_select_method == SELECT_IOC_EVERY_NTH_RANK) {
+            /* This is similar to the previous method (SELECT_IOC_TOTAL)
+             * in that the user chooses a rank multiple rather than an
+             * absolute number of IO Concentrators. Unlike the former,
+             * we always start our selection with rank zero (0) and
+             * then apply the stride to identify other IOCs.
+             */
+            if (ioc_select_option) {
+                int checkValue = atoi(ioc_select_option);
+                if (checkValue == 0) { /* Error */
+                    ioc_select_method = SELECT_IOC_ONE_PER_NODE;
+                    goto next;
+                }
+                rank_multiple = checkValue;
+                ioc_count     = (world_size / rank_multiple);
+
+                if ((world_size % rank_multiple) != 0) {
+                    ioc_count++;
+                }
+
+                assign_ioc_ranks(io_concentrator, ioc_count, rank_multiple, app_topology);
+                *thisapp = app_topology;
+            }
+            else {
+                HDputs("Missing option argument!");
+                ioc_select_method = SELECT_IOC_ONE_PER_NODE;
+            }
+        }
+
+next:
+
+        if (ioc_select_method == SELECT_IOC_ONE_PER_NODE) {
+            app_topology->selection_type = ioc_select_method;
+            app_topology->app_layout     = app_layout;
+            sf_app_layout                = app_layout;
+            ioc_count                    = count_nodes(app_topology, world_rank);
+
+            if ((envValue = HDgetenv("H5_IOC_COUNT_PER_NODE")) != NULL) {
+                int value_check = atoi(envValue);
+                if (value_check > 0) {
+                    iocs_per_node = value_check;
+                }
+            }
+            ioc_count = identify_ioc_ranks(ioc_count, iocs_per_node, app_topology);
+        }
+
+        if (ioc_count > 0) {
+            app_topology->n_io_concentrators = ioc_count;
+            /* Create a vector of "potential" file descriptors
+             * which can be indexed by the IOC id.
+             */
+            app_topology->subfile_fd = (int *)HDcalloc((size_t)ioc_count, sizeof(int));
+            if (app_topology->subfile_fd == NULL) {
+                HDputs("Failed to allocate vector of subfile fds");
+            }
+            *thisapp = app_topology;
+        }
+    }
+    else {
+        HDputs("Unable to create app_topology");
+    }
+
+#if 0 /* JRM */
+    HDfprintf(stdout, "\n\nH5FD__determine_ioc_count: ioc_count = %d \n\n", ioc_count);
+    HDfflush(stdout);
+#endif /* JRM */
+
+    return ioc_count;
+} /* end H5FD__determine_ioc_count() */
+
+/*
+-------------------------------------------------------------------------
+  Programmer:  Richard Warren
+  Purpose:     Return a character string which represents either the
+               default selection method: SELECT_IOC_ONE_PER_NODE; or
+               if the user has selected a method via the environment
+               variable (H5_IOC_SELECTION_CRITERIA), we return that
+               along with any optional qualifier for that method.
+
+  Errors:      None.
+
+  Revision History -- Initial implementation
+-------------------------------------------------------------------------
+*/
+char *
+get_ioc_selection_criteria(ioc_selection_t *selection)
+{
+    char *optValue = NULL;
+    char *envValue = HDgetenv("H5_IOC_SELECTION_CRITERIA");
+
+    /* For non-default options, the environment variable
+     * should have the following form:  integer:[integer|string]
+     * In particular, EveryNthRank == 1:64 (i.e. every 64 ranks assign
+     * an IOC) or WithConfig == 2:/
+     */
+    if (envValue && (optValue = strchr(envValue, ':'))) {
+        *optValue++ = 0;
+    }
+    if (envValue) {
+        int checkValue = atoi(envValue);
+        if ((checkValue < 0) || (checkValue >= ioc_selection_options)) {
+            *selection = SELECT_IOC_ONE_PER_NODE;
+            return NULL;
+        }
+        else {
+            *selection = (ioc_selection_t)checkValue;
+            return optValue;
+        }
+    }
+    *selection = SELECT_IOC_ONE_PER_NODE;
+    return NULL;
+} /* end get_ioc_selection_criteria() */
+
+/*-------------------------------------------------------------------------
+ * Function:    H5FD__init_subfile_context
+ *
+ * Purpose:     Called as part of the HDF5 file + subfiling opening.
+ *              This initializes the subfiling context and associates
+ *              this context with the specific HDF5 file.
+ *
+ * Return:      Success (0) or Failure (-1)
+ * Errors:      If MPI operations fail for some reason.
+ *
+ * Programmer:  Richard Warren
+ *              7/17/2020
+ *
+ * Changes:     Initial Version/None.
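+ *
+ * Note:        As an illustrative example, setting the environment
+ *              variable H5_IOC_STRIPE_SIZE=1048576 selects a 1 MiB
+ *              stripe size, and H5_IOC_SUBFILE_PREFIX selects the
+ *              directory that receives the subfiles.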
+ *-------------------------------------------------------------------------
+ */
+int
+H5FD__init_subfile_context(sf_topology_t *thisApp, int n_iocs, int world_rank,
+                           subfiling_context_t *newContext)
+{
+    MPI_Comm sf_msg_comm  = MPI_COMM_NULL;
+    MPI_Comm sf_data_comm = MPI_COMM_NULL;
+
+    assert(newContext != NULL);
+    if (newContext->topology == NULL) {
+        int   status;
+        char *envValue = NULL;
+
+        newContext->topology       = thisApp;
+        newContext->sf_msg_comm    = MPI_COMM_NULL;
+        newContext->sf_data_comm   = MPI_COMM_NULL;
+        newContext->sf_group_comm  = MPI_COMM_NULL;
+        newContext->sf_intercomm   = MPI_COMM_NULL;
+        newContext->sf_stripe_size = H5FD_DEFAULT_STRIPE_DEPTH;
+        newContext->sf_write_count = 0;
+        newContext->sf_read_count  = 0;
+        newContext->sf_eof         = 0;
+
+        if ((envValue = HDgetenv("H5_IOC_STRIPE_SIZE")) != NULL) {
+            long value_check = atol(envValue);
+            if (value_check > 0) {
+                newContext->sf_stripe_size = (int64_t)value_check;
+            }
+        }
+        if ((envValue = HDgetenv("H5_IOC_SUBFILE_PREFIX")) != NULL) {
+            /* Duplicate the environment value directly; copying it
+             * through a fixed-size intermediate buffer risked a
+             * buffer overflow for long prefixes.
+             */
+            newContext->subfile_prefix = strdup(envValue);
+            /* sf_subfile_prefix = strdup(temp); */
+        }
+
+        newContext->sf_blocksize_per_stripe = newContext->sf_stripe_size * n_iocs;
+        if (sf_msg_comm == MPI_COMM_NULL) {
+            status = MPI_Comm_dup(MPI_COMM_WORLD, &newContext->sf_msg_comm);
+            if (status != MPI_SUCCESS)
+                goto err_exit;
+            status = MPI_Comm_set_errhandler(newContext->sf_msg_comm, MPI_ERRORS_RETURN);
+            if (status != MPI_SUCCESS)
+                goto err_exit;
+            sf_msg_comm = newContext->sf_msg_comm;
+        }
+        if (sf_data_comm == MPI_COMM_NULL) {
+            status = MPI_Comm_dup(MPI_COMM_WORLD, &newContext->sf_data_comm);
+            if (status != MPI_SUCCESS)
+                goto err_exit;
+            status = MPI_Comm_set_errhandler(newContext->sf_data_comm, MPI_ERRORS_RETURN);
+            if (status != MPI_SUCCESS)
+                goto err_exit;
+            sf_data_comm = newContext->sf_data_comm;
+        }
+        if (n_iocs > 1) {
+            status =
+                MPI_Comm_split(MPI_COMM_WORLD, thisApp->rank_is_ioc, world_rank, &newContext->sf_group_comm);
+
+            if (status != MPI_SUCCESS)
+                goto err_exit;
+            status = MPI_Comm_size(newContext->sf_group_comm, &newContext->sf_group_size);
+            if (status != MPI_SUCCESS)
+                goto err_exit;
+            status = MPI_Comm_rank(newContext->sf_group_comm, &newContext->sf_group_rank);
+            if (status != MPI_SUCCESS)
+                goto err_exit;
+            /*
+             * There may be additional functionality we need for the IOCs...
+             * If so, then we can probably initialize those things here!
+             */
+        }
+        else {
+            newContext->sf_group_size = 1;
+            newContext->sf_group_rank = 0;
+        }
+    }
+    return 0;
+
+err_exit:
+    return -1;
+} /* end H5FD__init_subfile_context() */
+
+/*
+-------------------------------------------------------------------------
+  Programmer:  Richard Warren
+  Purpose:     Called as part of a file open operation, we initialize a
+               subfiling context which includes the application topology
+               along with other relevant info such as the MPI objects
+               (communicators) for communicating with IO concentrators.
+               We also identify which MPI ranks will have IOC threads
+               started on them.
+
+               We return a context ID via the 'sf_context' variable.
+
+  Errors:      returns an error if we detect any initialization errors,
+               including malloc failures or any resource allocation
+               problems.
+
+  Revision History -- Initial implementation
+-------------------------------------------------------------------------
+*/
+herr_t
+H5FDsubfiling_init(ioc_selection_t ioc_select_method, char *ioc_select_option, int64_t *sf_context)
+{
+    herr_t               ret_value = SUCCEED;
+    size_t               alloc_size;
+    int                  ioc_count;
+    int                  world_rank, world_size;
+    sf_topology_t *      thisApp    = NULL;
+    int                  file_index = active_map_entries();
+    int64_t              tag        = SF_CONTEXT;
+    int64_t              context_id = ((tag << 32) | file_index);
+    subfiling_context_t *newContext = (subfiling_context_t *)get__subfiling_object(context_id);
+    char *               envValue   = NULL;
+
+    FUNC_ENTER_API(FAIL)
+    H5TRACE3("e", "IO*s*!", ioc_select_method, ioc_select_option, sf_context);
+
+    if (MPI_Comm_size(MPI_COMM_WORLD, &world_size) != MPI_SUCCESS) {
+        HDputs("MPI_Comm_size returned an error");
+        ret_value = FAIL;
+        goto done;
+    }
+    if (MPI_Comm_rank(MPI_COMM_WORLD, &world_rank) != MPI_SUCCESS) {
+        HDputs("MPI_Comm_rank returned an error");
+        ret_value = FAIL;
+        goto done;
+    }
+
+    alloc_size = sizeof(sf_topology_t);
+    thisApp    = HDmalloc(alloc_size);
+    HDassert(thisApp);
+
+    HDmemset(thisApp, 0, alloc_size);
+
+    /* Compute the number and distribution map of the set of IO Concentrators */
+    if ((ioc_count = H5FD__determine_ioc_count(world_size, world_rank, ioc_select_method, ioc_select_option,
+                                               &thisApp)) <= 0) {
+        HDputs("Unable to register subfiling topology!");
+        ret_value = FAIL;
+        goto done;
+    }
+
+    newContext->sf_context_id = context_id;
+
+    /* Maybe set the verbose flag for more debugging info */
+    envValue = HDgetenv("H5_SF_VERBOSE_FLAG");
+    if (envValue != NULL) {
+        int check_value = atoi(envValue);
+        if (check_value > 0)
+            sf_verbose_flag = 1;
+    }
+
+    /* Maybe open client-side log files */
+    if (sf_verbose_flag) {
+        manage_client_logfile(world_rank, sf_verbose_flag);
+    }
+
+    if (H5FD__init_subfile_context(thisApp, ioc_count, world_rank, newContext) != SUCCEED) {
+        HDputs("Unable to initialize a subfiling context!");
+        ret_value = FAIL;
+        goto done;
+    }
+
+    if (context_id < 0) {
+        ret_value = FAIL;
+        goto done;
+    }
+
+    newContext->sf_base_addr = 0;
+    if (newContext->topology->rank_is_ioc) {
+        newContext->sf_base_addr =
+            (int64_t)(newContext->topology->subfile_rank * newContext->sf_stripe_size);
+    }
+    *sf_context = context_id;
+
+done:
+
+    FUNC_LEAVE_API(ret_value)
+    return ret_value;
+} /* end H5FDsubfiling_init() */
+
+/*-------------------------------------------------------------------------
+ * Function:    Public/Client H5FD__open_subfiles
+ *
+ * Purpose:     Wrapper for the internal 'open__subfiles' function.
+ *              Similar to the other public wrapper functions, we
+ *              discover (via the sf_context) the number of IO concentrators
+ *              and pass that to the internal function so that vector
+ *              storage arrays can be stack based rather than explicitly
+ *              allocated and freed.
+ *
+ *              The internal function is responsible for sending all IOC
+ *              instances the (sub)file open requests.
+ *
+ *              Prior to calling the internal open function, we initialize
+ *              a new subfiling context that contains topology info and
+ *              new MPI communicators that facilitate messaging between
+ *              HDF5 clients and the IOCs.
+ *
+ * Return:      Success (0) or Failure (non-zero)
+ * Errors:      If MPI operations fail for some reason.
+ *
+ * Programmer:  Richard Warren
+ *              7/17/2020
+ *
+ * Changes:     Initial Version/None.
+ *-------------------------------------------------------------------------
+ */
+int
+H5FD__open_subfiles(void *_config_info, uint64_t h5_file_id, int flags)
+{
+    int                  status;
+    int64_t              context_id = -1;
+    subfiling_context_t *sf_context = NULL;
+    ioc_selection_t      ioc_selection;
+    // char filepath[PATH_MAX];
+    // char *slash;
+    config_common_t *config_info = _config_info;
+    char *           option_arg  = get_ioc_selection_criteria(&ioc_selection);
+
+    HDassert(config_info);
+    /* Check to see who is calling the function:
+     * we only allow the ioc or subfiling VFDs
+     */
+    if ((config_info->magic != H5FD_IOC_FAPL_T_MAGIC) &&
+        (config_info->magic != H5FD_SUBFILING_FAPL_T_MAGIC)) {
+        HDputs("Unrecognized driver!");
+        return -1;
+    }
+
+    /* Initialize/identify IO Concentrators based on the
+     * config information that we have...
+     */
+    status = H5FDsubfiling_init(ioc_selection, option_arg, &context_id);
+    if (status != SUCCEED) {
+        HDputs("H5FDsubfiling_init failed!");
+        return -1;
+    }
+
+    /* For statistics gathering */
+    maybe_initialize_statistics();
+
+    /* Create a new context which is associated with
+     * this file (context_id)
+     */
+    sf_context = get__subfiling_object(context_id);
+    assert(sf_context != NULL);
+
+    /* Save some basic things in the new context */
+    config_info->context_id   = context_id;
+    sf_context->sf_fid        = 0;
+    sf_context->sf_context_id = context_id;
+    sf_context->h5_file_id    = h5_file_id;
+    sf_context->h5_filename   = strdup(config_info->file_path);
+    sf_context->sf_filename   = NULL;
+    /* Ensure that the IOC service won't exit
+     * as we prepare to start up.
+     */
+#if 0 /* JRM */ /* delete if all goes well */
+    H5FD_ioc_set_shutdown_flag(0);
+#else  /* JRM */
+    atomic_init(&sf_shutdown_flag, 0);
+#endif /* JRM */
+
+    /* If we're actually using the IOCs, we will
+     * start the service threads on the identified
+     * ranks as part of the subfile opening.
+     */
+    return open_subfile_with_context(sf_context, h5_file_id, flags);
+}
+
+/*-------------------------------------------------------------------------
+ * Function:    Public/Client H5FD__close_subfiles
+ *
+ * Purpose:     This is a simple wrapper function for the internal version
+ *              which actually manages all subfile closing via commands
+ *              to the set of IO Concentrators.
+ *
+ * Return:      Success (0) or Failure (non-zero)
+ * Errors:      If MPI operations fail for some reason.
+ *
+ * Programmer:  Richard Warren
+ *              7/17/2020
+ *
+ * Changes:     Initial Version/None.
+ *-------------------------------------------------------------------------
+ */
+int
+H5FD__close_subfiles(int64_t context_id)
+{
+    subfiling_context_t *sf_context = get__subfiling_object(context_id);
+    assert(sf_context != NULL);
+    return close__subfiles(sf_context, sf_context->h5_file_id);
+}
+
+/*-------------------------------------------------------------------------
+ * Function:    H5FD__subfiling__truncate_sub_files
+ *
+ * Note:        This code should be moved -- most likely to the IOC
+ *              code files.
+ *
+ * Purpose:     Apply a truncate operation to the sub-files.
+ *
+ *              In the context of the I/O concentrators, the eof must be
+ *              translated into the appropriate value for each of the
+ *              sub-files, and then applied to same.
+ *
+ *              Further, we must ensure that all prior I/O requests complete
+ *              before the truncate is applied.
+ *
+ *              We do this as follows:
+ *
+ *              1) Run a barrier on entry.
+ *
+ *              2) Determine if this rank is an IOC. If it is, compute
+ *                 the correct EOF for this sub-file, and send a truncate
+ *                 request to the IOC.
+ *
+ *              3) On the IOC thread, allow all pending I/O requests
+ *                 received prior to the truncate request to complete
+ *                 before performing the truncate.
+ *
+ *              4) Run a barrier on exit.
+ *
+ *              Observe that the barrier on entry ensures that any prior
+ *              I/O requests will have been queued before the truncate
+ *              request is sent to the IOC.
+ *
+ *              Similarly, the barrier on exit ensures that no subsequent
+ *              I/O request will reach the IOC before the truncate request
+ *              has been queued.
+ *
+ * Return:      SUCCEED/FAIL
+ *
+ * Programmer:  JRM -- 12/13/21
+ *
+ * Changes:     None.
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5FD__subfiling__truncate_sub_files(int64_t logical_file_eof, hid_t context_id)
+{
+    int                  mpi_code;             /* MPI return code */
+    MPI_Comm             comm = MPI_COMM_NULL; /* MPI Communicator, from plist */
+    subfiling_context_t *sf_context = NULL;
+    int64_t              msg[3]     = {
+        0,
+    };
+    herr_t ret_value = SUCCEED; /* Return value */
+
+    FUNC_ENTER_NOAPI_NOINIT
+
+    /* for now, set comm to MPI_COMM_WORLD.  This is incorrect -- should use
+     * the communicator supplied with the file open, or a copy thereof.
+     */
+    comm = MPI_COMM_WORLD;
+
+    /* Barrier on entry */
+#if 0 /* JRM */ /* delete this eventually */
+    HDfprintf(stdout, "\n\nH5FD__subfiling__truncate_sub_files: entering initial barrier.\n\n");
+    HDfflush(stdout);
+#endif /* JRM */ /* delete this eventually */
+    if (MPI_SUCCESS != (mpi_code = MPI_Barrier(comm)))
+        HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code)
+
+    if (NULL == (sf_context = (subfiling_context_t *)get__subfiling_object(context_id)))
+        HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "can't get subfile context")
+
+    /* Test to see if this rank is running an I/O concentrator. */
+
+    if (sf_context->topology->rank_is_ioc) {
+
+        int     i;
+        int64_t subfile_eof;
+        int64_t num_full_stripes;
+        int64_t partial_stripe_len;
+#ifndef NDEBUG
+        int64_t test_file_eof;
+#endif /* NDEBUG */
+
+        /* if it is, first compute the sub-file EOF */
+
+        num_full_stripes   = logical_file_eof / sf_context->sf_blocksize_per_stripe;
+        partial_stripe_len = logical_file_eof % sf_context->sf_blocksize_per_stripe;
+
+        subfile_eof = num_full_stripes * sf_context->sf_stripe_size;
+
+        if (sf_context->topology->subfile_rank < (partial_stripe_len / sf_context->sf_stripe_size)) {
+
+            subfile_eof += sf_context->sf_stripe_size;
+        }
+        else if (sf_context->topology->subfile_rank == (partial_stripe_len / sf_context->sf_stripe_size)) {
+
+            subfile_eof += partial_stripe_len % sf_context->sf_stripe_size;
+        }
+
+        /* sanity check -- compute the file eof using the same mechanism used to
+         * compute the sub-file eof.  Assert that the computed value and the
+         * actual value match.
+         *
+         * Do this only for debug builds -- probably delete this before release.
+         *
+         *                                           JRM -- 12/15/21
+         */
+
+#ifndef NDEBUG
+        test_file_eof = 0;
+
+        for (i = 0; i < sf_context->topology->n_io_concentrators; i++) {
+
+            test_file_eof += num_full_stripes * sf_context->sf_stripe_size;
+
+            if (i < (partial_stripe_len / sf_context->sf_stripe_size)) {
+
+                test_file_eof += sf_context->sf_stripe_size;
+            }
+            else if (i == (partial_stripe_len / sf_context->sf_stripe_size)) {
+
+                test_file_eof += partial_stripe_len % sf_context->sf_stripe_size;
+            }
+        }
+        HDassert(test_file_eof == logical_file_eof);
+#endif /* NDEBUG */
+
+#if 0 /* JRM */
+        HDfprintf(stdout, "\nH5FD__subfiling__truncate_sub_files: eof / sf_eof = %lld/%lld\n\n",
+                  (long long)logical_file_eof, (long long)subfile_eof);
+        HDfflush(stdout);
+#endif /* JRM */
+
+        /* then direct the IOC to truncate the sub-file to the correct EOF */
+
+        msg[0] = subfile_eof;
+        msg[1] = 0; /* padding -- not used in this message */
+        msg[2] = context_id;
+
+        if (MPI_SUCCESS != (mpi_code = MPI_Send(msg, 3, MPI_INT64_T, sf_context->topology->subfile_rank,
+                                                TRUNC_OP, sf_context->sf_msg_comm)))
+            HMPI_GOTO_ERROR(FAIL, "MPI_Send failed", mpi_code)
+    }
+
+    /* Barrier on exit */
+#if 0 /* JRM */ /* delete this eventually */
+    HDfprintf(stdout, "\n\nH5FD__subfiling__truncate_sub_files: entering final barrier.\n\n");
+    HDfflush(stdout);
+#endif /* JRM */ /* delete this eventually */
+    if (MPI_SUCCESS != (mpi_code = MPI_Barrier(comm)))
+        HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code)
+
+done:
+
+    FUNC_LEAVE_NOAPI(ret_value)
+
+} /* H5FD__subfiling__truncate_sub_files() */
+
+/*-------------------------------------------------------------------------
+ * Function:    H5FD__subfiling__get_real_eof
+ *
+ * Note:        This code should be moved -- most likely to the IOC
+ *              code files.
+ *
+ * Purpose:     Query each subfile to get its local EOF, and then use this
+ *              data to calculate the actual EOF.
+ *
+ *              Do this as follows:
+ *
+ *              1) allocate an array of int64_t of length equal to the
+ *                 number of IOCs, and initialize all fields to -1.
+ *
+ *              2) Send each IOC a message requesting that sub-file's EOF.
+ *
+ *              3) Await reply from each IOC, storing the reply in
+ *                 the appropriate entry in the array allocated in 1.
+ *
+ *              4) After all IOCs have replied, compute the offset of
+ *                 each subfile in the logical file.  Take the maximum
+ *                 of these values, and report this value as the overall
+ *                 EOF.
+ *
+ *              Note that this operation is not collective, and can return
+ *              invalid data if other ranks perform writes while this
+ *              operation is in progress.
+ *
+ * Return:      SUCCEED/FAIL
+ *
+ * Programmer:  JRM -- 1/18/22
+ *
+ * Changes:     None.
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5FD__subfiling__get_real_eof(int64_t *logical_eof_ptr, hid_t context_id)
+{
+    int                  i;
+    int                  reply_count;
+    int                  ioc_rank;
+    int                  mpi_code;           /* MPI return code */
+    int                  n_io_concentrators; /* copy of value in topology */
+    MPI_Status           status;
+    subfiling_context_t *sf_context  = NULL;
+    int64_t              msg[3]      = {0, 0, 0};
+    int64_t *            sf_eofs     = NULL; /* dynamically allocated array for sf EOFs */
+    int64_t              logical_eof = 0;
+    int64_t              sf_logical_eof;
+    herr_t               ret_value = SUCCEED; /* Return value */
+
+    FUNC_ENTER_NOAPI_NOINIT
+
+    HDassert(logical_eof_ptr);
+
+    if (NULL == (sf_context = (subfiling_context_t *)get__subfiling_object(context_id)))
+        HGOTO_ERROR(H5E_FILE, H5E_BADVALUE, FAIL, "can't get subfile context")
+
+    HDassert(sf_context->topology);
+
+    n_io_concentrators = sf_context->topology->n_io_concentrators;
+
+    HDassert(n_io_concentrators > 0);
+
+    /* 1) allocate an array of int64_t of length equal to the
+     *    number of IOCs, and initialize all fields to -1.
+     */
+    sf_eofs = (int64_t *)HDmalloc((size_t)n_io_concentrators * sizeof(int64_t));
+
+    if (sf_eofs == NULL)
+        HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, FAIL, "can't allocate sub-file EOFs array.");
+
+    for (i = 0; i < n_io_concentrators; i++) {
+
+        sf_eofs[i] = -1;
+    }
+
+    /* 2) Send each IOC a message requesting that sub-file's EOF. */
+    msg[0] = 0; /* padding -- not used in this message */
+    msg[1] = 0; /* padding -- not used in this message */
+    msg[2] = context_id;
+
+    for (i = 0; i < n_io_concentrators; i++) {
+
+        ioc_rank = sf_context->topology->io_concentrator[i];
+
+        if (MPI_SUCCESS !=
+            (mpi_code = MPI_Send(msg, 3, MPI_INT64_T, ioc_rank, GET_EOF_OP, sf_context->sf_msg_comm)))
+            HMPI_GOTO_ERROR(FAIL, "MPI_Send", mpi_code)
+    }
+
+    /* 3) Await reply from each IOC, storing the reply in
+     *    the appropriate entry in sf_eofs.
+     */
+    reply_count = 0;
+    while (reply_count < n_io_concentrators) {
+
+        if (MPI_SUCCESS != (mpi_code = MPI_Recv(msg, 3, MPI_INT64_T, MPI_ANY_SOURCE, GET_EOF_COMPLETED,
+                                                sf_context->sf_data_comm, &status))) {
+
+            HMPI_GOTO_ERROR(FAIL, "MPI_Recv", mpi_code)
+        }
+
+        ioc_rank = (int)msg[0];
+
+        HDassert(ioc_rank >= 0);
+        HDassert(ioc_rank < n_io_concentrators);
+        HDassert(sf_eofs[ioc_rank] == -1);
+
+        sf_eofs[ioc_rank] = msg[1];
+
+        reply_count++;
+    }
+
+    /* 4) After all IOCs have replied, compute the offset of
+     *    each subfile in the logical file.  Take the maximum
+     *    of these values, and report this value as the overall
+     *    EOF.
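+     *
+     *    As an illustrative example: with stripe size S and N IOCs,
+     *    a sub-file i whose local EOF is e contributes the candidate
+     *    value (e / S) * S * N, plus (i * S + e % S) when e is not a
+     *    multiple of S -- this mirrors the arithmetic in the loop below.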
+     */
+
+    for (i = 0; i < n_io_concentrators; i++) {
+
+        /* compute number of complete stripes */
+        sf_logical_eof = sf_eofs[i] / sf_context->sf_stripe_size;
+
+        /* multiply by stripe size */
+        sf_logical_eof *= sf_context->sf_stripe_size * n_io_concentrators;
+
+        /* if the sub-file doesn't end on a stripe size boundary, must add in a partial stripe */
+        if (sf_eofs[i] % sf_context->sf_stripe_size > 0) {
+
+            /* add in the size of the partial stripe up to but not including this subfile */
+            sf_logical_eof += i * sf_context->sf_stripe_size;
+
+            /* finally, add in the number of bytes in the last partial stripe depth in the sub-file */
+            sf_logical_eof += sf_eofs[i] % sf_context->sf_stripe_size;
+        }
+
+        if (sf_logical_eof > logical_eof) {
+
+            logical_eof = sf_logical_eof;
+        }
+    }
+
+#if 0 /* JRM */ /* delete this eventually */
+    HDfprintf(stdout, "\n\nH5FD__subfiling__get_real_eof: logical_eof = %lld\n\n", logical_eof);
+    HDfflush(stdout);
+#endif /* JRM */ /* delete this eventually */
+
+    *logical_eof_ptr = logical_eof;
+
+done:
+
+    FUNC_LEAVE_NOAPI(ret_value)
+
+} /* H5FD__subfiling__get_real_eof() */
diff --git a/src/H5FDsubfile_mpi.c b/src/H5FDsubfile_mpi.c
new file mode 100644
index 00000000000..5720565b0e7
--- /dev/null
+++ b/src/H5FDsubfile_mpi.c
@@ -0,0 +1,2846 @@
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ * Copyright by The HDF Group.                                               *
+ * Copyright by the Board of Trustees of the University of Illinois.         *
+ * All rights reserved.                                                      *
+ *                                                                           *
+ * This file is part of HDF5.  The full HDF5 copyright notice, including     *
+ * terms governing use, modification, and redistribution, is contained in    *
+ * the COPYING file, which can be found at the root of the source code       *
+ * distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases.  *
+ * If you do not have access to either file, you may request a copy from     *
+ * help@hdfgroup.org.                                                        *
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+#include "H5FDsubfiling.h"
+
+static int sf_close_file_count      = 0;
+static int sf_ops_after_first_close = 0;
+static int sf_enable_directIO       = 0;
+
+static int    sf_write_ops       = 0;
+static double sf_pwrite_time     = 0.0;
+static double sf_write_wait_time = 0.0;
+
+static int    sf_read_ops         = 0;
+static double sf_pread_time       = 0.0;
+static double sf_read_wait_time   = 0.0;
+static double sf_queue_delay_time = 0.0;
+
+/* The following is our basic template for a subfile filename.
+ * Note that eventually we shouldn't use 0_of_N since we
+ * intend to use the user-defined HDF5 filename for a
+ * zeroth subfile as well as for all metadata.
+ */
+#define SF_FILENAME_TEMPLATE ".subfile_%ld_%0*d_of_%d"
+static int *request_count_per_rank = NULL;
+
+atomic_int sf_workinprogress    = 0;
+atomic_int sf_work_pending      = 0;
+atomic_int sf_file_open_count   = 0;
+atomic_int sf_file_close_count  = 0;
+atomic_int sf_file_refcount     = 0;
+atomic_int sf_ioc_fini_refcount = 0;
+atomic_int sf_ioc_ready         = 0;
+atomic_int sf_shutdown_flag     = 0;
+#if 1 /* JRM */
+/* sf_io_ops_pending is used to track the number of I/O operations pending so that we can wait
+ * until all I/O operations have been serviced before shutting down the worker thread pool.
+ * The value of this variable must always be non-negative.
+ */
+atomic_int sf_io_ops_pending = 0;
+#endif /* JRM */
+
+/*
+ * Structure definitions to enable async io completions.
+ * We first define a structure which contains the basic
+ * input arguments for the functions which were originally
+ * invoked.  See below.
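+ *
+ * As an illustrative sketch: each outstanding client IO is wrapped
+ * in an io_req_t and linked into 'pending_io_requests'; its
+ * completion function is then polled until the MPI request held in
+ * io_args.io_req completes.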
+ */ +typedef struct _client_io_args { + int ioc; /* ID of the IO Concentrator handling this IO. */ + hid_t context_id; /* The context id provided for the read or write */ + int64_t offset; /* The file offset for the IO operation */ + int64_t elements; /* How many bytes */ + void * data; /* A pointer to the (contiguous) data segment */ + MPI_Request io_req; /* An MPI request to allow the code to loop while */ + /* making progress on multiple IOs */ +} io_args_t; + +/* pre-define */ +typedef struct _client_io_func io_func_t; + +struct _client_io_func { + int (*io_function)(void *this_io); /* pointer to a completion function */ + io_args_t io_args; /* arguments passed to the completion function */ + int pending; /* The function is complete (0) or pending (1)? */ +}; + +typedef struct _io_req { + struct _io_req *prev; /* A simple list structure containing completion */ + struct _io_req *next; /* functions. These should get removed as IO ops */ + io_func_t completion_func; /* are completed */ +} io_req_t; + +int n_io_pending = 0; +io_req_t pending_io_requests; + +typedef struct _client_xfer_info { + int64_t offset; + int64_t length; + int ioc_targets; + io_op_t op; +} client_xfer_info_t; + +typedef struct _xfer_info { + int64_t offset; + int64_t length; +} xfer_info_t; + +#define STAT_BLOCKSIZE 1024 +typedef struct _ioc_stats { + int read_index; + int read_size; + xfer_info_t *read_info; + int write_index; + int write_size; + xfer_info_t *write_info; +} ioc_stats_t; + +static ioc_stats_t ioc_xfer_records; + +int client_op_index = 0; +int client_op_size = 0; +client_xfer_info_t *client_ops = NULL; + +#if 1 /* JRM */ /* Find a better place for this */ +H5FD_ioc_io_queue_t io_queue_g = { + /* magic = */ H5FD_IOC__IO_Q_MAGIC, + /* q_head = */ NULL, + /* q_tail = */ NULL, + /* num_pending = */ 0, + /* num_in_progress = */ 0, + /* q_len = */ 0, + /* req_counter = */ 0, + /* q_mutex = */ + PTHREAD_MUTEX_INITIALIZER +#if H5FD_IOC__COLLECT_STATS + /* comma to allow further initializers */, + /* max_q_len = */ 0, + /* max_num_pending = */ 0, + /* max_num_in_progress = */ 0, + /* ind_read_requests = */ 0, + /* ind_write_requests = */ 0, + /* truncate_requests = */ 0, + /* requests_queued = */ 0, + /* requests_dispatched = */ 0, + /* requests_completed = */ 0 +#endif /* H5FD_IOC__COLLECT_STATS */ +}; +#endif /* JRM */ /* Find a better place for this */ + +/* const char *sf_subfile_prefix = "."; */ + +#if 0 /* JRM */ +#define MAX_WORK_PER_RANK 2 +#else /* JRM */ +#define MAX_WORK_PER_RANK 4 /* just to see if this changes anything */ +#endif /* JRM */ +#define K(n) ((n)*1024) +#define M(n) ((n) * (1024 * 1024)) +#define DEFAULT_STRIPE_SIZE M(32) +#define MAX_DEPTH 1024 + +/* +========================================= +Private functions +========================================= +*/ + +static inline void * +cast_to_void(const void *data) +{ + union { + const void *const_ptr_to_data; + void * ptr_to_data; + } eliminate_const_warning; + eliminate_const_warning.const_ptr_to_data = data; + return eliminate_const_warning.ptr_to_data; +} +static char *get_ioc_subfile_path(int ioc, int ioc_count, subfiling_context_t *sf_context); +static int async_completion(void *arg); + +static int +numDigits(int n) +{ + if (n < 0) + n = (n == INT_MIN) ? 
INT_MAX : -n;
+    if (n < 10)
+        return 1;
+    if (n < 100)
+        return 2;
+    if (n < 1000)
+        return 3;
+    if (n < 10000)
+        return 4;
+    if (n < 100000)
+        return 5;
+    if (n < 1000000)
+        return 6;
+    if (n < 10000000)
+        return 7;
+    if (n < 100000000)
+        return 8;
+    if (n < 1000000000)
+        return 9;
+    return 10;
+}
+
+/* ===================================================================== */
+/* MPI_Datatype Creation functions.
+ * These are categorized by usage patterns, i.e. when data is sent to or
+ * received from an IOC, the initial data offset provided by the user
+ * may or may NOT start on a stripe boundary.  Because of this, the initial
+ * data segment to the selected IOC will often be less than 'stripe_size'
+ * in length.  The purpose of these Datatype creation functions is to
+ * enable the gathering of all data from this client to the IOC target
+ * into a single MPI message.  The MPI datatype will then be utilized by
+ * the sending function to pack data into a contiguous block of memory
+ * which enables the IOC to write to disk in an effective manner.
+ * ===================================================================== */
+
+/*-------------------------------------------------------------------------
+ * Function:    H5FD__create_first_mpi_type
+ *
+ * Purpose:     Return an appropriate MPI datatype to represent the initial
+ *              IO operation when reading or writing data to or from an IO
+ *              Concentrator (IOC).
+ *
+ *              If the 'first_io' is sufficient to complete the IO to the
+ *              IOC, then the returned MPI datatype will simply be MPI_BYTE.
+ *              For all other non-zero length IO operations, we create a
+ *              derived MPI datatype using MPI_Type_indexed. The 'ioc_depth'
+ *              input will define the number of blocks/disps pairs that are
+ *              required to represent the desired IO operation.
+ *
+ * Return:      The MPI_Datatype that will be used to send or receive data.
+ * Errors:      MPI_Type_NULL if, for any reason, the MPI_Datatype creation
+ *              fails.
+ *
+ * Programmer:  Richard Warren
+ *              7/17/2020
+ *
+ * Changes:     Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/* Fill the output vectors 'io_offset', 'io_datasize' and 'io_f_offset'.
+ * All calculations are in terms of bytes.
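+ *
+ * As an illustrative example, for an IO that starts mid-stripe,
+ * io_datasize[0] = first_io (less than stripe_size), and subsequent
+ * entries describe full stripes of length stripe_size spaced at
+ * sf_blocksize_per_stripe intervals in the logical file.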
+ */
+static void
+H5FD__create_first_mpi_type(subfiling_context_t *context, int ioc_depth, int64_t src_offset,
+                            int64_t target_datasize, int64_t f_offset, int64_t *io_offset,
+                            int64_t *io_datasize, int64_t *io_f_offset, int64_t first_io)
+{
+    int64_t stripe_size          = context->sf_stripe_size;
+    int64_t blocksize_per_stripe = context->sf_blocksize_per_stripe;
+    int64_t offset_in_stripe     = f_offset % stripe_size;
+    int64_t next_offset          = blocksize_per_stripe - offset_in_stripe;
+    int64_t total_bytes          = first_io;
+
+    io_offset[0]   = src_offset;
+    io_datasize[0] = first_io;
+    io_f_offset[0] = f_offset;
+#ifdef VERBOSE
+    printf("[%s] 0: mem_offset=%ld, datasize=%ld, f_offset=%ld\n", __func__, src_offset, first_io, f_offset);
+    fflush(stdout);
+#endif
+    if (first_io == target_datasize) {
+        return;
+    }
+    if (first_io) {
+        int k;
+        f_offset += (blocksize_per_stripe - offset_in_stripe);
+        for (k = 1; k <= ioc_depth; k++) {
+            io_offset[k]   = next_offset;
+            io_datasize[k] = stripe_size;
+            io_f_offset[k] = f_offset;
+            total_bytes += stripe_size;
+#ifdef VERBOSE
+            printf("[%s] %d: mem_offset=%ld, datasize=%ld, f_offset=%ld\n", __func__, k, next_offset,
+                   stripe_size, f_offset);
+            fflush(stdout);
+#endif
+            f_offset += context->sf_blocksize_per_stripe;
+            next_offset += context->sf_blocksize_per_stripe;
+        }
+        if (total_bytes != target_datasize) {
+            printf("Warning (%s): total_SUM(%ld) != target_bytes(%ld)\n", __func__, total_bytes,
+                   target_datasize);
+        }
+    }
+    return;
+} /* end H5FD__create_first_mpi_type() */
+
+/*-------------------------------------------------------------------------
+ * Function:    H5FD__create_final_mpi_type
+ *
+ * Purpose:     Return an appropriate MPI datatype to represent the final
+ *              IO operation when reading or writing data to or from an IO
+ *              Concentrator (IOC).
+ *
+ *              The data that we're sending to an IO concentrator (IOC)
+ *              contains the final collection of bytes. Other than that
+ *              detail, this is pretty much like the 'typical' IO case,
+ *              i.e. all block sizes are identical (except for the very
+ *              last block). Furthermore, they all start at a relative
+ *              stripe offset of 0, in other words on a 'stripe_size'
+ *              boundary.
+ *
+ * Return:      The MPI_Datatype that will be used to send or receive data.
+ * Errors:      MPI_Type_NULL if for any reason, the MPI_Datatype creation
+ *              fails.
+ *
+ * Programmer:  Richard Warren
+ *              7/17/2020
+ *
+ * Changes:     Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/* Fill the output vectors 'io_offset', 'io_datasize' and 'io_f_offset'.
+ * All calculations are in terms of bytes.
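+ *
+ * Illustrative example (values assumed): if the overall request ends
+ * partway into a stripe, the caller passes last_io = final_offset %
+ * stripe_size; this IOC then receives (ioc_depth - 1) full-stripe
+ * segments followed by a single short segment of last_io bytes.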
+ */ +static void +H5FD__create_final_mpi_type(subfiling_context_t *context, int ioc_depth, int64_t src_offset, + int64_t target_datasize, int64_t f_offset, int64_t *io_offset, + int64_t *io_datasize, int64_t *io_f_offset, int64_t last_io) +{ + int64_t stripe_size = context->sf_stripe_size; + int64_t blocksize_per_stripe = context->sf_blocksize_per_stripe; + int64_t next_offset = src_offset; + int64_t total_bytes = 0; + + if (last_io == target_datasize) { + io_offset[0] = src_offset; + io_f_offset[0] = f_offset; + io_datasize[0] = last_io; +#ifdef VERBOSE + printf("[%s] 0: mem_offset=%ld, datasize=%ld, f_offset=%ld\n", __func__, src_offset, last_io, + f_offset); + fflush(stdout); +#endif + return; + } + + if (last_io) { + int i, k; + for (k = 0, i = 1; i < ioc_depth; i++) { + io_offset[k] = next_offset; + io_datasize[k] = stripe_size; + io_f_offset[k] = f_offset; +#ifdef VERBOSE + printf("[%s] %d: mem_offset=%ld, datasize=%ld, f_offset=%ld\n", __func__, k, next_offset, + stripe_size, f_offset); + fflush(stdout); +#endif + k++; + total_bytes += stripe_size; + f_offset += blocksize_per_stripe; + next_offset += context->sf_blocksize_per_stripe; + } + + io_datasize[k] = last_io; + io_offset[k] = next_offset; + io_f_offset[k] = f_offset; + total_bytes += last_io; + + if (total_bytes != target_datasize) { + printf("Warning (%s): total_SUM(%ld) != target_bytes(%ld)\n", __func__, total_bytes, + target_datasize); + } + } + return; +} /* end H5FD__create_final_mpi_type() */ + +/*------------------------------------------------------------------------- + * Function: H5FD__create_f_l_mpi_type + * + * Purpose: Return an appropriate MPI datatype which includes both the + * first and final IO data segments. + * + * A special case where the current IOC has both the first and + * final write blocks. This function is basically a merge of + * the first_mpi_type and final_mpi_type functions. + * + * Return: The MPI_Datatype that will be used to send or receive data. + * Errors: MPI_Type_NULL if for any reason, the MPI_Datatype creation + * fails. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. 
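+ *
+ *              A minimal worked example (values assumed, not from the
+ *              original patch): with a 1 MiB stripe, a request that
+ *              starts 256 KiB into one of this IOC's stripes and ends
+ *              128 KiB into a later stripe of the same IOC produces the
+ *              segment list [768 KiB, one or more 1 MiB segments,
+ *              128 KiB].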
+ * + *------------------------------------------------------------------------- + */ + +static void +H5FD__create_f_l_mpi_type(subfiling_context_t *context, int ioc_depth, int64_t src_offset, + int64_t target_datasize, int64_t f_offset, int64_t *io_offset, int64_t *io_datasize, + int64_t *io_f_offset, int64_t first_io, int64_t last_io) +{ + int64_t stripe_size = context->sf_stripe_size; + int64_t blocksize_per_stripe = context->sf_blocksize_per_stripe; + int64_t offset_in_stripe = f_offset % stripe_size; + int64_t next_offset = blocksize_per_stripe - offset_in_stripe; + int64_t total_bytes = first_io; + + io_offset[0] = src_offset; + io_datasize[0] = first_io; + io_f_offset[0] = f_offset; + +#ifdef VERBOSE + printf("[%s] 0: mem_offset=%ld, datasize=%ld, f_offset=%ld\n", __func__, src_offset, first_io, f_offset); + fflush(stdout); +#endif + if (total_bytes == target_datasize) { + return; + } + + if (total_bytes) { + int k; + f_offset += (blocksize_per_stripe - offset_in_stripe); + for (k = 1; k < ioc_depth; k++) { + io_offset[k] = next_offset; + io_datasize[k] = stripe_size; + io_f_offset[k] = f_offset; + total_bytes += stripe_size; +#ifdef VERBOSE + printf("[%s] %d: mem_offset=%ld, datasize=%ld, f_offset=%ld\n", __func__, k, next_offset, + stripe_size, f_offset); + fflush(stdout); +#endif + f_offset += blocksize_per_stripe; + next_offset += blocksize_per_stripe; + } + io_datasize[ioc_depth] = last_io; + io_f_offset[ioc_depth] = f_offset; + io_offset[ioc_depth] = next_offset; +#ifdef VERBOSE + printf("[%s] %d: mem_offset=%ld, datasize=%ld, f_offset=%ld\n", __func__, k, next_offset, last_io, + f_offset); + fflush(stdout); +#endif + total_bytes += last_io; + + if (total_bytes != target_datasize) { + printf("Warning (%s): total_SUM(%ld) != target_bytes(%ld)\n", __func__, total_bytes, + target_datasize); + } + } + return; +} /* end H5FD__create_f_l_mpi_type() */ + +/*------------------------------------------------------------------------- + * Function: H5FD__create_mpi_uniform_type + * + * Purpose: Return an appropriate MPI datatype to represent the typical + * IO operation when reading or writing data to or from an IO + * Concentrator (IOC). + * + * Each data segment is of 'stripe_size' length and will be + * separated from a previous or following segment by + * 'sf_blocksize_per_stripe' bytes of data. + * + * Return: The MPI_Datatype that will be used to send or receive data. + * Errors: MPI_Type_NULL if for any reason, the MPI_Datatype creation + * fails. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. 
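+ *
+ *              Sketch (values assumed): with 4 IOCs, a 1 MiB stripe,
+ *              and assuming sf_blocksize_per_stripe is stripe_size
+ *              times the IOC count (4 MiB), consecutive segments for
+ *              one IOC are each 1 MiB long and start 4 MiB apart in
+ *              the offsets computed here.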
+ *
+ *-------------------------------------------------------------------------
+ */
+static void
+H5FD__create_mpi_uniform_type(subfiling_context_t *context, int ioc_depth, int64_t src_offset,
+                              int64_t target_datasize, int64_t f_offset, int64_t *io_offset,
+                              int64_t *io_datasize, int64_t *io_f_offset)
+{
+    int64_t stripe_size          = context->sf_stripe_size;
+    int64_t blocksize_per_stripe = context->sf_blocksize_per_stripe;
+    int64_t next_offset          = src_offset + blocksize_per_stripe;
+    int64_t total_bytes          = 0;
+
+    io_offset[0]   = src_offset;
+    io_datasize[0] = stripe_size;
+    io_f_offset[0] = f_offset;
+    if (target_datasize == 0) {
+#if 0
+        printf("[%s] 0: datasize=0\n", __func__);
+        fflush(stdout);
+#endif
+        io_datasize[0] = 0;
+        return;
+    }
+
+#if 0
+    printf("[%s] 0: mem_offset=%ld, datasize=%ld, f_offset=%ld\n",
+        __func__, src_offset, stripe_size, f_offset);
+    fflush(stdout);
+#endif
+
+    f_offset += blocksize_per_stripe;
+    total_bytes = stripe_size;
+
+    if (target_datasize > stripe_size) {
+        int k;
+        for (k = 1; k < ioc_depth; k++) {
+            io_offset[k]   = next_offset;
+            io_datasize[k] = stripe_size;
+            io_f_offset[k] = f_offset;
+#if 0
+            printf("[%s] %d: mem_offset=%ld, datasize=%ld, f_offset=%ld\n",
+                __func__, k, next_offset, stripe_size, f_offset);
+            fflush(stdout);
+#endif
+            total_bytes += stripe_size;
+            f_offset += blocksize_per_stripe;
+            next_offset += blocksize_per_stripe;
+        }
+
+        if (total_bytes != target_datasize) {
+            printf("Warning (%s): total_SUM(%ld) != target_bytes(%ld)\n", __func__, total_bytes,
+                   target_datasize);
+        }
+    }
+    return;
+} /* end H5FD__create_mpi_uniform_type() */
+
+/*-------------------------------------------------------------------------
+ * Function:    init__indep_io
+ *
+ * Purpose:     Utility function to initialize the set of IO transactions
+ *              used to communicate with IO concentrators for read and write
+ *              IO operations.
+ *
+ * Return:      A filled set of vectors. As a consequence of not allowing
+ *              use of MPI derived datatypes in the VFD layer, we need to
+ *              accommodate the possibility that large IO transactions will
+ *              be required to use multiple IOs per IOC.
+ *
+ *              Example: Using 4 IOCs, each with 1M stripe-depth; when
+ *              presented an IO request for 8MB then at a minimum each IOC
+ *              will require 2 IOs of 1MB each. Depending on the starting
+ *              file offset, the 2 IOs can instead be 3...
+ *
+ *              To fully describe the IO transactions for reads and writes,
+ *              we thus use a return type where each IOC vector element is
+ *              itself a vector whose length corresponds to the max number
+ *              of IO transactions per IOC. In the example above, these
+ *              vector lengths can be 2 or 3. The actual length is
+ *              determined by the 'container_depth' variable.
+ *
+ *              For IO operations which involve a subset of IO concentrators,
+ *              the vector entries for the unused IOCs will have lengths of
+ *              zero and MPI NULL datatypes. The 'container_depth' in this
+ *              case will always be 1.
+ *
+ * Return value: The vector "depth" or max number of IOs per IOC.
+ *
+ * Errors:      Cannot fail.
+ *
+ * Programmer:  Richard Warren
+ *              7/17/2020
+ *
+ * Changes:     Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+int
+init__indep_io(void *_sf_context, size_t maxdepth, int H5_ATTR_PARALLEL_UNUSED ioc_total,
+               int64_t *sf_source_data_offset, int64_t *sf_datasize, int64_t *sf_offset, int *first_index,
+               int *n_containers, int64_t offset, int64_t elements, int dtype_extent)
+{
+    subfiling_context_t *sf_context      = _sf_context;
+    int                  container_count = sf_context->topology->n_io_concentrators;
+    int64_t              stripe_size     = sf_context->sf_stripe_size;
+    int64_t              data_size       = elements * dtype_extent;
+
+    int64_t start_id         = offset / stripe_size;
+    int64_t offset_in_stripe = offset % sf_context->sf_blocksize_per_stripe;
+    int64_t container_offset = offset % stripe_size;
+    int64_t start_length     = MIN(data_size, (stripe_size - container_offset));
+    int64_t start_row        = start_id / container_count;
+    int64_t ioc_start        = start_id % container_count;
+    int64_t final_offset     = offset + data_size;
+    int64_t final_id         = final_offset / stripe_size;
+    int64_t final_length     = (start_length == data_size ? 0 : final_offset % stripe_size);
+    int64_t ioc_final        = final_id % container_count;
+    int64_t container_bytes = 0, total_bytes = 0;
+    int64_t source_offset = 0;
+
+    int     row_id_start = (int)(start_id - ioc_start);
+    int     row_id_final = (int)(final_id - ioc_final);
+    int     i, k, depth = ((row_id_final - row_id_start) / container_count) + 1;
+    int     container_id = (int)start_id;
+    int64_t row_offset   = (int64_t)(start_row * stripe_size);
+
+    *first_index = (int)ioc_start;
+
+    /* Given the IO parameters, we loop through the set of IOCs
+     * to determine the various vector components for each.
+     * Those IOCs whose datasize is zero (0) will not have
+     * IO requests passed to them.
+     */
+
+    for (i = 0, k = (int)ioc_start; i < container_count; i++) {
+        /* We use 'output_offset' as an index into a linear
+         * version of a 2D array. In 'C' the last subscript
+         * is the one that varies most rapidly.
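+         * (The flattened index of conceptual element [k][d] is
+         * therefore k * maxdepth + d; 'output_offset' below is this
+         * index for d == 0.)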
+ * In our case, the 2D array is represented as + * array[ container_count ][ maxdepth ] + */ + size_t depthsize = maxdepth * sizeof(int64_t); /* ONLY used for memset */ + size_t output_offset = (size_t)(k)*maxdepth; + int container_depth = depth; + + hbool_t is_first = false, is_last = false; + int64_t *__sf_source_data_offset = sf_source_data_offset + output_offset; + int64_t *__sf_datasize = sf_datasize + output_offset; + int64_t *__sf_offset = sf_offset + output_offset; + + memset(__sf_source_data_offset, 0, depthsize); + memset(__sf_datasize, 0, depthsize); + memset(__sf_offset, 0, depthsize); + + container_bytes = 0; + + if (total_bytes == data_size) { + *n_containers = i; + return depth + 1; + } + if (total_bytes < data_size) { + if (k == ioc_start) { + is_first = true; + container_bytes = start_length; + container_depth--; /* Account for the start_length */ + if (ioc_final < ioc_start) { + container_depth--; + depth--; + } + } + if (k == ioc_final) { + is_last = true; + container_bytes += final_length; + if (container_depth) + container_depth--; /* Account for the final_length */ + if (depth) + depth--; + } + container_bytes += container_depth * stripe_size; + total_bytes += container_bytes; + } + + __sf_source_data_offset[0] = source_offset; + __sf_datasize[0] = container_bytes; + __sf_offset[0] = row_offset + offset_in_stripe; + + if (container_count == 1) { + } + else { + /* Fill the IO datatypes */ + if (is_first) { + if (is_last) { /* First + Last */ + H5FD__create_f_l_mpi_type(sf_context, container_depth + 1, source_offset, container_bytes, + row_offset + offset_in_stripe, __sf_source_data_offset, + __sf_datasize, __sf_offset, start_length, final_length); + } + else { /* First ONLY */ + H5FD__create_first_mpi_type(sf_context, container_depth, source_offset, container_bytes, + row_offset + offset_in_stripe, __sf_source_data_offset, + __sf_datasize, __sf_offset, start_length); + } + /* Move the memory pointer to the starting location + * for next IOC request. + */ + source_offset += start_length; + } + else if (is_last) { /* Last ONLY */ + H5FD__create_final_mpi_type(sf_context, container_depth, source_offset, container_bytes, + row_offset + offset_in_stripe, __sf_source_data_offset, + __sf_datasize, __sf_offset, final_length); + /* Probably not needed... */ + source_offset += stripe_size; + } + else { /* Everything else (uniform) */ + H5FD__create_mpi_uniform_type(sf_context, container_depth, source_offset, container_bytes, + row_offset + offset_in_stripe, __sf_source_data_offset, + __sf_datasize, __sf_offset); + source_offset += stripe_size; + } + } + + k++; + offset_in_stripe += __sf_datasize[0]; + container_id++; + + if (k == container_count) { + k = 0; + offset_in_stripe = 0; + depth = ((row_id_final - container_id) / container_count) + 1; + row_offset += sf_context->sf_blocksize_per_stripe; + } + } + if (total_bytes != data_size) { + printf("Error: total_bytes != data_size\n"); + } + + *n_containers = container_count; + return depth + 1; +} /* end init__indep_io() */ + +/*------------------------------------------------------------------------- + * Function: Internal read__independent_async + * + * Purpose: The IO operations can be striped across a selection of + * IO concentrators. The read and write independent calls + * compute the group of 1 or more IOCs and further create + * derived MPI datatypes when required by the size of the + * contiguous read or write requests. 
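+ *
+ *              (Illustrative arithmetic, matching the code below: the
+ *              target IOC is start_id % n_io_concentrators, where
+ *              start_id = offset / stripe_size; the file offset sent to
+ *              that IOC is (offset % stripe_size) + ioc_row * stripe_size,
+ *              with ioc_row = start_id / n_io_concentrators.)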
+ *
+ *              IOC(0) contains the logical data storage for file offset
+ *              zero and all offsets that reside within modulo range of
+ *              the subfiling stripe_size.
+ *
+ *              We cycle through all 'n_io_concentrators' and send a
+ *              descriptor to each IOC that has a non-zero sized IO
+ *              request to fulfill.
+ *
+ *              Sending descriptors to an IOC usually gets an ACK or
+ *              NACK in response. For the read operations, we post
+ *              async READs to receive the file data and wait until
+ *              all pending operations have completed.
+ *
+ * Return:      Success (0) or Failure (non-zero)
+ * Errors:      If MPI operations fail for some reason.
+ *
+ * Programmer:  Richard Warren
+ *              7/17/2020
+ *
+ * Changes:     Initial Version/None.
+ *-------------------------------------------------------------------------
+ */
+#define WORLD_SIZE(ctx) ((ctx)->topology->app_layout->world_size)
+#define WORLD_RANK(ctx) ((ctx)->topology->app_layout->world_rank)
+
+static int
+read__independent_async(int n_io_concentrators, hid_t context_id, int64_t offset, int64_t elements,
+                        int H5_ATTR_PARALLEL_UNUSED dtype_extent, void *data, io_req_t **io_req)
+{
+    int       status = 0;
+    int64_t   stripe_size, ioc_row, start_id, ioc_start, ioc_offset;
+    int *     io_concentrator = NULL;
+    io_req_t *sf_io_request   = NULL;
+    int64_t   msg[3]          = {
+        0,
+    };
+
+    subfiling_context_t *sf_context = get__subfiling_object(context_id);
+    assert(sf_context != NULL);
+
+    /* Calculate the IOC that we'll send the IO request to */
+    stripe_size = sf_context->sf_stripe_size;
+
+    start_id   = offset / stripe_size;
+    ioc_row    = start_id / n_io_concentrators;
+    ioc_offset = (offset % stripe_size) + (ioc_row * stripe_size);
+
+    ioc_start = start_id % n_io_concentrators;
+
+    io_concentrator = sf_context->topology->io_concentrator;
+    assert(io_concentrator != NULL);
+
+    /* Make sure that we can return a request structure
+     * if everything is working correctly
+     */
+    assert(io_req);
+
+    /* Prepare an IO request.
+     * This gets sent to the ioc identified by the file offset
+     */
+    msg[0] = elements;
+    msg[1] = ioc_offset;
+    msg[2] = context_id;
+#ifdef VERBOSE
+    printf("[%s ioc(%ld)] elements=%ld, offset=%ld, file_offset=%ld\n", __func__, ioc_start, elements, offset,
+           ioc_offset);
+    fflush(stdout);
+#endif
+    status = MPI_Send(msg, 3, MPI_INT64_T, io_concentrator[ioc_start], READ_INDEP, sf_context->sf_msg_comm);
+
+    if (status != MPI_SUCCESS) {
+        int  len;
+        char estring[MPI_MAX_ERROR_STRING];
+        MPI_Error_string(status, estring, &len);
+        printf("[%d] ERROR! MPI_Send request header (%zu) "
+               "bytes to %d returned an error(%s)\n",
+               WORLD_RANK(sf_context), sizeof(msg), io_concentrator[ioc_start], estring);
+        fflush(stdout);
+        return -1;
+    }
+
+    /* At this point in the new implementation, we should queue
+     * the async recv so that when the top level VFD tells us
+     * to complete all pending IO requests, we have all the info
+     * we need to accomplish that.
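+     *
+     * A minimal sketch of such a completion pass (illustrative only,
+     * using the 'pending_io_requests' list declared at the top of this
+     * file):
+     *
+     *     io_req_t *req;
+     *     for (req = pending_io_requests.next; req != NULL; req = req->next)
+     *         if (req->completion_func.pending)
+     *             progress_this_pending_io(req);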
+ */
+    sf_io_request = (io_req_t *)malloc(sizeof(io_req_t));
+    assert(sf_io_request);
+
+    sf_io_request->completion_func.io_args.ioc        = (int)ioc_start;
+    sf_io_request->completion_func.io_args.context_id = context_id;
+    sf_io_request->completion_func.io_args.offset     = offset;
+    sf_io_request->completion_func.io_args.elements   = elements;
+    sf_io_request->completion_func.io_args.data       = data;
+    sf_io_request->completion_func.io_args.io_req     = MPI_REQUEST_NULL;
+    sf_io_request->completion_func.io_function        = async_completion;
+    sf_io_request->completion_func.pending            = 0;
+
+    sf_io_request->prev = sf_io_request->next = NULL;
+    /* Start the actual data transfer */
+
+    status = MPI_Irecv(data, (int)elements, MPI_BYTE, io_concentrator[ioc_start], READ_INDEP_DATA,
+                       sf_context->sf_data_comm, &sf_io_request->completion_func.io_args.io_req);
+
+    if (status == MPI_SUCCESS) {
+        sf_io_request->completion_func.pending = 1;
+        *io_req = sf_io_request;
+    }
+    else {
+        puts("MPI_Irecv must have failed!");
+        free(sf_io_request);
+        *io_req = NULL;
+    }
+
+    return status;
+} /* end read__independent_async() */
+
+/*-------------------------------------------------------------------------
+ * Function:    get_ioc_subfile_path
+ *
+ * Purpose:     We provide a utility function to generate a subfiling
+ *              filename from a template. While the user provides a
+ *              name which will serve as the HDF5 file name, sub-filing
+ *              files are related to the user filename via the filesystem
+ *              inode identifier. The inode id can be utilized as a
+ *              globally unique identifier (GUID) which provides a
+ *              grouping ID to easily distinguish subfiles.
+ *
+ *              The inode_id is contained in the 'sf_context' structure.
+ *
+ * Return:      A full filepath which should be copied, e.g. using strdup.
+ *-------------------------------------------------------------------------
+ */
+static char *
+get_ioc_subfile_path(int ioc, int ioc_count, subfiling_context_t *sf_context)
+{
+    static char filepath[PATH_MAX];
+    char *      subfile_dir = NULL;
+    char *      prefix      = sf_context->subfile_prefix;
+
+    int numD = numDigits(ioc_count);
+    if (prefix != NULL) {
+        sprintf(filepath, "%s/" SF_FILENAME_TEMPLATE, prefix, sf_context->h5_file_id, numD, ioc, ioc_count);
+    }
+    else {
+        strcpy(filepath, sf_context->h5_filename);
+        subfile_dir = strrchr(filepath, '/');
+        assert(subfile_dir);
+        sprintf(subfile_dir + 1, SF_FILENAME_TEMPLATE, sf_context->h5_file_id, numD, ioc, ioc_count);
+    }
+    return filepath;
+} /* end get_ioc_subfile_path() */
+
+/*-------------------------------------------------------------------------
+ * Utility functions in support of a first pass attempt at handling async
+ * IO. The working assumption is that reads and writes to a collection
+ * of IO Concentrators (IOCs) will proceed by stages. In the first stage,
+ * each MPI rank will get its individual IOs started by prepping the IOC
+ * with a message which indicates (via the MPI tag) what operation is
+ * starting, along with the file offset, data size, and a context_id.
+ * The latter will be used to access the actual open file descriptor.
+ *
+ *-------------------------------------------------------------------------
+ * Function:    progress_this_pending_io
+ *
+ * Purpose:     In this initial example, we can progress an individual
+ *              IO request which is described by the io_req_t input arg.
+ *
+ * Return:      an integer status. Zero(0) indicates success. Negative
+ *              values (-1) indicate an error.
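+ *
+ *              Usage sketch (illustrative): a caller holding a pending
+ *              io_req_t can call progress_this_pending_io(req); the call
+ *              simply forwards to the stored completion function (e.g.
+ *              async_completion below), which polls MPI_Testsome until
+ *              the associated request(s) complete.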
+ *-------------------------------------------------------------------------
+ */
+static int
+progress_this_pending_io(io_req_t *this_req)
+{
+    assert(this_req);
+    assert(this_req->completion_func.io_function);
+    return (*this_req->completion_func.io_function)(&this_req->completion_func);
+}
+
+/*-------------------------------------------------------------------------
+ * Function:    write_data
+ *
+ * Purpose:     Given an io_func_t structure containing the function pointer
+ *              and its input arguments, we write the supplied data out
+ *              asynchronously using MPI_Isend to the appropriate IOC.
+ *
+ * Return:      an integer status. Zero(0) indicates success. Negative
+ *              values (-1) indicate an error.
+ *-------------------------------------------------------------------------
+ */
+static int
+write_data(io_func_t *this_func)
+{
+    int                  ioc, status;
+    int64_t              elements;
+    void *               data;
+    int *                io_concentrator = NULL;
+    subfiling_context_t *sf_context      = NULL;
+    assert(this_func);
+
+    sf_context = get__subfiling_object(this_func->io_args.context_id);
+
+    assert(sf_context);
+
+    io_concentrator = sf_context->topology->io_concentrator;
+    ioc             = this_func->io_args.ioc;
+    /* Pull the buffer and length out of the request args; without
+     * these assignments, 'data' and 'elements' would be used
+     * uninitialized in the MPI_Isend() below.
+     */
+    data     = this_func->io_args.data;
+    elements = this_func->io_args.elements;
+
+    status = MPI_Isend(data, (int)elements, MPI_BYTE, io_concentrator[ioc], WRITE_INDEP_DATA,
+                       sf_context->sf_data_comm, &this_func->io_args.io_req);
+    return status;
+}
+
+/*-------------------------------------------------------------------------
+ * Function:    async_completion
+ *
+ * Purpose:     Given a single io_func_t structure containing the function
+ *              pointer and its input arguments and a single MPI_Request
+ *              argument which needs to be completed, we make progress
+ *              by calling MPI_Test. In this initial example, we loop
+ *              until the request is completed as indicated by a non-zero
+ *              flag variable.
+ *
+ *              As we go further with the implementation, we anticipate that
+ *              rather than testing a single request variable, we will
+ *              deal with a collection of all pending IO requests (on
+ *              this rank).
+ *
+ * Return:      an integer status. Zero(0) indicates success. Negative
+ *              values (-1) indicate an error.
+ *-------------------------------------------------------------------------
+ */
+static int
+async_completion(void *arg)
+{
+    struct async_arg {
+        int          n_reqs;
+        MPI_Request *sf_reqs;
+    } *in_progress = (struct async_arg *)arg;
+
+    assert(arg);
+    int        status, errors = 0;
+    int        count     = in_progress->n_reqs;
+    int        n_waiting = count;
+    int        indices[count];
+    MPI_Status stats[count];
+    useconds_t delay = 5;
+
+    while (n_waiting) {
+        int i, ready = 0;
+        status = MPI_Testsome(count, in_progress->sf_reqs, &ready, indices, stats);
+        if (status != MPI_SUCCESS) {
+            int  len;
+            char estring[MPI_MAX_ERROR_STRING];
+            MPI_Error_string(status, estring, &len);
+            printf("[%s] MPI_ERROR! MPI_Testsome returned an error(%s)\n", __func__, estring);
+            fflush(stdout);
+            errors++;
+            return -1;
+        }
+
+        if (ready == 0) {
+            usleep(delay);
+        }
+
+        for (i = 0; i < ready; i++) {
+            n_waiting--;
+        }
+    }
+    return errors;
+}
+
+/*-------------------------------------------------------------------------
+ * Function:    Internal write__independent_async.
+ *
+ * Purpose:     The IO operations can be striped across a selection of
+ *              IO concentrators. The read and write independent calls
+ *              compute the group of 1 or more IOCs and further create
+ *              derived MPI datatypes when required by the size of the
+ *              contiguous read or write requests.
+ *
+ *              IOC(0) contains the logical data storage for file offset
+ *              zero and all offsets that reside within modulo range of
+ *              the subfiling stripe_size.
+ *
+ *              We cycle through all 'n_io_concentrators' and send a
+ *              descriptor to each IOC that has a non-zero sized IO
+ *              request to fulfill.
+ *
+ *              Sending descriptors to an IOC usually gets an ACK or
+ *              NACK in response. For the write operations, we post
+ *              async READs to receive ACKs from IOC ranks that have
+ *              allocated memory to receive the data to write to the
+ *              subfile. Upon receiving an ACK, we send the actual
+ *              user data to the IOC.
+ *
+ * Return:      Success (0) or Failure (non-zero)
+ * Errors:      If MPI operations fail for some reason.
+ *
+ * Programmer:  Richard Warren
+ *              7/17/2020
+ *
+ * Changes:     Initial Version/None.
+ *-------------------------------------------------------------------------
+ */
+#if 0 /* JRM */ /* original version */
+static int
+write__independent_async(int n_io_concentrators, hid_t context_id, int64_t offset, int64_t elements,
+                         int H5_ATTR_PARALLEL_UNUSED dtype_extent, const void *data, io_req_t **io_req)
+{
+
+    int         ack = 0, active_sends = 0, n_waiting = 0, status = 0;
+    int64_t     stripe_size, ioc_row, start_id, ioc_start, ioc_offset;
+    int *       io_concentrator = NULL;
+    io_req_t *  sf_io_request   = NULL;
+    MPI_Request ackrequest;
+    int64_t     msg[3] = {
+        0,
+    };
+
+    subfiling_context_t *sf_context = get__subfiling_object(context_id);
+    assert(sf_context != NULL);
+
+    /* Calculate the IOC that we'll send the IO request to */
+    stripe_size = sf_context->sf_stripe_size;
+
+    start_id   = offset / stripe_size;
+    ioc_row    = start_id / n_io_concentrators;
+    ioc_offset = (offset % stripe_size) + (ioc_row * stripe_size);
+    ioc_start  = start_id % n_io_concentrators;
+
+    io_concentrator = sf_context->topology->io_concentrator;
+    assert(io_concentrator != NULL);
+
+    /* Make sure that we can return a request structure
+     * if everything is working correctly
+     */
+    assert(io_req);
+
+    /* Prepare an IO request.
+     * This gets sent to the ioc identified by the file offset.
+     * (see above: Calculate the IOC)
+     */
+    msg[0] = elements;
+    msg[1] = ioc_offset;
+    msg[2] = context_id;
+#ifdef VERBOSE
+    printf("[%s ioc(%ld)] elements=%ld, offset=%ld, file_offset=%ld\n", __func__, ioc_start, elements, offset,
+           ioc_offset);
+    fflush(stdout);
+#endif
+    status = MPI_Send(msg, 3, MPI_INT64_T, io_concentrator[ioc_start], WRITE_INDEP, sf_context->sf_msg_comm);
+    if (status != MPI_SUCCESS) {
+        int  len;
+        char estring[MPI_MAX_ERROR_STRING];
+        MPI_Error_string(status, estring, &len);
+        printf("[%d] ERROR! MPI_Send of %zu bytes to %d returned an "
+               "error(%s)\n",
+               WORLD_RANK(sf_context), sizeof(msg), io_concentrator[ioc_start], estring);
+        fflush(stdout);
+        return -1;
+    }
+    else
+        active_sends++;
+    /*
+     * We wait for memory to be allocated on the target IOC so that we can
+     * start sending user data. Once memory is allocated, we will receive
+     * an ACK (or NACK) message from the IOC to allow us to proceed.
+ */ + status = MPI_Irecv(&ack, 1, MPI_INT, io_concentrator[ioc_start], WRITE_INDEP_ACK, + sf_context->sf_data_comm, &ackrequest); + + if (status != MPI_SUCCESS) { + printf("[%d %s] MPI_Irecv failed\n", WORLD_RANK(sf_context), __func__); + fflush(stdout); + return -1; + } + + n_waiting = active_sends; + + while (n_waiting) { + int flag = 0; + status = MPI_Test(&ackrequest, &flag, MPI_STATUS_IGNORE); + if (status == MPI_SUCCESS) { + if (flag == 0) + usleep(0); + else { + n_waiting--; + if (ack == 0) { /* NACK */ + printf("%s - Received NACK!\n", __func__); + } + } + } + } + + /* At this point in the new implementation, we should queue + * the async write so that when the top level VFD tells us + * to complete all pending IO requests, we have all the info + * we need to accomplish that. + */ + sf_io_request = (io_req_t *)malloc(sizeof(io_req_t)); + assert(sf_io_request); + + sf_io_request->completion_func.io_args.ioc = (int)ioc_start; + sf_io_request->completion_func.io_args.context_id = context_id; + sf_io_request->completion_func.io_args.offset = offset; + sf_io_request->completion_func.io_args.elements = elements; + sf_io_request->completion_func.io_args.data = cast_to_void(data); + sf_io_request->completion_func.io_args.io_req = MPI_REQUEST_NULL; + sf_io_request->completion_func.io_function = async_completion; + sf_io_request->completion_func.pending = 0; + + sf_io_request->prev = sf_io_request->next = NULL; + /* Start the actual data transfer */ + +#if 1 /* JRM */ /* experiment with MPI_Issend() */ + status = MPI_Isend(data, (int)elements, MPI_BYTE, io_concentrator[ioc_start], WRITE_INDEP_DATA, + sf_context->sf_data_comm, &sf_io_request->completion_func.io_args.io_req); +#else /* JRM */ +#if 1 /* JRM */ /* experiment with MPI_Send */ + status = MPI_Issend(data, (int)elements, MPI_BYTE, io_concentrator[ioc_start], WRITE_INDEP_DATA, + sf_context->sf_data_comm, &sf_io_request->completion_func.io_args.io_req); +#else /* JRM */ + status = MPI_Send(data, (int)elements, MPI_BYTE, io_concentrator[ioc_start], WRITE_INDEP_DATA, + sf_context->sf_data_comm); +#endif /* JRM */ +#endif /* JRM */ + + /* When we actually have the async IO support, + * the request should be queued before we + * return to the caller. + * Having queued the IO operation, we might want to + * get additional work started before allowing the + * queued IO requests to make further progress and/or + * to complete, so we just return to the caller. 
+ */ + + if (status == MPI_SUCCESS) { + sf_io_request->completion_func.pending = 1; + *io_req = sf_io_request; + } + else { + puts("MPI_Isend must have failed!"); + free(sf_io_request); + *io_req = NULL; + } + return status; +} /* end write__independent_async() */ + +#else /* JRM */ /* modified to use IOC supplied tag for data send */ + +static int +write__independent_async(int n_io_concentrators, hid_t context_id, int64_t offset, int64_t elements, + int H5_ATTR_PARALLEL_UNUSED dtype_extent, const void *data, io_req_t **io_req) +{ + + int ack = 0, active_sends = 0, n_waiting = 0, status = 0; + int64_t stripe_size, ioc_row, start_id, ioc_start, ioc_offset; + int * io_concentrator = NULL; + io_req_t * sf_io_request = NULL; + MPI_Request ackrequest; + int64_t msg[3] = { + 0, + }; + + subfiling_context_t *sf_context = get__subfiling_object(context_id); + assert(sf_context != NULL); + + /* Calculate the IOC that we'll send the IO request to */ + stripe_size = sf_context->sf_stripe_size; + + start_id = offset / stripe_size; + ioc_row = start_id / n_io_concentrators; + ioc_offset = (offset % stripe_size) + (ioc_row * stripe_size); + ioc_start = start_id % n_io_concentrators; + + io_concentrator = sf_context->topology->io_concentrator; + assert(io_concentrator != NULL); + + /* Make sure that we can return a request structure + * if everything is working correctly + */ + assert(io_req); + + /* Prepare an IO request. + * This gets sent to the ioc identified by the file offset. + * (see above: Calculate the IOC)) + */ + msg[0] = elements; + msg[1] = ioc_offset; + msg[2] = context_id; +#ifdef VERBOSE + printf("[%s ioc(%ld)] elements=%ld, offset=%ld, file_offset=%ld\n", __func__, ioc_start, elements, offset, + ioc_offset); + fflush(stdout); +#endif + status = MPI_Send(msg, 3, MPI_INT64_T, io_concentrator[ioc_start], WRITE_INDEP, sf_context->sf_msg_comm); + if (status != MPI_SUCCESS) { + int len; + char estring[MPI_MAX_ERROR_STRING]; + MPI_Error_string(status, estring, &len); + printf("[%d] ERROR! MPI_Send of %ld bytes to %d returned an " + "error(%s)\n", + WORLD_RANK(sf_context), sizeof(msg), io_concentrator[ioc_start], estring); + fflush(stdout); + return -1; + } + else + active_sends++; + /* + * We wait for memory to be allocated on the target IOC so that we can + * start sending user data. Once memory is allocated, we will receive + * an ACK (or NACK) message from the IOC to allow us to proceed. + */ + /* On ACK, IOC will send tag to be used for data send -- need this to + * distinguish between multiple concurrent writes from a single rank. + */ + status = MPI_Irecv(&ack, 1, MPI_INT, io_concentrator[ioc_start], WRITE_INDEP_ACK, + sf_context->sf_data_comm, &ackrequest); + + if (status != MPI_SUCCESS) { + printf("[%d %s] MPI_Irecv failed\n", WORLD_RANK(sf_context), __func__); + fflush(stdout); + return -1; + } + + n_waiting = active_sends; + + while (n_waiting) { + int flag = 0; + status = MPI_Test(&ackrequest, &flag, MPI_STATUS_IGNORE); + if (status == MPI_SUCCESS) { + if (flag == 0) + usleep(0); + else { + n_waiting--; + if (ack == 0) { /* NACK */ + printf("%s - Received NACK!\n", __func__); + } + } + } + } + + /* At this point in the new implementation, we should queue + * the async write so that when the top level VFD tells us + * to complete all pending IO requests, we have all the info + * we need to accomplish that. 
+ */ + sf_io_request = (io_req_t *)malloc(sizeof(io_req_t)); + assert(sf_io_request); + + sf_io_request->completion_func.io_args.ioc = (int)ioc_start; + sf_io_request->completion_func.io_args.context_id = context_id; + sf_io_request->completion_func.io_args.offset = offset; + sf_io_request->completion_func.io_args.elements = elements; + sf_io_request->completion_func.io_args.data = cast_to_void(data); + sf_io_request->completion_func.io_args.io_req = MPI_REQUEST_NULL; + sf_io_request->completion_func.io_function = async_completion; + sf_io_request->completion_func.pending = 0; + + sf_io_request->prev = sf_io_request->next = NULL; + /* Start the actual data transfer */ + +#if 1 /* JRM */ /* experiment with MPI_Issend() */ + /* use ack from IOC as the tag for the send */ + status = MPI_Isend(data, (int)elements, MPI_BYTE, io_concentrator[ioc_start], ack, + sf_context->sf_data_comm, &sf_io_request->completion_func.io_args.io_req); +#else /* JRM */ +#if 1 /* JRM */ /* experiment with MPI_Send */ + status = MPI_Issend(data, (int)elements, MPI_BYTE, io_concentrator[ioc_start], WRITE_INDEP_DATA, + sf_context->sf_data_comm, &sf_io_request->completion_func.io_args.io_req); +#else /* JRM */ + status = MPI_Send(data, (int)elements, MPI_BYTE, io_concentrator[ioc_start], WRITE_INDEP_DATA, + sf_context->sf_data_comm); +#endif /* JRM */ +#endif /* JRM */ + + /* When we actually have the async IO support, + * the request should be queued before we + * return to the caller. + * Having queued the IO operation, we might want to + * get additional work started before allowing the + * queued IO requests to make further progress and/or + * to complete, so we just return to the caller. + */ + + if (status == MPI_SUCCESS) { + sf_io_request->completion_func.pending = 1; + *io_req = sf_io_request; + } + else { + puts("MPI_Isend must have failed!"); + free(sf_io_request); + *io_req = NULL; + } + return status; +} /* end write__independent_async() */ + +#endif /* JRM */ /* modified to use IOC supplied tag for data send */ + +/* + * Function: H5FD__write_vector_internal + * + * Purpose: This function takes 'count' vector entries + * and initiates an asynch write operation for each. + * By asynchronous, we mean that MPI_Isends are utilized + * to communicate the write operations to the 'count' + * IO Concentrators. The calling function will have + * decomposed the actual user IO request into the + * component segments, each IO having a maximum size + * of "stripe_depth", which is recorded in the + * subfiling_context_t 'sf_context' structure. + * + * Return: SUCCEED if no errors, FAIL otherwise. + */ +herr_t +H5FD__write_vector_internal(hid_t h5_fid, hssize_t count, haddr_t addrs[], size_t sizes[], + const void *bufs[] /* in */) +{ + herr_t ret_value = SUCCEED; + hssize_t status = 0, k = 0; + hid_t sf_context_id = fid_map_to_context((uint64_t)h5_fid); + subfiling_context_t *sf_context = NULL; + io_req_t ** sf_async_reqs = NULL; + MPI_Request * active_reqs = NULL; + struct __mpi_req { + int n_reqs; + MPI_Request *active_reqs; + } *mpi_reqs = NULL; + + sf_context = get__subfiling_object(sf_context_id); + assert(sf_context != NULL); + + active_reqs = (MPI_Request *)calloc((size_t)(count + 2), sizeof(struct __mpi_req)); + assert(active_reqs); + + sf_async_reqs = (io_req_t **)calloc((size_t)count, sizeof(void *)); + assert(sf_async_reqs); + + /* + * Note: We allocated extra space in the active_requests (above). + * The extra should be enough for an integer plus a pointer. 
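+     *
+     * (Layout sketch: slots active_reqs[0 .. count-1] hold the
+     * MPI_Request handles gathered below, while the extra space starting
+     * at &active_reqs[count] is reinterpreted as the struct __mpi_req
+     * header that async_completion() receives as its argument.)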
+ */
+    mpi_reqs              = (struct __mpi_req *)&active_reqs[count];
+    mpi_reqs->n_reqs      = (int)count;
+    mpi_reqs->active_reqs = active_reqs;
+
+    /* Each pass through the following should queue an MPI write
+     * to a new IOC. Both the IOC selection and offset within the
+     * particular subfile are based on the combination of striping
+     * factors and the virtual file offset (addrs[k]).
+     */
+    for (k = 0; k < count; k++) {
+        if (sizes[k] == 0) {
+            puts("Something wrong with the size argument: size is 0!");
+            fflush(stdout);
+        }
+        status =
+            write__independent_async(sf_context->topology->n_io_concentrators, sf_context_id,
+                                     (int64_t)addrs[k], (int64_t)sizes[k], 1, bufs[k], &sf_async_reqs[k]);
+        if (status < 0) {
+            printf("%s - encountered an internal error!\n", __func__);
+            goto errors;
+        }
+        else {
+            mpi_reqs->active_reqs[k] = sf_async_reqs[k]->completion_func.io_args.io_req;
+        }
+    }
+
+    /* Here, we should have queued 'count' async requests.
+     * We can now try to complete those before returning
+     * to the caller for the next set of IO operations.
+     */
+#if 1 /* JRM */ /* experiment with synchronous send */
+    if (sf_async_reqs[0]->completion_func.io_function)
+        ret_value = (*sf_async_reqs[0]->completion_func.io_function)(mpi_reqs);
+#endif /* JRM */
+
+    if (active_reqs)
+        free(active_reqs);
+
+    if (sf_async_reqs) {
+        for (k = 0; k < count; k++) {
+            if (sf_async_reqs[k]) {
+                free(sf_async_reqs[k]);
+            }
+        }
+        free(sf_async_reqs);
+    }
+    return ret_value;
+
+errors:
+    return FAIL;
+}
+
+/*
+ * Refactored version of the original sf_read_vector() function.
+ * The H5FD__ioc_read_vector VFD call included additional 'hid_t dxpl'
+ * and 'H5FD_mem_t types[]'. These are now removed.
+ */
+herr_t
+H5FD__read_vector_internal(hid_t h5_fid, hssize_t count, haddr_t addrs[], size_t sizes[],
+                           void *bufs[] /* out */)
+{
+    herr_t               ret_value     = SUCCEED;
+    hssize_t             status = 0, k = 0;
+    hid_t                sf_context_id = fid_map_to_context((uint64_t)h5_fid);
+    subfiling_context_t *sf_context    = NULL;
+    io_req_t **          sf_async_reqs = NULL;
+    MPI_Request *        active_reqs   = NULL;
+    struct __mpi_req {
+        int          n_reqs;
+        MPI_Request *active_reqs;
+    } *mpi_reqs = NULL;
+
+    sf_context = get__subfiling_object(sf_context_id);
+    assert(sf_context != NULL);
+
+    active_reqs = (MPI_Request *)calloc((size_t)(count + 2), sizeof(struct __mpi_req));
+    assert(active_reqs);
+
+    sf_async_reqs = (io_req_t **)calloc((size_t)count, sizeof(void *));
+    assert(sf_async_reqs);
+
+    /*
+     * Note: We allocated extra space in the active_requests (above).
+     * The extra should be enough for an integer plus a pointer.
+     */
+    mpi_reqs              = (struct __mpi_req *)&active_reqs[count];
+    mpi_reqs->n_reqs      = (int)count;
+    mpi_reqs->active_reqs = active_reqs;
+
+    for (k = 0; k < count; k++) {
+        status = read__independent_async(sf_context->topology->n_io_concentrators, sf_context_id,
+                                         (int64_t)addrs[k], (int64_t)sizes[k], 1, bufs[k], &sf_async_reqs[k]);
+        if (status < 0) {
+            printf("%s - encountered an internal error!\n", __func__);
+            goto errors;
+        }
+        else {
+            mpi_reqs->active_reqs[k] = sf_async_reqs[k]->completion_func.io_args.io_req;
+        }
+    }
+    /* Here, we should have queued 'count' async requests
+     * (one to each required IOC).
+     *
+     * We can now try to complete those before returning
+     * to the caller for the next set of IO operations.
+ */ + if (sf_async_reqs[0]->completion_func.io_function) + ret_value = (*sf_async_reqs[0]->completion_func.io_function)(mpi_reqs); + + if (active_reqs) + free(active_reqs); + + if (sf_async_reqs) { + for (k = 0; k < count; k++) { + if (sf_async_reqs[k]) { + free(sf_async_reqs[k]); + } + } + free(sf_async_reqs); + } + return ret_value; + +errors: + return FAIL; +} + +#if 0 /* JRM */ /* delete this -- superseded version of sf_truncate */ +int +sf_truncate(hid_t h5_fid, haddr_t H5_ATTR_PARALLEL_UNUSED addr) +{ + hid_t sf_context_id = fid_map_to_context((uint64_t)h5_fid); + subfiling_context_t *sf_context = get__subfiling_object(sf_context_id); + + assert(sf_context != NULL); + return 0; +} +#endif /* JRM */ /* delete this */ + +#if 1 /* JRM */ /* delete this if all goes well */ +int +sf_shutdown_local_ioc(hid_t fid) +{ + hid_t context_id = fid_map_to_context((uint64_t)fid); + subfiling_context_t *sf_context = get__subfiling_object(context_id); + assert(sf_context != NULL); + if (sf_context->topology->rank_is_ioc) { + atomic_fetch_add(&sf_shutdown_flag, 1); + } + return 0; +} +#else /* JRM */ + +/*------------------------------------------------------------------------- + * Function: sf_shutdown_local_ioc() + * + * Purpose: Set the sf_shutdown_flag, and wait until the local + * I/O Concentrator shuts down. + * + * Return: Void + * + * Errors: None + * + * Programmer: JRM -- 10/26/21 + * + * Changes: None. + * + *------------------------------------------------------------------------- + */ +void +sf_shutdown_local_ioc(hid_t fid) +{ + hid_t context_id = fid_map_to_context((uint64_t)fid); + subfiling_context_t *sf_context = get__subfiling_object(context_id); + assert(sf_context != NULL); + if (sf_context->topology->rank_is_ioc) { + atomic_fetch_add(&sf_shutdown_flag, 1); + } + return; + +} /* sf_shutdown_local_ioc() */ + +#endif /* JRM */ + +#if 0 /* JRM */ /* original version of ioc_main() */ +/*------------------------------------------------------------------------- + * Function: Public/IOC ioc_main + * + * Purpose: This is the principal function run by the IO Concentrator + * main thread. It remains within a loop until allowed to + * exit by means of setting the 'sf_shutdown_flag'. This + * usually accomplished as part of the file close operation. + * + * The function implements an asynchronous polling approach + * for incoming messages. These messages can be thought of + * as a primitive RPC which utilizes MPI TAGs to code and + * implement the desired subfiling functionality. + * + * As each incoming message is received, it get added to + * a queue for processing by a thread_pool thread. + * The message handlers are dispatched via the + * "handle_work_request" ftn (see H5FDsubfile_thread.c) + + * Subfiling is effectively a software RAID-0 implementation + * where having multiple IO Concentrators and independent + * subfiles is equated to the multiple disks and a true + * hardware base RAID implementation. + * + * IO Concentrators are ordered according to their MPI rank. + * In the simplest interpretation, IOC(0) will always contain + * the initial bytes of the logical disk image. Byte 0 of + * IOC(1) will contain the byte written to the logical disk + * offset "stripe_size" X IOC(number). + * + * Example: If the stripe size is defined to be 256K, then + * byte 0 of subfile(1) is at logical offset 262144 of the + * file. Similarly, byte 0 of subfile(2) represents the + * logical file offset = 524288. 
For logical files larger + * than 'N' X stripe_size, we simply "wrap around" back to + * subfile(0). The following shows the mapping of 30 + * logical blocks of data over 3 subfiles: + * +--------+--------+--------+--------+--------+--------+ + * | blk(0 )| blk(1) | blk(2 )| blk(3 )| blk(4 )| blk(5 )| + * | IOC(0) | IOC(1) | IOC(2) | IOC(0) | IOC(1) | IOC(2) | + * +--------+--------+--------+--------+--------+--------+ + * | blk(6 )| blk(7) | blk(8 )| blk(9 )| blk(10)| blk(11)| + * | IOC(0) | IOC(1) | IOC(2) | IOC(0) | IOC(1) | IOC(2) | + * +--------+--------+--------+--------+--------+--------+ + * | blk(12)| blk(13)| blk(14)| blk(15)| blk(16)| blk(17)| + * | IOC(0) | IOC(1) | IOC(2) | IOC(0) | IOC(1) | IOC(2) | + * +--------+--------+--------+--------+--------+--------+ + * | blk(18)| blk(19)| blk(20)| blk(21)| blk(22)| blk(23)| + * | IOC(0) | IOC(1) | IOC(2) | IOC(0) | IOC(1) | IOC(2) | + * +--------+--------+--------+--------+--------+--------+ + * | blk(24)| blk(25)| blk(26)| blk(27)| blk(28)| blk(29)| + * | IOC(0) | IOC(1) | IOC(2) | IOC(0) | IOC(1) | IOC(2) | + * +--------+--------+--------+--------+--------+--------+ + * + * Return: None + * Errors: None + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. + *------------------------------------------------------------------------- + */ +int +ioc_main(int64_t context_id) +{ + int subfile_rank; + int flag, ret; + int max_work_depth; + int shutdown_requested; + MPI_Status status, msg_status; + sf_work_request_t * incoming_requests = NULL; + useconds_t delay = 20; + subfiling_context_t *context = get__subfiling_object(context_id); + double queue_start_time; + + assert(context != NULL); + /* We can't have opened any files at this point.. + * The file open approach has changed so that the normal + * application rank (hosting this thread) does the file open. + * We can simply utilize the file descriptor (which should now + * represent an open file). + */ + + subfile_rank = context->sf_group_rank; + + if (request_count_per_rank == NULL) { + request_count_per_rank = (int *)calloc((size_t)WORLD_SIZE(context), sizeof(int)); + assert(request_count_per_rank != NULL); + } + + max_work_depth = MAX(8, WORLD_SIZE(context) * MAX_WORK_PER_RANK); + incoming_requests = (sf_work_request_t *)calloc((size_t)(max_work_depth + 1), sizeof(sf_work_request_t)); + + /* Validate that the allocation succeeded */ + assert(incoming_requests != NULL); + + /* Initialize atomic vars */ + atomic_init(&sf_workinprogress, 0); + atomic_init(&sf_work_pending, 0); + atomic_init(&sf_file_close_count, 0); + atomic_init(&sf_file_refcount, 0); + atomic_init(&sf_ioc_fini_refcount, 0); + atomic_init(&sf_shutdown_flag, 0); + atomic_init(&sf_ioc_ready, 1); +#if 1 /* JRM */ + /* this variable is incremented by tpool_add_work(), and decremented when the + * received I/O request is completed. + * + * On shutdown, we must wait until this field is decremented to zero before + * taking down the thread pool. + */ + atomic_init(&sf_io_ops_pending, 0); +#endif /* JRM */ + shutdown_requested = 0; + +#if 0 /* JRM */ + while (!shutdown_requested || sf_work_pending) { +#else /* JRM */ + while ( ( ! 
shutdown_requested ) || ( 0 < atomic_load(&sf_io_ops_pending) ) || sf_work_pending) { +#endif /* JRM */ + flag = 0; + ret = MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, context->sf_msg_comm, &flag, &status); + if ((ret == MPI_SUCCESS) && (flag != 0)) { + sf_work_request_t *msg = NULL; + int count; + int index = 0; + int request_size = (int)sizeof(sf_work_request_t); + int source = status.MPI_SOURCE; + int tag = status.MPI_TAG; + +#if 1 /* JRM */ + if ( ( tag != READ_INDEP ) && ( tag != WRITE_INDEP ) ) { + + HDprintf("\n\nioc_main: received non READ_INDEP / WRITE_INDEP mssg. tag = %d.\n\n", tag); + HDfflush(stdout); + } +#endif /* JRM */ + + MPI_Get_count(&status, MPI_BYTE, &count); + if (count > request_size) { + msg = (sf_work_request_t *)malloc((size_t)count); + ret = MPI_Recv(msg, count, MPI_BYTE, source, tag, context->sf_msg_comm, &msg_status); + } + else { + index = atomic_load(&sf_workinprogress); + ret = MPI_Recv(&incoming_requests[index], count, MPI_BYTE, source, tag, context->sf_msg_comm, + &msg_status); + if (MPI_SUCCESS == ret) { + int howmany = 0; + MPI_Get_count(&msg_status, MPI_BYTE, &howmany); + if (howmany != count) { + printf("%s: MPI_Recv completed %d bytes of %d\n", __func__, howmany, count); + fflush(stdout); + } + } + } + queue_start_time = MPI_Wtime(); + if (ret == MPI_SUCCESS) { + if (msg) { + printf("%s: non-std msg=(%p) from %d\n", __func__, (void *)msg, source); + fflush(stdout); + + msg->source = source; + msg->subfile_rank = subfile_rank; + msg->context_id = context->sf_context_id; + msg->start_time = queue_start_time; + tpool_add_work(msg); + } + else { + incoming_requests[index].tag = tag; + incoming_requests[index].source = source; + incoming_requests[index].subfile_rank = subfile_rank; + incoming_requests[index].start_time = queue_start_time; + incoming_requests[index].buffer = NULL; + tpool_add_work(&incoming_requests[index]); + if (index == max_work_depth - 1) { + atomic_init(&sf_workinprogress, 0); + } + else { + atomic_fetch_add(&sf_workinprogress, 1); // atomic + } + } + } + } + else { + usleep(delay); + } + shutdown_requested = atomic_load(&sf_shutdown_flag); + } + + if (incoming_requests) { + free(incoming_requests); + } + + /* Reset the shutdown flag */ + atomic_init(&sf_shutdown_flag, 0); + + return 0; +} + +#else /* JRM */ /* re-written version of ioc_main() */ + +int +ioc_main(int64_t context_id) +{ + int subfile_rank; + int flag, ret; + int max_work_depth; + int shutdown_requested; + MPI_Status status, msg_status; + sf_work_request_t wk_req; + useconds_t delay = 20; + subfiling_context_t *context = get__subfiling_object(context_id); + double queue_start_time; + +#if 0 /* JRM */ + HDfprintf(stdout, "\n\nioc_main: entering.\n\n"); + HDfflush(stdout); +#endif /* JRM */ + + assert(context != NULL); + /* We can't have opened any files at this point.. + * The file open approach has changed so that the normal + * application rank (hosting this thread) does the file open. + * We can simply utilize the file descriptor (which should now + * represent an open file). + */ + + subfile_rank = context->sf_group_rank; + + /* zero out the wk_req, since the received message will typically be smaller + * than sizeof(sf_work_request_t). + */ + HDmemset(&wk_req, 0, sizeof(sf_work_request_t)); + + /* Initialize atomic vars */ + /* JRM */ /* delete most of these? 
*/ + atomic_init(&sf_workinprogress, 0); +#if 1 /* JRM */ + atomic_init(&sf_work_pending, 0); +#endif /* JRM */ + atomic_init(&sf_file_close_count, 0); + atomic_init(&sf_file_refcount, 0); + atomic_init(&sf_ioc_fini_refcount, 0); + atomic_init(&sf_shutdown_flag, 0); +#if 1 /* JRM */ + /* this variable is incremented by H5FD_ioc__queue_io_q_entry() when work + * is added to the I/O request queue, and decremented by H5FD_ioc__complete_io_q_entry() + * when an I/O request is completed and removed from the queue.. + * + * On shutdown, we must wait until this field is decremented to zero before + * taking down the thread pool. + * + * Note that this is a convenience variable -- we could use io_queue_g.q_len instead. + * However, accessing this field requires locking io_queue_g.q_mutex. + */ +#if 0 /* JRM */ + HDfprintf(stdout, "\n\nioc_main: setting sf_io_ops_pending to zero. sf_io_ops_pending = %d.\n\n", + atomic_load(&sf_io_ops_pending)); + HDfflush(stdout); +#endif /* JRM */ + atomic_init(&sf_io_ops_pending, 0); +#endif /* JRM */ + /* tell initialize_ioc_threads() that ioc_main() is ready to enter its main loop */ + atomic_init(&sf_ioc_ready, 1); + shutdown_requested = 0; + + while ((!shutdown_requested) || (0 < atomic_load(&sf_io_ops_pending)) +#if 1 /* JRM */ + || (0 < atomic_load(&sf_work_pending)) +#endif /* JRM */ + ) { + flag = 0; + ret = MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, context->sf_msg_comm, &flag, &status); + if ((ret == MPI_SUCCESS) && (flag != 0)) { + sf_work_request_t *msg = NULL; + int count; + int index = 0; + int request_size = (int)sizeof(sf_work_request_t); + int source = status.MPI_SOURCE; + int tag = status.MPI_TAG; + +#if 1 /* JRM */ + if ((tag != READ_INDEP) && (tag != WRITE_INDEP) && (tag != TRUNC_OP) && (tag != GET_EOF_OP)) { + + HDprintf("\n\nioc_main: received non READ_INDEP / WRITE_INDEP / TRUNC_OP / GET_EOF_OP mssg. " + "tag = %d.\n\n", + tag); + HDfflush(stdout); + } +#endif /* JRM */ + + MPI_Get_count(&status, MPI_BYTE, &count); + + /* convert this assert to a proper error message once we decide how to handle error + * reporting from the I/O concentrator. + */ + HDassert(count <= sizeof(sf_work_request_t)); + + /* zero out the wk_req, since the received message will typically be smaller + * than sizeof(sf_work_request_t). 
+ */ + HDmemset(&wk_req, 0, sizeof(sf_work_request_t)); + + ret = MPI_Recv(&wk_req, count, MPI_BYTE, source, tag, context->sf_msg_comm, &msg_status); + + if (MPI_SUCCESS == ret) { + + int howmany = 0; + + MPI_Get_count(&msg_status, MPI_BYTE, &howmany); + + if (howmany != count) { + printf("%s: MPI_Recv completed %d bytes of %d\n", __func__, howmany, count); + fflush(stdout); + } + } + + queue_start_time = MPI_Wtime(); + + if (ret == MPI_SUCCESS) { + + int curr_io_ops_pending; + + wk_req.tag = tag; + wk_req.source = source; + wk_req.subfile_rank = subfile_rank; + wk_req.start_time = queue_start_time; + wk_req.buffer = NULL; + + H5FD_ioc__queue_io_q_entry(&wk_req); + + HDassert(atomic_load(&sf_io_ops_pending) >= 0); + + H5FD_ioc__dispatch_elegible_io_q_entries(); + } + } + else { + usleep(delay); + } + shutdown_requested = atomic_load(&sf_shutdown_flag); + } + + /* Reset the shutdown flag */ + atomic_init(&sf_shutdown_flag, 0); + +#if 0 /* JRM */ + HDfprintf(stdout, "\n\nioc_main: exiting.\n\n"); + HDfflush(stdout); +#endif /* JRM */ + + return 0; + +} /* ioc_main() */ + +#endif /* JRM */ /* re-written version of ioc_main() */ + +/* +========================================= +Private helper functions +========================================= +*/ + +#if 0 /* JRM */ /* original version */ +static int +send_ack__(int target, int subfile_rank, int tag, MPI_Comm comm) +{ + int ack = 1; + int ret = MPI_Send(&ack, 1, MPI_INT, target, tag, comm); +#ifndef NDEBUG + if (sf_verbose_flag) { + if (sf_logfile) { + fprintf(sf_logfile, "[ioc(%d): Sending ACK to MPI_rank(%d)\n", subfile_rank, target); + } + } +#endif + return ret; +} +#else /* JRM */ /* version modified to send expected data send tag */ + +static int +send_ack__(int target, int subfile_rank, int tag, MPI_Comm comm, int ack) +{ + + HDassert(ack > 0); + + int ret = MPI_Send(&ack, 1, MPI_INT, target, tag, comm); +#ifndef NDEBUG + if (sf_verbose_flag) { + if (sf_logfile) { + fprintf(sf_logfile, "[ioc(%d): Sending ACK to MPI_rank(%d)\n", subfile_rank, target); + } + } +#endif + return ret; + +} /* send_ack__() */ + +#endif /* JRM */ /* version modified to send expected data send tag */ + +static int +send_nack__(int target, int subfile_rank, int tag, MPI_Comm comm) +{ + int nack = 0; + int ret = MPI_Send(&nack, 1, MPI_INT, target, tag, comm); + +#ifndef NDEBUG + if (sf_verbose_flag) { + if (sf_logfile) { + fprintf(sf_logfile, "[ioc(%d): Sending NACK to MPI_rank(%d)\n", subfile_rank, target); + } + } +#endif + return ret; +} + +/* +========================================= +queue_xxx functions that should be run +from the thread pool threads... +========================================= +*/ + +/*------------------------------------------------------------------------- + * Function: Public/IOC queue_write_indep + * + * Purpose: Implement the IOC independent write function. The + * function is invoked as a result of the IOC receiving the + * "header"/RPC. What remains is to allocate memory for the + * data sent by the client and then write the data to our + * subfile. We utilize pwrite for the actual file writing. + * File flushing is done at file close. + * + * Return: The integer status returned by the Internal read_independent + * function. Successful operations will return 0. + * Errors: An MPI related error value. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. 
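+ *
+ *              Sketch of the exchange as implemented below: the IOC
+ *              allocates a receive buffer of 'data_size' bytes, sends
+ *              an ACK back to the requesting rank (in the modified
+ *              version, the ACK carries the tag the client must use for
+ *              its data send), receives the data with a matching
+ *              MPI_Recv, and then writes it to the subfile at
+ *              'file_offset' via sf_write_data().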
+ * + *------------------------------------------------------------------------- + */ +#if 0 /* JRM */ /* original version */ +int +queue_write_indep(sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm) +{ + int fd; +#if 1 /* JRM */ + int actual_bytes_received; +#endif /* JRM */ + char * recv_buffer = NULL; + int ret = MPI_SUCCESS; + MPI_Status msg_status; + int64_t data_size = msg->header[0]; + int64_t file_offset = msg->header[1]; + int64_t file_context_id = msg->header[2]; + double t_start, t_end; + double t_write, t_wait, t_queue_delay; + subfiling_context_t *sf_context = get__subfiling_object(file_context_id); + int64_t stripe_id = file_offset + data_size; + haddr_t sf_eof; + assert(sf_context != NULL); + + sf_eof = (haddr_t)(stripe_id % sf_context->sf_stripe_size); + stripe_id /= sf_context->sf_stripe_size; + sf_eof += (haddr_t)((stripe_id * sf_context->sf_blocksize_per_stripe) + sf_context->sf_base_addr); + + /* flag that we've attempted to write data to the file */ + sf_context->sf_write_count++; + /* For debugging performance */ + sf_write_ops++; + + t_start = MPI_Wtime(); + t_queue_delay = t_start - msg->start_time; + +#ifndef NDEBUG + if (sf_verbose_flag) { + if (sf_logfile) { + fprintf(sf_logfile, + "[ioc(%d) %s]: msg from %d: datasize=%ld\toffset=%ld, " + "queue_delay = %lf seconds\n", + subfile_rank, __func__, source, data_size, file_offset, t_queue_delay); + } + } +#endif + + if (recv_buffer == NULL) { + if ((recv_buffer = (char *)malloc((size_t)data_size)) == NULL) { + perror("malloc"); + send_nack__(source, subfile_rank, WRITE_INDEP_ACK, comm); + return -1; + } + } + + send_ack__(source, subfile_rank, WRITE_INDEP_ACK, comm); + ret = MPI_Recv(recv_buffer, (int)data_size, MPI_BYTE, source, WRITE_INDEP_DATA, comm, &msg_status); + +#if 1 /* JRM */ + if ( MPI_SUCCESS != MPI_Get_count(&msg_status, MPI_BYTE, &actual_bytes_received) ) { + + HDprintf("\n\nqueue_write_indep(): can't get actual bytes receive.\n\n"); + HDfflush(stdout); + + } else if ( actual_bytes_received != data_size ) { + + HDprintf("\n\nqueue_write_indep(): message size mismatch -- expected = %ld, actual = %d.\n\n", + data_size, actual_bytes_received); + HDfflush(stdout); + + } +#endif /* JRM */ + + t_end = MPI_Wtime(); + t_wait = t_end - t_start; + sf_write_wait_time += t_wait; + t_start = t_end; +#ifndef NDEBUG + if (sf_verbose_flag) { + if (sf_logfile) { + fprintf(sf_logfile, "[ioc(%d) %s] MPI_Recv(%ld bytes, from = %d) status = %d\n", subfile_rank, + __func__, data_size, source, ret); + } + } +#endif + + if (ret != MPI_SUCCESS) { + int len; + char estring[MPI_MAX_ERROR_STRING]; + MPI_Error_string(ret, estring, &len); + printf("[ioc(%d) %s] MPI_ERROR(%d)! MPI_Recv of %ld bytes from %d " + "returned an error(%s)\n", + subfile_rank, __func__, msg_status.MPI_ERROR, data_size, source, estring); + fflush(stdout); + return ret; + } + + if (msg->serialize) + ioc__wait_for_serialize(msg); + + fd = sf_context->sf_fid; + + if (fd < 0) { + printf("[ioc(%d)] WARNING: %s called while subfile_fid = %d (closed)\n", subfile_rank, __func__, fd); + fflush(stdout); + } + else { + if (sf_write_data(fd, file_offset, recv_buffer, data_size, subfile_rank) < 0) { + free(recv_buffer); + recv_buffer = NULL; + printf("[ioc(%d) %s] sf_write_data returned an error!\n", subfile_rank, __func__); + fflush(stdout); + return -1; + } + t_end = MPI_Wtime(); + t_write = t_end - t_start; + sf_pwrite_time += t_write; + } + + sf_queue_delay_time += t_queue_delay; + + /* Done... 
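+     *
+     * Editor's note: a worked example of the sf_eof computation at the top of
+     * this function, under assumed values (sf_stripe_size = 1 MiB, 4 IOCs so
+     * sf_blocksize_per_stripe = 4 MiB, subfile_rank = 2 so sf_base_addr =
+     * 2 MiB) and with file_offset + data_size = 9 MiB + 100:
+     *
+     *     stripe_id = 9 MiB + 100
+     *     sf_eof    = stripe_id % 1 MiB       =          100
+     *     stripe_id = stripe_id / 1 MiB       =            9
+     *     sf_eof   += 9 * 4 MiB + 2 MiB       = 38 MiB + 100
+     *
+     * i.e. the end of this write maps to offset 38 MiB + 100 within this
+     * IOC's subfile.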
*/ + if (sf_eof > sf_context->sf_eof) + sf_context->sf_eof = sf_eof; + +#ifdef VERBOSE + printf("[ioc(%d)] %s local sf_eof = %ld sf_context=%p\n", subfile_rank, __func__, sf_context->sf_eof, + (void *)sf_context); + fflush(stdout); +#endif + if (recv_buffer) { + free(recv_buffer); + } + return 0; +} + +#else /* JRM */ /* version modified for new dispatch code */ + +int +queue_write_indep(sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm, int counter) +{ + int fd; +#if 1 /* JRM */ + int actual_bytes_received; +#endif /* JRM */ + int rcv_tag = ((counter & 0xFFFF) << 12) | WRITE_INDEP_DATA; + char * recv_buffer = NULL; + int ret = MPI_SUCCESS; + MPI_Status msg_status; + int64_t data_size = msg->header[0]; + int64_t file_offset = msg->header[1]; + int64_t file_context_id = msg->header[2]; + double t_start, t_end; + double t_write, t_wait, t_queue_delay; + subfiling_context_t *sf_context = get__subfiling_object(file_context_id); + int64_t stripe_id = file_offset + data_size; + haddr_t sf_eof; + assert(sf_context != NULL); + + sf_eof = (haddr_t)(stripe_id % sf_context->sf_stripe_size); + stripe_id /= sf_context->sf_stripe_size; + sf_eof += (haddr_t)((stripe_id * sf_context->sf_blocksize_per_stripe) + sf_context->sf_base_addr); + + /* flag that we've attempted to write data to the file */ + sf_context->sf_write_count++; + /* For debugging performance */ + sf_write_ops++; + + t_start = MPI_Wtime(); + t_queue_delay = t_start - msg->start_time; + +#ifndef NDEBUG + if (sf_verbose_flag) { + if (sf_logfile) { + fprintf(sf_logfile, + "[ioc(%d) %s]: msg from %d: datasize=%ld\toffset=%ld, " + "queue_delay = %lf seconds\n", + subfile_rank, __func__, source, data_size, file_offset, t_queue_delay); + } + } +#endif + + if (recv_buffer == NULL) { + if ((recv_buffer = (char *)malloc((size_t)data_size)) == NULL) { + perror("malloc"); + send_nack__(source, subfile_rank, WRITE_INDEP_ACK, comm); + return -1; + } + } + + send_ack__(source, subfile_rank, WRITE_INDEP_ACK, comm, rcv_tag); + ret = MPI_Recv(recv_buffer, (int)data_size, MPI_BYTE, source, rcv_tag, comm, &msg_status); + +#if 1 /* JRM */ + if (MPI_SUCCESS != MPI_Get_count(&msg_status, MPI_BYTE, &actual_bytes_received)) { + + HDprintf("\n\nqueue_write_indep(): can't get actual bytes receive.\n\n"); + HDfflush(stdout); + } + else if (actual_bytes_received != data_size) { + + HDprintf("\n\nqueue_write_indep(): message size mismatch -- expected = %ld, actual = %d.\n\n", + data_size, actual_bytes_received); + HDfflush(stdout); + } +#endif /* JRM */ + + t_end = MPI_Wtime(); + t_wait = t_end - t_start; + sf_write_wait_time += t_wait; + t_start = t_end; +#ifndef NDEBUG + if (sf_verbose_flag) { + if (sf_logfile) { + fprintf(sf_logfile, "[ioc(%d) %s] MPI_Recv(%ld bytes, from = %d) status = %d\n", subfile_rank, + __func__, data_size, source, ret); + } + } +#endif + + if (ret != MPI_SUCCESS) { + int len; + char estring[MPI_MAX_ERROR_STRING]; + MPI_Error_string(ret, estring, &len); + printf("[ioc(%d) %s] MPI_ERROR(%d)! 
MPI_Recv of %ld bytes from %d " + "returned an error(%s)\n", + subfile_rank, __func__, msg_status.MPI_ERROR, data_size, source, estring); + fflush(stdout); + return ret; + } + + fd = sf_context->sf_fid; + + if (fd < 0) { + printf("[ioc(%d)] WARNING: %s called while subfile_fid = %d (closed)\n", subfile_rank, __func__, fd); + fflush(stdout); + } + else { + if (sf_write_data(fd, file_offset, recv_buffer, data_size, subfile_rank) < 0) { + free(recv_buffer); + recv_buffer = NULL; + printf("[ioc(%d) %s] sf_write_data returned an error!\n", subfile_rank, __func__); + fflush(stdout); + return -1; + } + t_end = MPI_Wtime(); + t_write = t_end - t_start; + sf_pwrite_time += t_write; + } + + sf_queue_delay_time += t_queue_delay; + + /* Done... */ + if (sf_eof > sf_context->sf_eof) + sf_context->sf_eof = sf_eof; + +#ifdef VERBOSE + printf("[ioc(%d)] %s local sf_eof = %ld sf_context=%p\n", subfile_rank, __func__, sf_context->sf_eof, + (void *)sf_context); + fflush(stdout); +#endif + if (recv_buffer) { + free(recv_buffer); + } + return 0; + +} /* queue_write_indep() */ + +#endif /* JRM */ /* version modified for new dispatch code */ + +/*------------------------------------------------------------------------- + * Function: Public/IOC queue_read_indep + * + * Purpose: Implement the IOC independent read function. The + * function is invoked as a result of the IOC receiving the + * "header"/RPC. What remains is to allocate memory for + * reading the data and then to send this to the client. + * We utilize pread for the actual file reading. + * + * Return: The integer status returned by the Internal read_independent + * function. Successful operations will return 0. + * Errors: An MPI related error value. + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. + * + *------------------------------------------------------------------------- + */ +#if 0 /* JRM */ /* original version */ +int +queue_read_indep(sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm) +{ + int fd; + char * send_buffer = NULL; + int ret = MPI_SUCCESS; + int64_t data_size = msg->header[0]; + int64_t file_offset = msg->header[1]; + int64_t file_context_id = msg->header[2]; + double t_start, t_end; + double t_read, t_queue_delay; + + subfiling_context_t *sf_context = get__subfiling_object(file_context_id); + assert(sf_context != NULL); + + sf_context->sf_read_count++; + /* For debugging performance */ + sf_read_ops++; + + t_start = MPI_Wtime(); + t_queue_delay = t_start - msg->start_time; + + fd = sf_context->sf_fid; + if (fd < 0) { + printf("[ioc(%d) %s] subfile(%d) file descriptor not valid\n", subfile_rank, __func__, fd); + return -1; + } + +#ifndef NDEBUG + if (sf_verbose_flag && (sf_logfile != NULL)) { + fprintf(sf_logfile, + "[ioc(%d) %s] msg from %d: datasize=%ld\toffset=%ld " + "queue_delay=%lf seconds\n", + subfile_rank, __func__, source, data_size, file_offset, t_queue_delay); + } +#endif + if ((send_buffer = (char *)malloc((size_t)data_size)) == NULL) { + perror("malloc"); + return -1; + } + + if (sf_read_data(fd, file_offset, send_buffer, data_size, subfile_rank) < 0) { + printf("[%d] %s - sf_read_data fd=%d for source(%d) returned an error!\n", subfile_rank, __func__, fd, + source); + fflush(stdout); + /* + * Should send a zero(0) byte message to the client to prevent + * it from hanging... 
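+         *
+         * Editor's note: on the client side, the zero-byte sentinel can be
+         * detected with MPI_Get_count (a sketch; the real client code lives
+         * elsewhere in this patch):
+         *
+         *     MPI_Status st;
+         *     int        n = 0;
+         *     MPI_Recv(buf, (int)size, MPI_BYTE, ioc, READ_INDEP_DATA,
+         *              comm, &st);
+         *     MPI_Get_count(&st, MPI_BYTE, &n);
+         *     if (n == 0)
+         *         return -1;   -- the IOC failed to read; treat as I/O error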
+ */ + MPI_Send(send_buffer, 0, MPI_BYTE, source, READ_INDEP_DATA, comm); + free(send_buffer); + return -1; + } + + ret = MPI_Send(send_buffer, (int)data_size, MPI_BYTE, source, READ_INDEP_DATA, comm); + if (ret != MPI_SUCCESS) { + int len; + char estring[MPI_MAX_ERROR_STRING]; + MPI_Error_string(ret, estring, &len); + printf("[ioc(%d)] ERROR! MPI_Send of %ld bytes to %d returned an " + "error(%s)\n", + subfile_rank, data_size, source, estring); + fflush(stdout); + return ret; + } + t_end = MPI_Wtime(); + t_read = t_end - t_start; + sf_pread_time += t_read; + sf_queue_delay_time += t_queue_delay; + +#ifndef NDEBUG + if (sf_verbose_flag && (sf_logfile != NULL)) { + fprintf(sf_logfile, "[ioc(%d)] MPI_Send to source(%d) completed\n", subfile_rank, source); + } +#endif + + if (send_buffer) { + free(send_buffer); + send_buffer = NULL; + } + + return 0; +} /* end queue_read_indep() */ + +#else /* JRM */ /* version modified for new dispatch code */ + +int +queue_read_indep(sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm) +{ + int fd; + char * send_buffer = NULL; + int ret = MPI_SUCCESS; + int64_t data_size = msg->header[0]; + int64_t file_offset = msg->header[1]; + int64_t file_context_id = msg->header[2]; + double t_start, t_end; + double t_read, t_queue_delay; + + subfiling_context_t *sf_context = get__subfiling_object(file_context_id); + assert(sf_context != NULL); + + sf_context->sf_read_count++; + /* For debugging performance */ + sf_read_ops++; + + t_start = MPI_Wtime(); + t_queue_delay = t_start - msg->start_time; + + fd = sf_context->sf_fid; + if (fd < 0) { + printf("[ioc(%d) %s] subfile(%d) file descriptor not valid\n", subfile_rank, __func__, fd); + return -1; + } + +#ifndef NDEBUG + if (sf_verbose_flag && (sf_logfile != NULL)) { + fprintf(sf_logfile, + "[ioc(%d) %s] msg from %d: datasize=%ld\toffset=%ld " + "queue_delay=%lf seconds\n", + subfile_rank, __func__, source, data_size, file_offset, t_queue_delay); + } +#endif + if ((send_buffer = (char *)malloc((size_t)data_size)) == NULL) { + perror("malloc"); + return -1; + } + + if (sf_read_data(fd, file_offset, send_buffer, data_size, subfile_rank) < 0) { + printf("[%d] %s - sf_read_data fd=%d for source(%d) returned an error!\n", subfile_rank, __func__, fd, + source); + fflush(stdout); + /* + * Should send a zero(0) byte message to the client to prevent + * it from hanging... + */ + MPI_Send(send_buffer, 0, MPI_BYTE, source, READ_INDEP_DATA, comm); + free(send_buffer); + return -1; + } + + ret = MPI_Send(send_buffer, (int)data_size, MPI_BYTE, source, READ_INDEP_DATA, comm); + if (ret != MPI_SUCCESS) { + int len; + char estring[MPI_MAX_ERROR_STRING]; + MPI_Error_string(ret, estring, &len); + printf("[ioc(%d)] ERROR! 
MPI_Send of %ld bytes to %d returned an "
+               "error(%s)\n",
+               subfile_rank, data_size, source, estring);
+        fflush(stdout);
+        return ret;
+    }
+    t_end = MPI_Wtime();
+    t_read = t_end - t_start;
+    sf_pread_time += t_read;
+    sf_queue_delay_time += t_queue_delay;
+
+#ifndef NDEBUG
+    if (sf_verbose_flag && (sf_logfile != NULL)) {
+        fprintf(sf_logfile, "[ioc(%d)] MPI_Send to source(%d) completed\n", subfile_rank, source);
+    }
+#endif
+
+    if (send_buffer) {
+        free(send_buffer);
+        send_buffer = NULL;
+    }
+
+    return 0;
+} /* end queue_read_indep() */
+
+#endif /* JRM */ /* version modified for new dispatch code */
+
+/* ---------------------------------------------------
+ * Helper function for subfiling_open_file(); see below.
+ * Subfiles should be located in the same directory
+ * as the HDF5 file unless the user has provided
+ * an alternate directory name as indicated by the
+ * sf_context->subfile_prefix argument.
+ * ---------------------------------------------------*/
+static void
+get__subfile_name(subfiling_context_t *sf_context, int64_t h5_file_id, int subfile_rank, char **_basename,
+                  char **_subfile_dir, char *filepath)
+{
+    char *prefix = NULL, *subfile_dir = NULL;
+    char *base = NULL;
+    int   n_io_concentrators = sf_context->topology->n_io_concentrators;
+
+    /* We require this to be non-null */
+    HDassert(sf_context);
+
+    prefix = (char *)malloc(PATH_MAX);
+    HDassert(prefix);
+
+    /* Under normal operation, we co-locate subfiles
+     * with the HDF5 file
+     */
+    strcpy(prefix, sf_context->h5_filename);
+    base       = basename(prefix);
+    *_basename = strdup(base);
+
+    if (sf_context->subfile_prefix == NULL) {
+        subfile_dir   = dirname(prefix);
+        *_subfile_dir = strdup(subfile_dir);
+    }
+    else {
+        /* Note: Users may specify a directory name which is inaccessible
+         * from where the current application is running. In particular,
+         * "node-local" storage is not uniformly available to all processes.
+         * We would like to check whether the user's pathname is unavailable
+         * and, if so, default to creating the subfiles in the current
+         * directory. (?)
+         */
+        *_subfile_dir = strdup(sf_context->subfile_prefix);
+    }
+
+    /* The subfile naming should produce files of the following form:
+     * If we assume the HDF5 file is named ABC.h5, then subfiles
+     * will have names:
+     *   ABC.h5.subfile_<fid>_00_of_20,
+     *   ABC.h5.subfile_<fid>_01_of_20, and
+     *   ABC.h5.subfile_<fid>.config
+     */
+    int numD = numDigits(n_io_concentrators);
+    sprintf(filepath, "%s/%s" SF_FILENAME_TEMPLATE, subfile_dir, base, h5_file_id, numD, subfile_rank,
+            n_io_concentrators);
+    if (prefix)
+        HDfree(prefix);
+}
+
+/*-------------------------------------------------------------------------
+ * Function:    Public/IOC subfiling_open_file
+ *
+ * Purpose:     This function gets called when a client invokes an OPEN_OP.
+ *              The HDF5 file opening protocol first attempts to open the
+ *              file without truncate or any other flags that would modify
+ *              the file state if it already exists. The file is then closed
+ *              and reopened with the user-supplied open flags. The OPEN_OP
+ *              provides the user flags as part of the RPC message. The file
+ *              prefix info isn't transmitted as part of the RPC since it is
+ *              available as part of the client context, which can be
+ *              utilized by the IOC thread. We access the sf_context by
+ *              reading the cache of contexts at the index provided with the
+ *              RPC msg.
+ *
+ * Return:      The integer status returned by the Internal read_independent
+ *              function. Successful operations will return 0.
+ * Errors:      An MPI related error value.
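+ *
+ * Editor's note: as consumed below, the OPEN_OP RPC payload is laid out as
+ * follows (a reading of this function's code, not normative documentation):
+ *
+ *     msg->header[1]  ->  h5_file_id (inode number of the HDF5 stub file)
+ *     msg->header[2]  ->  subfiling context ID (index into the context cache)
+ *     flags           ->  POSIX open(2) flags forwarded from the client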
+ * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. + * + *------------------------------------------------------------------------- + */ +int +subfiling_open_file(sf_work_request_t *msg, int subfile_rank, int flags) +{ + int errors = 0; + char filepath[PATH_MAX]; + char linebuf[PATH_MAX]; + + char * temp = NULL; + char * prefix = NULL; + char * subfile_dir = NULL; + char * base = NULL; + mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH; + + double t_start = 0.0, t_end = 0.0; + /* Only the real IOCs open the subfiles + * Once a file is opened, all additional file open requests + * can return immediately. + */ + + t_start = MPI_Wtime(); + /* Only allow the actual IO concentrator ranks to create sub-files */ + if (subfile_rank >= 0) { + int k, retries = 2; + int64_t h5_file_id = msg->header[1]; + int64_t file_context_id = msg->header[2]; + subfiling_context_t *sf_context = get__subfiling_object(file_context_id); + assert(sf_context != NULL); + + memset(filepath, 0, PATH_MAX); + + begin_thread_exclusive(); + /* Check to see whether we need to create the subfile + * and possibly (IFF our subfile_rank is 0) a config file. + */ + + get__subfile_name(sf_context, h5_file_id, subfile_rank, &base, &subfile_dir, filepath); + sf_context->sf_filename = strdup(filepath); + + assert(sf_context->sf_filename); + + /* Check if we need to create the subfiles */ + if (sf_context->sf_fid == -2) { + int n_io_concentrators = sf_context->topology->n_io_concentrators; + int *io_concentrator = sf_context->topology->io_concentrator; + for (k = 0; k < retries; k++) { + int fd; + if ((fd = HDopen(filepath, O_CREAT | O_RDWR | O_TRUNC, mode)) > 0) { + sf_context->sf_fid = fd; + sf_context->sf_eof = 0; + break; + } + } + if (sf_context->sf_fid < 0) { + end_thread_exclusive(); + perror("subfiling_open_file/open"); + HDprintf("[%d %s] file create(%s) failed!\n", subfile_rank, __func__, filepath); + HDfflush(stdout); + +#ifndef NDEBUG + if (sf_verbose_flag) { + printf("[%d %s] file create(%s) failed!\n", subfile_rank, __func__, filepath); + fflush(stdout); + } +#endif + errors++; + goto done; + } + sprintf(filepath, "%s/%s.subfile_%ld.config", subfile_dir, base, h5_file_id); + /* SUBFILE rank 0 does the work creating a config file */ + if ((subfile_rank == 0) && (flags & O_CREAT)) { + FILE *f = NULL; + /* If a config file already exists, AND + * the user wants to truncate subfiles (if they exist), + * then we should also truncate an existing config file. 
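+             *
+             * Editor's note: with the writes below, the config file for an
+             * HDF5 file ABC.h5 striped across 2 IOCs at a 1 MiB stripe would
+             * look roughly like this (illustrative values; the exact subfile
+             * names come from SF_FILENAME_TEMPLATE):
+             *
+             *     stripe_size=1048576
+             *     aggregator_count=2
+             *     hdf5_file=/path/to/ABC.h5
+             *     subfile_dir=/path/to
+             *     ABC.h5.subfile_<fid>_00_of_02
+             *     ABC.h5.subfile_<fid>_01_of_02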
+ */ + if (access(filepath, flags) == 0) { + truncate(filepath, 0); + } + f = HDfopen(filepath, "w+"); + if (f != NULL) { + sprintf(linebuf, "stripe_size=%ld\n", sf_context->sf_stripe_size); + HDfwrite(linebuf, 1, strlen(linebuf), f); + sprintf(linebuf, "aggregator_count=%d\n", n_io_concentrators); + HDfwrite(linebuf, 1, strlen(linebuf), f); + sprintf(linebuf, "hdf5_file=%s\n", sf_context->h5_filename); + HDfwrite(linebuf, 1, strlen(linebuf), f); + sprintf(linebuf, "subfile_dir=%s\n", subfile_dir); + + int numD = numDigits(n_io_concentrators); + for (k = 0; k < n_io_concentrators; k++) { + sprintf(linebuf, "%s" SF_FILENAME_TEMPLATE "\n", base, h5_file_id, numD, k, + n_io_concentrators); + HDfwrite(linebuf, 1, strlen(linebuf), f); + } + + fclose(f); + } + else { + perror("fopen(config)"); + errors++; + goto done; + } + } + +#ifndef NDEBUG + if (sf_verbose_flag) { + if (sf_logfile) { + HDfprintf(sf_logfile, "[ioc:%d] Opened subfile %s\n", subfile_rank, filepath); + } + } +#endif + } + else { + for (k = 0; k < retries; k++) { + int fd; + if ((fd = HDopen(filepath, O_CREAT | O_RDWR, mode)) > 0) { + sf_context->sf_fid = fd; + break; + } + } + if (sf_context->sf_fid < 0) { + end_thread_exclusive(); + perror("subfiling_open_file/open"); + HDprintf("[%d %s] file open(%s) failed!\n", subfile_rank, __func__, filepath); + HDfflush(stdout); + +#ifndef NDEBUG + if (sf_verbose_flag) { + HDprintf("[%d %s] file open(%s) failed!\n", subfile_rank, __func__, filepath); + HDfflush(stdout); + } +#endif + errors++; + goto done; + } + } + end_thread_exclusive(); + } + +done: + t_end = MPI_Wtime(); + if (base) + HDfree(base); + if (subfile_dir) + HDfree(subfile_dir); + +#ifndef NDEBUG + if (sf_verbose_flag) { + printf("[%s %d] open completed in %lf seconds with %d errors\n", __func__, subfile_rank, + (t_end - t_start), errors); + fflush(stdout); + } +#endif + return errors; +} /* end subfiling_open_file() */ + +/*------------------------------------------------------------------------- + * Function: UTILITY FUNCTIONS: + * + * sf_get_mpi_rank - (not used) retrieves the MPI rank of the + * calling process. Was used when pairing + * the subfiling VFD with the SUBFILING VFD. + * + * sf_get_mpi_size - (not used) retrieves the MPI size of the + * communicator associated with the open + * file. + * + * sf_get_group_com - (not used) retrieves the MPI Comm object + * associated with the open file/sf_context. + * + * sf_subfile_set_logging - (not used) informs one or all IOC + * instances to set the verbose/logging flag + * to the value provided by the user. + * + * Return: none + * Errors: none + * + * Programmer: Richard Warren + * 7/17/2020 + * + * Changes: Initial Version/None. 
+ *
+ *-------------------------------------------------------------------------
+ */
+
+int
+sf_get_mpi_rank(hid_t fid, int *rank)
+{
+    hid_t                context_id = fid_map_to_context((uint64_t)fid);
+    subfiling_context_t *sf_context = get__subfiling_object(context_id);
+    assert(sf_context != NULL);
+    assert(rank != NULL);
+    *rank = sf_context->sf_group_rank;
+    return 0;
+}
+
+int
+sf_get_mpi_size(hid_t fid, int *size)
+{
+    hid_t                context_id = fid_map_to_context((uint64_t)fid);
+    subfiling_context_t *sf_context = get__subfiling_object(context_id);
+    assert(sf_context != NULL);
+    assert(size != NULL);
+    *size = sf_context->sf_group_size;
+    return 0;
+}
+
+int
+sf_get_group_comm(hid_t fid, MPI_Comm *comm)
+{
+    hid_t                context_id = fid_map_to_context((uint64_t)fid);
+    subfiling_context_t *sf_context = get__subfiling_object(context_id);
+    assert(sf_context != NULL);
+    assert(comm != NULL);
+    *comm = sf_context->sf_group_comm;
+    return 0;
+}
+
+int
+sf_subfile_set_logging(hid_t sf_fid, int ioc_rank, int flag)
+{
+    int                  ioc;
+    int                  status     = 0;
+    hid_t                context_id = fid_map_to_context((uint64_t)sf_fid);
+    subfiling_context_t *sf_context = get__subfiling_object(context_id);
+    int                  n_io_concentrators;
+    int *                io_concentrator = NULL;
+    int64_t              lflag           = (int64_t)(flag & 0xFF);
+    int64_t              msg[3];
+
+    assert(sf_context != NULL);
+
+    msg[0] = lflag;
+    msg[1] = 0;
+    msg[2] = sf_context->sf_context_id;
+
+    n_io_concentrators = sf_context->topology->n_io_concentrators;
+    io_concentrator    = sf_context->topology->io_concentrator;
+
+    for (ioc = 0; ioc < n_io_concentrators; ioc++) {
+        /* send to every IOC when ioc_rank is negative, otherwise only
+         * to the matching IOC.
+         */
+        if ((ioc_rank < 0) || (ioc == ioc_rank)) {
+            status =
+                MPI_Ssend(msg, 3, MPI_INT64_T, io_concentrator[ioc], LOGGING_OP, sf_context->sf_msg_comm);
+        }
+    }
+    return status;
+}
+
+/*-------------------------------------------------------------------------
+ * Function:    report_sf_eof
+ *
+ * Purpose:     Determine the target sub-file's eof and report this value
+ *              to the requesting rank.
+ *
+ * Notes:       This function will have to be reworked once we solve
+ *              the IOC error reporting problem.
+ *
+ *              This function mixes functionality that should be
+ *              in two different VFDs.
+ *
+ * Return:      0 if successful, 1 or an MPI error code on failure.
+ *
+ * Programmer:  John Mainzer
+ *              7/17/2020
+ *
+ * Changes:     Initial Version/None.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+int
+report_sf_eof(sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm)
+{
+    int                  fd;
+    int                  mpi_ret;
+    int64_t              eof_req_reply[3];
+    int64_t              file_context_id;
+    subfiling_context_t *sf_context = NULL;
+    h5_stat_t            sb;
+
+    HDassert(msg);
+
+    /* first get the EOF of the target file.
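+     *
+     * Editor's note: the querying rank is expected to reduce the replies from
+     * all IOCs to a single logical EOF, roughly as follows (a sketch with
+     * invented names; logical_eof() stands in for the inverse of the stripe
+     * mapping):
+     *
+     *     int64_t reply[3], eof = 0, cand;
+     *     for (int i = 0; i < n_io_concentrators; i++) {
+     *         MPI_Recv(reply, 3, MPI_INT64_T, io_concentrator[i],
+     *                  GET_EOF_COMPLETED, comm, MPI_STATUS_IGNORE);
+     *         -- reply[0] = subfile rank, reply[1] = subfile EOF in bytes
+     *         cand = logical_eof((int)reply[0], reply[1]);
+     *         if (cand > eof)
+     *             eof = cand;
+     *     }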
 */
+
+    file_context_id = msg->header[2];
+
+    if (NULL == (sf_context = get__subfiling_object(file_context_id))) {
+
+        HDfprintf(stdout, "report_sf_eof: get__subfiling_object() failed.\n");
+        HDfflush(stdout);
+        return (1);
+    }
+
+    fd = sf_context->sf_fid;
+
+    if (HDfstat(fd, &sb) < 0) {
+
+        HDfprintf(stdout, "report_sf_eof: HDfstat() failed.\n");
+        HDfflush(stdout);
+        return (1);
+    }
+
+    eof_req_reply[0] = (int64_t)subfile_rank;
+    eof_req_reply[1] = (int64_t)(sb.st_size);
+    eof_req_reply[2] = 0; /* not used */
+
+    /* return the subfile EOF to the querying rank */
+    if (MPI_SUCCESS != (mpi_ret = MPI_Send(eof_req_reply, 3, MPI_INT64_T, source, GET_EOF_COMPLETED, comm))) {
+
+        HDfprintf(stdout, "report_sf_eof: MPI_Send failed -- return code = %d.\n", mpi_ret);
+        HDfflush(stdout);
+        return (mpi_ret);
+    }
+
+    return 0;
+
+} /* report_sf_eof() */
diff --git a/src/H5FDsubfiling.c b/src/H5FDsubfiling.c
new file mode 100644
index 00000000000..38c60d0659b
--- /dev/null
+++ b/src/H5FDsubfiling.c
@@ -0,0 +1,2886 @@
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ * Copyright by The HDF Group.                                               *
+ * Copyright by the Board of Trustees of the University of Illinois.         *
+ * All rights reserved.                                                      *
+ *                                                                           *
+ * This file is part of HDF5. The full HDF5 copyright notice, including      *
+ * terms governing use, modification, and redistribution, is contained in    *
+ * the COPYING file, which can be found at the root of the source code       *
+ * distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases.  *
+ * If you do not have access to either file, you may request a copy from     *
+ * help@hdfgroup.org.                                                        *
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+/*
+ * Programmer:  Richard Warren
+ *
+ *
+ * Purpose:     An initial implementation of a subfiling VFD which is
+ *              derived from other "stacked" VFDs such as the splitter,
+ *              mirror, and family VFDs.
+ */
+
+#define H5S_FRIEND /* suppress error about including H5Spkg */
+
+#include "H5FDdrvr_module.h" /* This source code file is part of the H5FD driver module */
+
+#include "H5CXprivate.h"   /* API contexts, etc.      */
+#include "H5Dprivate.h"    /* Dataset stuff           */
+#include "H5Eprivate.h"    /* Error handling          */
+#include "H5FDprivate.h"   /* File drivers            */
+#include "H5FDsubfiling.h" /* Subfiling file driver   */
+#include "H5FLprivate.h"   /* Free Lists              */
+#include "H5Fprivate.h"    /* File access             */
+#include "H5Iprivate.h"    /* IDs                     */
+#include "H5MMprivate.h"   /* Memory management       */
+#include "H5Pprivate.h"    /* Property lists          */
+#include "H5Spkg.h"        /* For selections and creation of subfiling vectors */
+#include "H5private.h"     /* Generic Functions       */
+#include "H5FDioc.h"       /* IOC                     */
+
+/* The driver identification number, initialized at runtime */
+static hid_t H5FD_SUBFILING_g = 0;
+
+#ifndef NDEBUG
+FILE *sf_logfile = NULL;
+FILE *client_log = NULL;
+#endif
+
+/* These are used for the creation of read or write vectors */
+static haddr_t * sf_offsets = NULL;
+static hssize_t *sf_sizes   = NULL;
+static void **   sf_bufs    = NULL;
+
+/* The description of a file belonging to this driver. The 'eoa' and 'eof'
+ * determine the amount of hdf5 address space in use and the high-water mark
+ * of the file (the current size of the underlying filesystem file). The
+ * 'pos' value is used to eliminate file position updates when they would be a
+ * no-op.
Unfortunately we've found systems that use separate file position + * indicators for reading and writing so the lseek can only be eliminated if + * the current operation is the same as the previous operation. When opening + * a file the 'eof' will be set to the current file size, `eoa' will be set + * to zero, 'pos' will be set to H5F_ADDR_UNDEF (as it is when an error + * occurs), and 'op' will be set to H5F_OP_UNKNOWN. + */ +/*************************************************************************** + * + * Structure: H5FD_subfiling_t + * + * Purpose: + * + * H5FD_subfiling_t is a structure used to store all information needed + * to setup, manage, and take down subfiling for a HDF5 file. + * + * This structure is created when such a file is "opened" and + * discarded when it is "closed". + * + * Presents a system of subfiles as a single file to the HDF5 library. + * + * + * `pub` (H5FD_t) + * + * Instance of H5FD_t which contains all fields common to all VFDs. + * It must be the first item in this structure, since at higher levels, + * this structure will be treated as an instance of H5FD_t. + * + * `fa` (H5FD_subfiling_config_t) + * + * Instance of `H5FD_subfiling_config_t` containing the subfiling + * configuration data needed to "open" the HDF5 file. + * + * + * Document additional subfiling fields here. + * + * Recall that the existing fields are inherited from the sec2 driver + * and should be kept or not as appropriate for the sub-filing VFD. + * + * + * Programmer: Richard Warren + * + ***************************************************************************/ + +typedef struct H5FD_subfiling_t { + H5FD_t pub; /* public stuff, must be first */ + int fd; /* the filesystem file descriptor */ + H5FD_subfiling_config_t fa; /* driver-specific file access properties */ + + /* the following fields are inherited from the sec2 VFD, and will + * likely be deleted. + */ + int mpi_rank; /* useful MPI information */ + int mpi_size; + H5FD_t *sf_file; + +#ifndef H5_HAVE_WIN32_API + /* On most systems the combination of device and i-node number uniquely + * identify a file. Note that Cygwin, MinGW and other Windows POSIX + * environments have the stat function (which fakes inodes) + * and will use the 'device + inodes' scheme as opposed to the + * Windows code further below. + */ + dev_t device; /* file device number */ + ino_t inode; /* file i-node number */ +#else + /* Files in windows are uniquely identified by the volume serial + * number and the file index (both low and high parts). + * + * There are caveats where these numbers can change, especially + * on FAT file systems. On NTFS, however, a file should keep + * those numbers the same until renamed or deleted (though you + * can use ReplaceFile() on NTFS to keep the numbers the same + * while renaming). + * + * See the MSDN "BY_HANDLE_FILE_INFORMATION Structure" entry for + * more information. + * + * http://msdn.microsoft.com/en-us/library/aa363788(v=VS.85).aspx + */ + DWORD nFileIndexLow; + DWORD nFileIndexHigh; + DWORD dwVolumeSerialNumber; + + HANDLE hFile; /* Native windows file handle */ +#endif /* H5_HAVE_WIN32_API */ + + /* + * The element layouts above this point are identical with the + * H5FD_ioc_t structure. 
As a result, everything which
+ * follows is unique to the H5FD_subfiling_t structure.
+ */
+    haddr_t        eoa; /* end of allocated region        */
+    haddr_t        eof; /* end of file; current file size */
+    haddr_t        pos; /* current file I/O position      */
+    H5FD_file_op_t op;  /* last operation                 */
+    /* Copy of file name from open operation */
+    char     filename[H5FD_MAX_FILENAME_LEN];
+    MPI_Info info;
+    MPI_Comm comm;
+
+    /* Information from properties set by 'h5repart' tool
+     *
+     * Whether to eliminate the family driver info and convert this file to
+     * a single file.
+     */
+    hbool_t fam_to_single;
+} H5FD_subfiling_t;
+
+/*
+ * These macros check for overflow of various quantities. These macros
+ * assume that HDoff_t is signed and haddr_t and size_t are unsigned.
+ *
+ * ADDR_OVERFLOW:   Checks whether a file address of type `haddr_t'
+ *                  is too large to be represented by the second argument
+ *                  of the file seek function.
+ *
+ * SIZE_OVERFLOW:   Checks whether a buffer size of type `hsize_t' is too
+ *                  large to be represented by the `size_t' type.
+ *
+ * REGION_OVERFLOW: Checks whether an address and size pair describe data
+ *                  which can be addressed entirely by the second
+ *                  argument of the file seek function.
+ */
+#define MAXADDR          (((haddr_t)1 << (8 * sizeof(HDoff_t) - 1)) - 1)
+#define ADDR_OVERFLOW(A) (HADDR_UNDEF == (A) || ((A) & ~(haddr_t)MAXADDR))
+#define SIZE_OVERFLOW(Z) ((Z) & ~(hsize_t)MAXADDR)
+#define REGION_OVERFLOW(A, Z)                                                                                \
+    (ADDR_OVERFLOW(A) || SIZE_OVERFLOW(Z) || HADDR_UNDEF == (A) + (Z) || (HDoff_t)((A) + (Z)) < (HDoff_t)(A))
+
+#define H5FD_SUBFILING_DEBUG_OP_CALLS 0 /* debugging print toggle; 0 disables */
+
+#if H5FD_SUBFILING_DEBUG_OP_CALLS
+#define H5FD_SUBFILING_LOG_CALL(name)                                                                        \
+    do {                                                                                                     \
+        HDprintf("called %s()\n", (name));                                                                   \
+        HDfflush(stdout);                                                                                    \
+    } while (0)
+#else
+#define H5FD_SUBFILING_LOG_CALL(name) /* no-op */
+#endif /* H5FD_SUBFILING_DEBUG_OP_CALLS */
+
+/* Prototypes */
+extern herr_t  H5Pset_fapl_sec2(hid_t fapl_id);
+static herr_t  H5FD__subfiling_term(void);
+static void *  H5FD__subfiling_fapl_get(H5FD_t *_file);
+static void *  H5FD__subfiling_fapl_copy(const void *_old_fa);
+static herr_t  H5FD__subfiling_fapl_free(void *_fa);
+static H5FD_t *H5FD__subfiling_open(const char *name, unsigned flags, hid_t fapl_id, haddr_t maxaddr);
+static herr_t  H5FD__subfiling_close(H5FD_t *_file);
+static int     H5FD__subfiling_cmp(const H5FD_t *_f1, const H5FD_t *_f2);
+static herr_t  H5FD__subfiling_query(const H5FD_t *_f1, unsigned long *flags);
+static haddr_t H5FD__subfiling_get_eoa(const H5FD_t *_file, H5FD_mem_t type);
+static herr_t  H5FD__subfiling_set_eoa(H5FD_t *_file, H5FD_mem_t type, haddr_t addr);
+static haddr_t H5FD__subfiling_get_eof(const H5FD_t *_file, H5FD_mem_t type);
+static herr_t  H5FD__subfiling_get_handle(H5FD_t *_file, hid_t fapl, void **file_handle);
+static herr_t  H5FD__subfiling_read(H5FD_t *_file, H5FD_mem_t type, hid_t fapl_id, haddr_t addr, size_t size,
+                                    void *buf);
+static herr_t  H5FD__subfiling_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr,
+                                     size_t size, const void *buf);
+
+static herr_t H5FD__subfiling_read_vector(H5FD_t *file, hid_t dxpl_id, uint32_t count, H5FD_mem_t types[],
+                                          haddr_t addrs[], size_t sizes[], void *bufs[] /* out */);
+static herr_t H5FD__subfiling_write_vector(H5FD_t *file, hid_t dxpl_id, uint32_t count, H5FD_mem_t types[],
+                                           haddr_t addrs[], size_t sizes[], const void *bufs[] /* in */);
+
+static herr_t H5FD__subfiling_truncate(H5FD_t *_file, hid_t dxpl_id, hbool_t closing);
+
+static herr_t H5FD__subfiling_lock(H5FD_t *_file,
hbool_t rw); +static herr_t H5FD__subfiling_unlock(H5FD_t *_file); +static herr_t H5FD__subfiling_ctl(H5FD_t *_file, uint64_t op_code, uint64_t flags, + const void H5_ATTR_UNUSED *input, void **output); + +static herr_t H5FD__subfiling_validate_config(const H5FD_subfiling_config_t *fa); + +#if 0 /* JRM */ /* delete if all goes well */ +static int H5FD__subfiling_mpi_rank(const H5FD_t *_file); +static int H5FD__subfiling_mpi_size(const H5FD_t *_file); +static MPI_Comm H5FD__subfiling_communicator(const H5FD_t *_file); +#endif /* JRM */ +#if 0 /* JRM */ /* unused? delete if so */ +static herr_t H5FD__subfiling_get_info(H5FD_t *_file, void **mpi_info); +#endif /* JRM */ + +static const H5FD_class_t H5FD_subfiling_g = { + H5FD_SUBFILING_VALUE, /* value */ + "subfiling", /* name */ + MAXADDR, /* maxaddr */ + H5F_CLOSE_WEAK, /* fc_degree */ + H5FD__subfiling_term, /* terminate */ + NULL, /* sb_size */ + NULL, /* sb_encode */ + NULL, /* sb_decode */ + sizeof(H5FD_subfiling_config_t), /* fapl_size */ + H5FD__subfiling_fapl_get, /* fapl_get */ + H5FD__subfiling_fapl_copy, /* fapl_copy */ + H5FD__subfiling_fapl_free, /* fapl_free */ + 0, /* dxpl_size */ + NULL, /* dxpl_copy */ + NULL, /* dxpl_free */ + H5FD__subfiling_open, /* open */ + H5FD__subfiling_close, /* close */ + H5FD__subfiling_cmp, /* cmp */ + H5FD__subfiling_query, /* query */ + NULL, /* get_type_map */ + NULL, /* alloc */ + NULL, /* free */ + H5FD__subfiling_get_eoa, /* get_eoa */ + H5FD__subfiling_set_eoa, /* set_eoa */ + H5FD__subfiling_get_eof, /* get_eof */ + H5FD__subfiling_get_handle, /* get_handle */ + H5FD__subfiling_read, /* read */ + H5FD__subfiling_write, /* write */ + H5FD__subfiling_read_vector, /* read_vector */ + H5FD__subfiling_write_vector, /* write_vector */ + NULL, /* read_selection */ + NULL, /* write_selection */ + NULL, /* flush */ + H5FD__subfiling_truncate, /* truncate */ + H5FD__subfiling_lock, /* lock */ + H5FD__subfiling_unlock, /* unlock */ + NULL, /* del */ + H5FD__subfiling_ctl, /* ctl */ + H5FD_FLMAP_DICHOTOMY /* fl_map */ +}; + +/* Declare a free list to manage the H5FD_subfiling_t struct */ +H5FL_DEFINE_STATIC(H5FD_subfiling_t); + +/*------------------------------------------------------------------------- + * Function: H5FD__init_package + * + * Purpose: Initializes any interface-specific data or routines. + * + * Return: Non-negative on success/Negative on failure + * + *------------------------------------------------------------------------- + */ +static herr_t +H5FD__init_package(void) +{ + herr_t ret_value = SUCCEED; + + FUNC_ENTER_STATIC + + if (H5FD_subfiling_init() < 0) + HGOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "unable to initialize subfiling VFD") + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* H5FD__init_package() */ + +/*------------------------------------------------------------------------- + * Function: H5FD_subfiling_init + * + * Purpose: Initialize this driver by registering the driver with the + * library. 
+ * + * Return: Success: The driver ID for the subfiling driver + * Failure: H5I_INVALID_HID + * + * Programmer: Richard Warren + * + *------------------------------------------------------------------------- + */ +hid_t +H5FD_subfiling_init(void) +{ + hid_t ret_value = H5I_INVALID_HID; /* Return value */ + + FUNC_ENTER_NOAPI(H5I_INVALID_HID) + +#if 1 /* JRM */ + if (H5I_VFL != H5I_get_type(H5FD_SUBFILING_g)) + H5FD_SUBFILING_g = H5FD_register(&H5FD_subfiling_g, sizeof(H5FD_class_t), FALSE); +#else /* JRM */ + if (H5I_VFL != H5I_get_type(H5FD_SUBFILING_g)) { + HDfprintf(stdout, "H5FD_subfiling_init(): calling H5FD_register()\n"); + H5FD_SUBFILING_g = H5FD_register(&H5FD_subfiling_g, sizeof(H5FD_class_t), FALSE); + } +#endif /* JRM */ + +#if 0 /* JRM */ + HDfprintf(stdout, "H5FD_subfiling_init() subfiling registered. id = %lld \n", (int64_t)H5FD_SUBFILING_g); +#endif /* JRM */ + + /* Set return value */ + ret_value = H5FD_SUBFILING_g; + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5FD_subfiling_init() */ + +/*--------------------------------------------------------------------------- + * Function: H5FD_subfiling_term + * + * Purpose: Shut down the VFD + * + * Returns: SUCCEED (Can't fail) + * + * Programmer: Quincey Koziol + * Friday, Jan 30, 2004 + * + *--------------------------------------------------------------------------- + */ +static herr_t +H5FD__subfiling_term(void) +{ + FUNC_ENTER_NOAPI_NOINIT_NOERR + +#if 0 /* JRM */ + HDfprintf(stdout, "Entering H5FD__subfiling_term().\n"); +#endif /* JRM */ + + /* Reset VFL ID */ + H5FD_SUBFILING_g = 0; + +#if 0 /* JRM */ + HDfprintf(stdout, "Exiting H5FD__subfiling_term().\n"); +#endif /* JRM */ + + FUNC_LEAVE_NOAPI(SUCCEED) +} /* end H5FD_subfiling_term() */ + +/*------------------------------------------------------------------------- + * Function: H5FD__copy_plist + * + * Purpose: Sanity-wrapped H5P_copy_plist() for each channel. + * Utility function for operation in multiple locations. + * + * Return: 0 on success, -1 on error. 
+ *------------------------------------------------------------------------- + */ +static int +H5FD__copy_plist(hid_t fapl_id, hid_t *id_out_ptr) +{ + int ret_value = 0; + H5P_genplist_t *plist_ptr = NULL; + + FUNC_ENTER_STATIC + + H5FD_SUBFILING_LOG_CALL(FUNC); + + HDassert(id_out_ptr != NULL); + + if (FALSE == H5P_isa_class(fapl_id, H5P_FILE_ACCESS)) + HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, -1, "not a file access property list"); + + plist_ptr = (H5P_genplist_t *)H5I_object(fapl_id); + if (NULL == plist_ptr) + HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, -1, "unable to get property list"); + + *id_out_ptr = H5P_copy_plist(plist_ptr, FALSE); + if (H5I_INVALID_HID == *id_out_ptr) + HGOTO_ERROR(H5E_VFL, H5E_BADTYPE, -1, "unable to copy file access property list"); + +done: + FUNC_LEAVE_NOAPI(ret_value); +} /* end H5FD__copy_plist() */ + +static herr_t +fapl__get_subfiling_defaults(H5FD_subfiling_config_t *fa) +{ + herr_t ret_value = SUCCEED; + char * envValue = NULL; + + HDassert(fa); + + fa->common.magic = H5FD_SUBFILING_FAPL_T_MAGIC; + fa->common.version = H5FD_CURR_SUBFILING_FAPL_T_VERSION; + fa->common.ioc_fapl_id = H5P_DEFAULT; + fa->common.stripe_count = 0; + fa->common.stripe_depth = H5FD_DEFAULT_STRIPE_DEPTH; + fa->common.ioc_selection = SELECT_IOC_ONE_PER_NODE; + /* VFD specific */ + fa->require_ioc = TRUE; + + if ((envValue = getenv("H5_REQUIRE_IOC")) != NULL) { + int value_check = atoi(envValue); + if (value_check == 0) { + fa->require_ioc = FALSE; + } + else if (value_check > 0) { + fa->require_ioc = TRUE; + } + } + return (ret_value); +} + +/*------------------------------------------------------------------------- + * + * Function: H5Pset_fapl_subfiling + * + * Purpose: Modify the file access property list to use the + * H5FD_SUBFILING driver defined in this source file. All + * driver specific properties are passed in as a pointer to + * a suitably initialized instance of H5FD_subfiling_config_t + * + * Return: SUCCEED/FAIL + * + * Programmer: John Mainzer + * 9/10/17 + * + * Changes: None. + * + *------------------------------------------------------------------------- + */ +herr_t +H5Pset_fapl_subfiling(hid_t fapl_id, H5FD_subfiling_config_t *fa) +{ + H5P_genplist_t * plist = NULL; /* Property list pointer */ + hid_t ioc_fapl = H5I_INVALID_HID; + H5FD_ioc_config_t ioc_config; + H5FD_subfiling_config_t subfiling_conf; + herr_t ret_value = FAIL; + + FUNC_ENTER_API(FAIL) + H5TRACE2("e", "i*!", fapl_id, fa); + + if (NULL == (plist = H5P_object_verify(fapl_id, H5P_FILE_ACCESS))) + HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list") + + if (fa == NULL) { + /* Create IOC fapl */ + ioc_fapl = H5Pcreate(H5P_FILE_ACCESS); + if (H5I_INVALID_HID == ioc_fapl) + HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't create ioc fapl") + + /* Get subfiling VFD defaults */ + if (fapl__get_subfiling_defaults(&subfiling_conf) < 0) + HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't get subfiling fapl") + + if (subfiling_conf.require_ioc) { + /* Get IOC VFD defaults */ + if (H5Pget_fapl_ioc(ioc_fapl, &ioc_config) < 0) + HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't get ioc fapl") + + /* Now we can set the IOC fapl. 
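+             *
+             * Editor's note: from the application's point of view, passing a
+             * NULL config to H5Pset_fapl_subfiling() selects exactly these
+             * defaults, e.g. (a sketch):
+             *
+             *     hid_t fapl = H5Pcreate(H5P_FILE_ACCESS);
+             *     H5Pset_mpi_params(fapl, MPI_COMM_WORLD, MPI_INFO_NULL);
+             *     H5Pset_fapl_subfiling(fapl, NULL);
+             *     hid_t fid = H5Fcreate("ABC.h5", H5F_ACC_TRUNC,
+             *                           H5P_DEFAULT, fapl);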
 */
+            if (H5Pset_fapl_ioc(ioc_fapl, &ioc_config) < 0)
+                HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set ioc fapl")
+        }
+        else {
+            if (H5Pset_fapl_sec2(ioc_fapl) < 0)
+                HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set sec2 fapl")
+        }
+
+        /* Assign the IOC fapl as the underlying VFD */
+        subfiling_conf.common.ioc_fapl_id = ioc_fapl;
+
+        fa = &subfiling_conf;
+    }
+
+    if (FAIL == H5FD__subfiling_validate_config(fa)) {
+        HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid subfiling config")
+    }
+
+    ret_value = H5P_set_driver(plist, H5FD_SUBFILING, (void *)fa, NULL);
+
+done:
+    FUNC_LEAVE_API(ret_value)
+
+} /* end H5Pset_fapl_subfiling() */
+
+/*-------------------------------------------------------------------------
+ * Function:    H5FD_subfiling_validate_config()
+ *
+ * Purpose:     Test to see if the supplied instance of
+ *              H5FD_subfiling_config_t contains internally consistent data.
+ *              Return SUCCEED if so, and FAIL otherwise.
+ *
+ *              Note the difference between internally consistent and
+ *              correct. As we will have to try to set up subfiling to
+ *              determine whether the supplied data is correct, we will
+ *              settle for internal consistency at this point.
+ *
+ * Return:      SUCCEED if instance of H5FD_subfiling_config_t contains
+ *              internally consistent data, FAIL otherwise.
+ *
+ * Programmer:  Jacob Smith
+ *              9/10/17
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5FD__subfiling_validate_config(const H5FD_subfiling_config_t *fa)
+{
+    herr_t ret_value = SUCCEED;
+
+    FUNC_ENTER_NOAPI_NOINIT
+
+    HDassert(fa != NULL);
+
+    if (fa->common.version != H5FD_CURR_SUBFILING_FAPL_T_VERSION) {
+        HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "Unknown H5FD_subfiling_config_t version");
+    }
+
+    /* add subfiling configuration validation code here */
+
+done:
+    FUNC_LEAVE_NOAPI(ret_value)
+
+} /* end H5FD_subfiling_validate_config() */
+
+/*-------------------------------------------------------------------------
+ * Function:    H5Pget_fapl_subfiling
+ *
+ * Purpose:     Returns information about the subfiling file access
+ *              property list through the function arguments.
+ *
+ * Return:      Success:        Non-negative
+ *
+ *              Failure:        Negative
+ *
+ * Programmer:  John Mainzer
+ *              9/10/17
+ * Modifications:
+ *              Richard Warren
+ *              If the fapl has yet to be set, we return an instance
+ *              with default values for most fields.
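+ *
+ * Editor's note: usage sketch (invented names):
+ *
+ *     H5FD_subfiling_config_t cfg;
+ *     if (H5Pget_fapl_subfiling(fapl_id, &cfg) < 0)
+ *         -- handle the error
+ *     -- inspect cfg.common.stripe_count, cfg.common.ioc_selection, etc.
+ *
+ * When a subfiling config is already set on the fapl, the returned
+ * cfg.common.ioc_fapl_id appears to be a fresh copy that the caller should
+ * eventually close.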
+ * + *------------------------------------------------------------------------- + */ +herr_t +H5Pget_fapl_subfiling(hid_t fapl_id, H5FD_subfiling_config_t *config_out) +{ + const H5FD_subfiling_config_t *config_ptr = NULL; + H5P_genplist_t * plist = NULL; + herr_t ret_value = SUCCEED; + + FUNC_ENTER_API(FAIL) + H5TRACE2("e", "i*!", fapl_id, config_out); + + if (config_out == NULL) { + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "config_out is NULL") + } + + plist = H5P_object_verify(fapl_id, H5P_FILE_ACCESS); + if (plist == NULL) { + HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access list") + } + + config_ptr = (const H5FD_subfiling_config_t *)H5P_peek_driver_info(plist); + if (config_ptr == NULL) { + ret_value = fapl__get_subfiling_defaults(config_out); + } + else { + /* Copy the subfiling fapl data out */ + HDmemcpy(config_out, config_ptr, sizeof(H5FD_subfiling_config_t)); + + /* Copy the driver info value */ + if (H5FD__copy_plist(config_ptr->common.ioc_fapl_id, &(config_out->common.ioc_fapl_id)) < 0) + HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "can't copy IOC FAPL"); + } + +done: + FUNC_LEAVE_API(ret_value) + +} /* end H5Pget_fapl_subfiling() */ + +/*------------------------------------------------------------------------- + * Function: H5FD_subfiling_fapl_get + * + * Purpose: Gets a file access property list which could be used to + * create an identical file. + * + * Return: Success: Ptr to new file access property list value. + * + * Failure: NULL + * + * Programmer: John Mainzer + * 9/8/17 + * + * Modifications: + * + *------------------------------------------------------------------------- + */ +static void * +H5FD__subfiling_fapl_get(H5FD_t *_file) +{ + H5FD_subfiling_t * file = (H5FD_subfiling_t *)_file; + H5FD_subfiling_config_t *fa = NULL; + void * ret_value = NULL; + + FUNC_ENTER_NOAPI_NOINIT + + fa = (H5FD_subfiling_config_t *)H5MM_calloc(sizeof(H5FD_subfiling_config_t)); + + if (fa == NULL) { + HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, NULL, "memory allocation failed") + } + + /* Copy the fields of the structure */ + HDmemcpy(fa, &(file->fa), sizeof(H5FD_subfiling_config_t)); + + /* Set return value */ + ret_value = fa; + +done: + if (ret_value == NULL) { + + if (fa != NULL) { + H5MM_xfree(fa); + } + } + FUNC_LEAVE_NOAPI(ret_value) + +} /* end H5FD_subfiling_fapl_get() */ + +/*------------------------------------------------------------------------- + * Function: H5FD_subfiling_fapl_copy + * + * Purpose: Copies the subfiling-specific file access properties. 
+ * + * Return: Success: Ptr to a new property list + * + * Failure: NULL + * + * Programmer: John Mainzer + * 9/8/17 + * + * Modifications: + * + *------------------------------------------------------------------------- + */ +static void * +H5FD__subfiling_fapl_copy(const void *_old_fa) +{ + const H5FD_subfiling_config_t *old_fa = (const H5FD_subfiling_config_t *)_old_fa; + H5FD_subfiling_config_t * new_fa = NULL; + void * ret_value = NULL; + + FUNC_ENTER_NOAPI_NOINIT + + new_fa = (H5FD_subfiling_config_t *)H5MM_malloc(sizeof(H5FD_subfiling_config_t)); + if (new_fa == NULL) { + HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, NULL, "memory allocation failed"); + } + + HDmemcpy(new_fa, old_fa, sizeof(H5FD_subfiling_config_t)); + ret_value = new_fa; + +done: + if (ret_value == NULL) { + + if (new_fa != NULL) { + H5MM_xfree(new_fa); + } + } + FUNC_LEAVE_NOAPI(ret_value) + +} /* end H5FD_subfiling_fapl_copy() */ + +/*------------------------------------------------------------------------- + * Function: H5FD__subfiling_fapl_free + * + * Purpose: Frees the subfiling-specific file access properties. + * + * Return: SUCCEED (cannot fail) + * + * Programmer: John Mainzer + * 9/8/17 + * + * Modifications: + * + *------------------------------------------------------------------------- + */ +static herr_t +H5FD__subfiling_fapl_free(void *_fa) +{ + H5FD_subfiling_config_t *fa = (H5FD_subfiling_config_t *)_fa; + + FUNC_ENTER_NOAPI_NOINIT_NOERR + + HDassert(fa != NULL); /* sanity check */ + + H5MM_xfree(fa); + + FUNC_LEAVE_NOAPI(SUCCEED) + +} /* end H5FD_subfiling_fapl_free() */ + +/*------------------------------------------------------------------------- + * Function: H5FD__subfiling_open + * + * Purpose: Create and/or opens a file as an HDF5 file. + * + * Return: Success: A pointer to a new file data structure. The + * public fields will be initialized by the + * caller, which is always H5FD_open(). 
+ * Failure: NULL + * + * Programmer: Richard Warren + * Thursday, July 29, 1999 + * + *------------------------------------------------------------------------- + */ +static H5FD_t * +H5FD__subfiling_open(const char *name, unsigned flags, hid_t subfiling_fapl_id, haddr_t maxaddr) +{ + H5FD_subfiling_t * file_ptr = NULL; /* Subfiling VFD info */ + const H5FD_subfiling_config_t *config_ptr = NULL; /* Driver-specific property list */ + H5FD_class_t * driver = NULL; /* VFD for file */ + H5P_genplist_t * plist_ptr = NULL; + H5FD_driver_prop_t driver_prop; /* Property for driver ID & info */ + +#if 0 /* JRM */ + hbool_t err_occurred = FALSE; + uint64_t h5_file_id = (uint64_t)-1; +#endif /* JRM */ + H5FD_t *ret_value = NULL; +#if 0 /* JRM */ + hid_t fapl_check; + hid_t ioc_fapl_id; +#endif /* JRM */ +#if 1 /* JRM */ + int mpi_code; /* MPI return code */ + MPI_Comm comm = MPI_COMM_NULL; /* MPI Communicator, from plist */ + MPI_Info info = MPI_INFO_NULL; /* MPI Info, from plist */ + int mpi_rank = INT_MAX; /* MPI rank of this process */ + int mpi_size; /* Total number of MPI processes */ +#endif /* JRM */ + + FUNC_ENTER_STATIC + + /* Check arguments */ + if (!name || !*name) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, NULL, "invalid file name") + if (0 == maxaddr || HADDR_UNDEF == maxaddr) + HGOTO_ERROR(H5E_ARGS, H5E_BADRANGE, NULL, "bogus maxaddr") + if (ADDR_OVERFLOW(maxaddr)) + HGOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, NULL, "bogus maxaddr") + + file_ptr = (H5FD_subfiling_t *)H5FL_CALLOC(H5FD_subfiling_t); + if (NULL == file_ptr) + HGOTO_ERROR(H5E_VFL, H5E_CANTALLOC, NULL, "unable to allocate file struct") + + /* Get the driver-specific file access properties */ + plist_ptr = (H5P_genplist_t *)H5I_object(subfiling_fapl_id); + if (NULL == plist_ptr) + HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, NULL, "not a file access property list") + +#if 1 /* JRM */ + /* The following code to store MPI communicator, rank, size, and info + * may have to be reworked to make the subfiling VFD pluggable. + */ + /* Get the MPI communicator and info object from the property list */ + if (H5P_get(plist_ptr, H5F_ACS_MPI_PARAMS_COMM_NAME, &comm) < 0) + HGOTO_ERROR(H5E_VFL, H5E_CANTGET, NULL, "can't get MPI communicator") + if (H5P_get(plist_ptr, H5F_ACS_MPI_PARAMS_INFO_NAME, &info) < 0) + HGOTO_ERROR(H5E_VFL, H5E_CANTGET, NULL, "can't get MPI info object") + + /* Get the MPI rank of this process and the total number of processes */ + if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(comm, &mpi_rank))) + HMPI_GOTO_ERROR(NULL, "MPI_Comm_rank failed", mpi_code) + if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(comm, &mpi_size))) + HMPI_GOTO_ERROR(NULL, "MPI_Comm_size failed", mpi_code) + + /* save MPI data in the instance of H5FD_subfiling_t. This really should be + * done after we successfully open file, but for now follow the existing + * code. + */ + file_ptr->comm = comm; + file_ptr->info = info; + file_ptr->mpi_rank = mpi_rank; + file_ptr->mpi_size = mpi_size; +#endif /* JRM */ + + config_ptr = (const H5FD_subfiling_config_t *)H5P_peek_driver_info(plist_ptr); + if (NULL == config_ptr) + HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, NULL, "unable to get VFL driver info") + + memcpy(&file_ptr->fa, config_ptr, sizeof(config_common_t)); + + /* Copy the FAPL from the config structure */ + /* JRM: Why is this necessary? If it is necessary, must close the property list on file close. 
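+     *
+     * Editor's note: the open below is "stacked" -- this VFD re-opens the
+     * same name through the IOC (or sec2) fapl carried in the config,
+     * roughly:
+     *
+     *     subfiling open(name)
+     *       -> H5FD_open(name, ioc_fapl_id)        -- file_ptr->sf_file
+     *            -> "ioc" driver: subfiles already opened, or
+     *            -> "sec2" driver: stub file opened here; subfiles opened
+     *               via H5FD__open_subfiles() below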
 */
+    if (H5FD__copy_plist(config_ptr->common.ioc_fapl_id, &(file_ptr->fa.common.ioc_fapl_id)) < 0)
+        HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, NULL, "can't copy IOC FAPL");
+
+    file_ptr->sf_file = H5FD_open(name, flags, config_ptr->common.ioc_fapl_id, HADDR_UNDEF);
+    if (!file_ptr->sf_file)
+        HGOTO_ERROR(H5E_VFL, H5E_CANTOPENFILE, NULL, "unable to open IOC file")
+
+    /* Check the "native" driver (sec2 or mpio) */
+    plist_ptr = (H5P_genplist_t *)H5I_object(config_ptr->common.ioc_fapl_id);
+
+    if (H5P_peek(plist_ptr, H5F_ACS_FILE_DRV_NAME, &driver_prop) < 0)
+        HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, NULL, "can't get driver ID & info")
+    if (NULL == (driver = (H5FD_class_t *)H5I_object(driver_prop.driver_id)))
+        HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, NULL, "invalid driver ID in file access property list")
+
+    if (strncmp(driver->name, "ioc", 3) == 0) {
+        /* We've already opened the subfiles... */
+        H5FD_subfiling_t *ioc_file = (H5FD_subfiling_t *)(file_ptr->sf_file);
+        /* Get a copy of the context ID for later use */
+        file_ptr->fa.common.context_id = ioc_file->fa.common.context_id;
+        file_ptr->fa.require_ioc       = true;
+    }
+    else if (strncmp(driver->name, "sec2", 4) == 0) {
+        uint64_t inode_id = (uint64_t)-1;
+        int      mpi_rank, mpi_size;
+        int      ioc_flags = O_RDWR;
+
+        /* Translate the HDF5 file open flags into standard POSIX open flags */
+        if (flags & H5F_ACC_TRUNC)
+            ioc_flags |= O_TRUNC;
+        if (flags & H5F_ACC_CREAT)
+            ioc_flags |= O_CREAT;
+
+        /* Get some basic MPI information */
+        MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
+        MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
+
+        /* Let MPI rank 0 do the file stat operation and broadcast the result */
+        if (mpi_rank == 0) {
+            if (file_ptr->sf_file) {
+                H5FD_sec2_t *hdf_file = (H5FD_sec2_t *)file_ptr->sf_file;
+                h5_stat_t    sb;
+                /* We create a new file descriptor for our file structure.
+                 * Basically, we want these separate so that sec2 can
+                 * deal with the opened file for additional operations
+                 * (especially close) without interfering with subfiling.
+                 */
+                file_ptr->fd = HDdup(hdf_file->fd);
+                if (HDfstat(hdf_file->fd, &sb) < 0)
+                    HSYS_GOTO_ERROR(H5E_FILE, H5E_BADFILE, NULL, "unable to fstat file")
+                inode_id = sb.st_ino;
+            }
+        }
+
+        if (MPI_SUCCESS == MPI_Bcast(&inode_id, 1, MPI_UNSIGNED_LONG_LONG, 0, MPI_COMM_WORLD)) {
+            file_ptr->inode = inode_id;
+        }
+
+        /* All ranks can now detect an error and fail. */
+        if (inode_id == (uint64_t)-1)
+            HGOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, NULL, "unable to open file = %s\n", name)
+
+        /* See: H5FDsubfile_int.c:
+         * Note that the user defined HDF5 file is also considered subfile(0) */
+        if (H5FD__open_subfiles((void *)&file_ptr->fa, inode_id, ioc_flags) < 0)
+            HGOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, NULL, "unable to open subfiling files = %s\n", name)
+    }
+    else {
+        HDputs("We only support ioc and sec2 file opens at the moment.");
+        HGOTO_ERROR(H5E_FILE, H5E_CANTOPENFILE, NULL, "unable to open file = %s\n", name)
+    }
+    ret_value = (H5FD_t *)file_ptr;
+
+done:
+    if (NULL == ret_value) {
+        if (file_ptr) {
+            if (H5I_INVALID_HID != file_ptr->fa.common.ioc_fapl_id)
+                H5I_dec_ref(file_ptr->fa.common.ioc_fapl_id);
+            if (file_ptr->sf_file)
+                H5FD_close(file_ptr->sf_file);
+            H5FL_FREE(H5FD_subfiling_t, file_ptr);
+        }
+    } /* end if error */
+
+    FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5FD__subfiling_open() */
+
+/*-------------------------------------------------------------------------
+ * Function:    H5FD__subfiling_close
+ *
+ * Purpose:     Closes an HDF5 file.
+ * + * Return: Success: SUCCEED + * Failure: FAIL, file not closed. + * + * Programmer: Richard Warren + * Thursday, July 29, 1999 + * + *------------------------------------------------------------------------- + */ +static herr_t +H5FD__subfiling_close(H5FD_t *_file) +{ + H5FD_subfiling_t * file_ptr = (H5FD_subfiling_t *)_file; + herr_t ret_value = SUCCEED; /* Return value */ + subfiling_context_t *sf_context = NULL; + + FUNC_ENTER_NOAPI_NOINIT + + /* Sanity check */ + HDassert(file_ptr); + + sf_context = (subfiling_context_t *)get__subfiling_object(file_ptr->fa.common.context_id); + +#ifdef VERBOSE + if (sf_context->topology->rank_is_ioc) + printf("[%s %d] fd=%d\n", __func__, file_ptr->mpi_rank, sf_context->sf_fid); + else + printf("[%s %d] fd=*\n", __func__, file_ptr->mpi_rank); + fflush(stdout); +#endif + if (H5FD_close(file_ptr->sf_file) != SUCCEED) { + HSYS_GOTO_ERROR(H5E_IO, H5E_CANTCLOSEFILE, FAIL, "unable to close file") + } + + if (sf_context != NULL) { + if (sf_context->subfile_prefix) { + HDfree(sf_context->subfile_prefix); + sf_context->subfile_prefix = NULL; + } + if (sf_context->sf_filename) { + HDfree(sf_context->sf_filename); + sf_context->sf_filename = NULL; + } + if (sf_context->h5_filename) { + HDfree(sf_context->h5_filename); + sf_context->h5_filename = NULL; + } + } + /* if set, close the copy of the plist for the underlying VFD. */ + if ((H5I_INVALID_HID != file_ptr->fa.common.ioc_fapl_id) && + (H5I_dec_ref(file_ptr->fa.common.ioc_fapl_id) < 0)) + HGOTO_ERROR(H5E_VFL, H5E_ARGS, FAIL, "can't close ioc FAPL") + + /* Release the file info */ + file_ptr = H5FL_FREE(H5FD_subfiling_t, file_ptr); + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5FD_subfiling_close() */ + +/*------------------------------------------------------------------------- + * Function: H5FD_subfiling_cmp + * + * Purpose: Compares two files belonging to this driver using an + * arbitrary (but consistent) ordering. + * + * Return: Success: A value like strcmp() + * Failure: never fails (arguments were checked by the + * caller). + * + * Programmer: Richard Warren + * Thursday, July 29, 1999 + * + *------------------------------------------------------------------------- + */ +static int +H5FD__subfiling_cmp(const H5FD_t *_f1, const H5FD_t *_f2) +{ + const H5FD_subfiling_t *f1 = (const H5FD_subfiling_t *)_f1; + const H5FD_subfiling_t *f2 = (const H5FD_subfiling_t *)_f2; + int ret_value = 0; + + FUNC_ENTER_NOAPI_NOINIT_NOERR + +#ifdef H5_HAVE_WIN32_API + if (f1->dwVolumeSerialNumber < f2->dwVolumeSerialNumber) + HGOTO_DONE(-1) + if (f1->dwVolumeSerialNumber > f2->dwVolumeSerialNumber) + HGOTO_DONE(1) + + if (f1->nFileIndexHigh < f2->nFileIndexHigh) + HGOTO_DONE(-1) + if (f1->nFileIndexHigh > f2->nFileIndexHigh) + HGOTO_DONE(1) + + if (f1->nFileIndexLow < f2->nFileIndexLow) + HGOTO_DONE(-1) + if (f1->nFileIndexLow > f2->nFileIndexLow) + HGOTO_DONE(1) +#else /* H5_HAVE_WIN32_API */ +#ifdef H5_DEV_T_IS_SCALAR + if (f1->device < f2->device) + HGOTO_DONE(-1) + if (f1->device > f2->device) + HGOTO_DONE(1) +#else /* H5_DEV_T_IS_SCALAR */ + /* If dev_t isn't a scalar value on this system, just use memcmp to + * determine if the values are the same or not. The actual return value + * shouldn't really matter... 
+ */ + if (HDmemcmp(&(f1->device), &(f2->device), sizeof(dev_t)) < 0) + HGOTO_DONE(-1) + if (HDmemcmp(&(f1->device), &(f2->device), sizeof(dev_t)) > 0) + HGOTO_DONE(1) +#endif /* H5_DEV_T_IS_SCALAR */ + if (f1->inode < f2->inode) + HGOTO_DONE(-1) + if (f1->inode > f2->inode) + HGOTO_DONE(1) +#endif /* H5_HAVE_WIN32_API */ + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5FD_subfiling_cmp() */ + +/*------------------------------------------------------------------------- + * Function: H5FD_subfiling_query + * + * Purpose: Set the flags that this VFL driver is capable of supporting. + * (listed in H5FDpublic.h) + * + * For now, duplicate the flags used for the MPIO VFD. + * Revisit this when we have a version of the subfiling VFD + * that is usable in serial builds. + * + * Return: SUCCEED (Can't fail) + * + * Programmer: John Mainzer + * 11/15/21 + * + *------------------------------------------------------------------------- + */ +#if 0 /* JRM */ /* original version -- delete if all goes well */ +static herr_t +H5FD__subfiling_query(const H5FD_t *_file, unsigned long *flags /* out */) +{ + const H5FD_subfiling_t *file = (const H5FD_subfiling_t *)_file; /* subfiling VFD info */ + + FUNC_ENTER_NOAPI_NOINIT_NOERR + + /* Set the VFL feature flags that this driver supports */ + /* Notice: the Mirror VFD Writer currently uses only the Sec2 driver as + * the underying driver -- as such, the Mirror VFD implementation copies + * these feature flags as its own. Any modifications made here must be + * reflected in H5FDmirror.c + * -- JOS 2020-01-13 + */ + if (flags) { + *flags = 0; + *flags |= H5FD_FEAT_AGGREGATE_METADATA; /* OK to aggregate metadata + allocations */ + *flags |= H5FD_FEAT_ACCUMULATE_METADATA; /* OK to accumulate metadata for + faster writes */ + *flags |= H5FD_FEAT_DATA_SIEVE; /* OK to perform data sieving for faster raw + data reads & writes */ + *flags |= H5FD_FEAT_AGGREGATE_SMALLDATA; /* OK to aggregate "small" raw data + allocations */ + *flags |= H5FD_FEAT_POSIX_COMPAT_HANDLE; /* get_handle callback returns a + POSIX file descriptor */ + *flags |= H5FD_FEAT_SUPPORTS_SWMR_IO; /* VFD supports the + single-writer/multiple-readers + (SWMR) pattern */ + *flags |= H5FD_FEAT_DEFAULT_VFD_COMPATIBLE; + /* Check for flags that are set by h5repart */ + if (file && file->fam_to_single) + *flags |= H5FD_FEAT_IGNORE_DRVRINFO; /* Ignore the driver info when file + is opened (which eliminates it) */ + } /* end if */ + + FUNC_LEAVE_NOAPI(SUCCEED) +} /* end H5FD_subfiling_query() */ +#else /* JRM */ /* new version copied from MPIO VFD */ + +static herr_t +H5FD__subfiling_query(const H5FD_t H5_ATTR_UNUSED *_file, unsigned long *flags /* out */) +{ + FUNC_ENTER_STATIC_NOERR + + /* Set the VFL feature flags that this driver supports */ + if (flags) { + *flags = 0; + *flags |= H5FD_FEAT_AGGREGATE_METADATA; /* OK to aggregate metadata allocations */ + *flags |= H5FD_FEAT_AGGREGATE_SMALLDATA; /* OK to aggregate "small" raw data allocations */ + *flags |= H5FD_FEAT_HAS_MPI; /* This driver uses MPI */ + *flags |= H5FD_FEAT_ALLOCATE_EARLY; /* Allocate space early instead of late */ + *flags |= H5FD_FEAT_DEFAULT_VFD_COMPATIBLE; /* VFD creates a file which can be opened with the default + VFD */ + /* this is false -- delete the flag eventually */ + } + + FUNC_LEAVE_NOAPI(SUCCEED) +} /* end H5FD__mpio_query() */ + +#endif /* JRM */ /* new version copied from MPIO VFD */ + +/*------------------------------------------------------------------------- + * Function: H5FD_subfiling_get_eoa + * + * Purpose: Gets 
the end-of-address marker for the file. The EOA marker
+ *              is the first address past the last byte allocated in the
+ *              format address space.
+ *
+ * Return:      The end-of-address marker.
+ *
+ * Programmer:  Richard Warren
+ *
+ *-------------------------------------------------------------------------
+ */
+static haddr_t
+H5FD__subfiling_get_eoa(const H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type)
+{
+    const H5FD_subfiling_t *file = (const H5FD_subfiling_t *)_file;
+
+    FUNC_ENTER_NOAPI_NOINIT_NOERR
+
+    FUNC_LEAVE_NOAPI(file->eoa)
+} /* end H5FD_subfiling_get_eoa() */
+
+/*-------------------------------------------------------------------------
+ * Function:    H5FD_subfiling_set_eoa
+ *
+ * Purpose:     Set the end-of-address marker for the file. This function is
+ *              called shortly after an existing HDF5 file is opened in order
+ *              to tell the driver where the end of the HDF5 data is located.
+ *
+ * Return:      SUCCEED (Can't fail)
+ *
+ * Programmer:  Richard Warren
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5FD__subfiling_set_eoa(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, haddr_t addr)
+{
+    H5FD_subfiling_t *file_ptr = (H5FD_subfiling_t *)_file;
+
+    FUNC_ENTER_NOAPI_NOINIT_NOERR
+    file_ptr->eoa = addr;
+
+    H5FD_set_eoa(file_ptr->sf_file, type, addr);
+
+    FUNC_LEAVE_NOAPI(SUCCEED)
+} /* end H5FD_subfiling_set_eoa() */
+
+/*-------------------------------------------------------------------------
+ * Function:    H5FD_subfiling_get_eof
+ *
+ * Purpose:     Returns the end-of-file marker from the filesystem
+ *              perspective.
+ *
+ * Return:      End of file address, the first address past the end of the
+ *              "file", either the filesystem file or the HDF5 file.
+ *
+ *              SUBFILING NOTE:
+ *              The EOF calculation for subfiling is somewhat different
+ *              than for the more traditional HDF5 file implementations.
+ *              This statement derives from the fact that unlike "normal"
+ *              HDF5 files, subfiling introduces a multi-file representation
+ *              of a single HDF5 file.  The plurality of sub-files represents
+ *              a software RAID-0 based HDF5 file.  As such, each sub-file
+ *              contains a designated portion of the address space of the
+ *              virtual HDF5 storage.  We have no notion of HDF5 datatypes,
+ *              datasets, metadata, or other HDF5 structures; only BYTES.
+ *
+ *              The organization of the bytes within sub-files is consistent
+ *              with the RAID-0 striping, i.e. there are IO Concentrators
+ *              (IOCs) which correspond to a stripe-count (in Lustre) as
+ *              well as a stripe_size.  The combination of these two
+ *              variables determines the "address" (a combination of IOC
+ *              and a file offset) of any storage operation.
+ *
+ *              Having a defined storage layout, the virtual file EOF
+ *              calculation should be the MAXIMUM value returned by the
+ *              collection of IOCs.  Every MPI rank which hosts an IOC
+ *              maintains its own EOF by updating that value for each
+ *              WRITE operation that completes, i.e. if a new local EOF
+ *              is greater than the existing local EOF, the new EOF
+ *              will replace the old.  The local EOF calculation is as
+ *              follows.
+ *              1. At file creation, each IOC is assigned a rank value
+ *                 (0 to N-1, where N is the total number of IOCs) and
+ *                 a 'sf_base_addr' = 'subfile_rank' * 'sf_stripe_size';
+ *                 we also determine the 'sf_blocksize_per_stripe' which
+ *                 is simply the 'sf_stripe_size' * 'n_io_concentrators'
+ *
+ *              2. For every write operation, the IOC receives a message
+ *                 containing a file_offset and the data_size.
+ *              3.
The file_offset + data_size are in turn used to + * create a stripe_id: + * IOC-(ioc_rank) IOC-(ioc_rank+1) + * |<- sf_base_address |<- sf_base_address | + * ID +--------------------+--------------------+ + * 0:|<- sf_stripe_size ->|<- sf_stripe_size ->| + * 1:|<- sf_stripe_size ->|<- sf_stripe_size ->| + * ~ ~ ~ + * N:|<- sf_stripe_size ->|<- sf_stripe_size ->| + * +--------------------+--------------------+ + * + * The new 'stripe_id' is then used to calculate a + * potential new EOF: + * sf_eof = (stripe_id * sf_blocksize_per_stripe) + sf_base_addr + * + ((file_offset + data_size) % sf_stripe_size) + * + * 4. If (sf_eof > current_sf_eof), then current_sf_eof = sf_eof. + * + * + * Programmer: Richard Warren + * + *------------------------------------------------------------------------- + */ +#if 0 /* JRM */ /* original version */ + +static haddr_t +H5FD__subfiling_get_eof(const H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type) +{ + H5FD_subfiling_t *file = (const H5FD_subfiling_t *)_file; + haddr_t ret_value = HADDR_UNDEF; + haddr_t local_eof, global_eof = 0; + FUNC_ENTER_STATIC + + local_eof = H5FD_get_eof(file->sf_file, type); + if (MPI_SUCCESS != MPI_Allreduce(&local_eof, &global_eof, 1, MPI_LONG_LONG, MPI_MAX, MPI_COMM_WORLD)) + HGOTO_ERROR(H5E_INTERNAL, H5E_CANTGET, HADDR_UNDEF, "mpi_allreduce failed") + /* Return the global max of all the subfile EOF values */ + + ret_value = global_eof; +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5FD_subfiling_get_eof() */ + +#else /* JRM */ /* re-worked version */ + /* this is a heavy weight implementation. We need something like this + * for file open, and probably for file close. However, in between, something + * similar to the current solution in the MPIIO VFD might be more appropriate. + */ + +static haddr_t +H5FD__subfiling_get_eof(const H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type) +{ + H5FD_subfiling_t *file = (const H5FD_subfiling_t *)_file; + int64_t logical_eof = -1; + haddr_t ret_value = HADDR_UNDEF; + + FUNC_ENTER_STATIC + + if (H5FD__subfiling__get_real_eof(&logical_eof, file->fa.common.context_id) < 0) + HGOTO_ERROR(H5E_INTERNAL, H5E_CANTGET, HADDR_UNDEF, "can't get eof") + + /* Return the global max of all the subfile EOF values */ + + ret_value = (haddr_t)(logical_eof); + +#if 0 /* JRM */ + HDfprintf(stdout, "\nH5FD__subfiling_get_eof: reporting eof = %lld\n", (long long)ret_value); + HDfflush(stdout); +#endif /* JRM */ + +done: + + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5FD_subfiling_get_eof() */ + +#endif /* JRM */ /* re-worked version */ + +/*------------------------------------------------------------------------- + * Function: H5FD_subfiling_get_handle + * + * Purpose: Returns the file handle of subfiling file driver. 
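+ *              (With a sec2-backed subfiling file, the handle returned is a
+ *              pointer to the VFD's cached POSIX descriptor -- the dup()'ed
+ *              fd noted in H5FD__subfiling_open() above.)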
+ * + * Returns: SUCCEED/FAIL + * + * Programmer: Raymond Lu + * + *------------------------------------------------------------------------- + */ +static herr_t +H5FD__subfiling_get_handle(H5FD_t *_file, hid_t H5_ATTR_UNUSED fapl, void **file_handle) +{ + H5FD_subfiling_t *file = (H5FD_subfiling_t *)_file; + herr_t ret_value = SUCCEED; + + FUNC_ENTER_NOAPI_NOINIT + + if (!file_handle) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "file handle not valid") + + *file_handle = &(file->fd); + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5FD_subfiling_get_handle() */ + +/*------------------------------------------------------------------------- + * Function: H5FD_subfiling_read + * + * Purpose: Reads SIZE bytes of data from FILE beginning at address ADDR + * into buffer BUF according to data transfer properties in + * DXPL_ID. + * + * Return: Success: SUCCEED. Result is stored in caller-supplied + * buffer BUF. + * Failure: FAIL, Contents of buffer BUF are undefined. + * + * Programmer: Richard Warren + * + *------------------------------------------------------------------------- + */ +static herr_t +H5FD__subfiling_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t H5_ATTR_UNUSED dxpl_id, + haddr_t addr, size_t size, void *buf /*out*/) +{ + H5FD_subfiling_t * file_ptr = (H5FD_subfiling_t *)_file; + herr_t ret_value = SUCCEED; /* Return value */ + hbool_t addrs_cooked = FALSE; + subfiling_context_t *sf_context = NULL; + int ioc_total, count; + int64_t blocksize; + HDoff_t offset; + + FUNC_ENTER_NOAPI_NOINIT + + HDassert(file_ptr && file_ptr->pub.cls); + HDassert(buf); + + sf_context = (subfiling_context_t *)get__subfiling_object(file_ptr->fa.common.context_id); + + HDassert(sf_context); + HDassert(sf_context->topology); + + /* Given the current IO and the IO concentrator info + * we can determine some IO transaction parameters. + * In particular, for large IO operations, each IOC + * may require multiple IOs to fulfill the user IO + * request. The 'max_depth' variable and number of + * IOCs are used to size the vectors that will be + * used to invoke the underlying IO operations. + */ + ioc_total = sf_context->topology->n_io_concentrators; +#ifdef VERBOSE + printf("[%s %d] fd=%d\n", __func__, file_ptr->mpi_rank, sf_context->sf_fid); + fflush(stdout); +#endif + + if (ioc_total > 1) { + size_t max_depth; + blocksize = sf_context->sf_blocksize_per_stripe; +#if 0 /* JRM */ + size_t max_depth = (size_t)(size / blocksize) + 2; +#else /* JRM */ + max_depth = (size / (size_t)blocksize) + 2; +#endif /* JRM */ + int next, ioc_count = 0, ioc_start = -1; + + int64_t source_data_offset[ioc_total][max_depth], sf_data_size[ioc_total][max_depth], + sf_offset[ioc_total][max_depth]; + + size_t varsize = sizeof(sf_offset); + + memset(source_data_offset, 0, varsize); + memset(sf_data_size, 0, varsize); + memset(sf_offset, 0, varsize); + + /* Check for overflow conditions */ + if (!H5F_addr_defined(addr)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "addr undefined, addr = %llu", (unsigned long long)addr) + if (REGION_OVERFLOW(addr, size)) + HGOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, FAIL, "addr overflow, addr = %llu, size = %llu", + (unsigned long long)addr, (unsigned long long)size) + + addr += _file->base_addr; + + /* Follow the example of read_vector (see H5FDint.c) */ + addrs_cooked = TRUE; + + offset = (HDoff_t)addr; + + /* Given the number of io concentrators, we allocate vectors (one per-ioc) + * to contain the translation of the IO request into a collection of io + * requests. 
The translation is accomplished in the init__indep_io function. + */ + + /* Get the potential set of ioc transactions, i.e. data sizes, + * offsets, and datatypes. These can all be used by either the + * underlying IOC or by sec2. + * + * For now, we assume we're dealing with contiguous datasets. + * Vector IO will probably handle the non-contiguous condition + */ + count = init__indep_io(sf_context, /* We use the context to look up config info */ +#if 0 /* JRM */ + max_depth, ioc_total, source_data_offset, /* (out) Memory offset */ + sf_data_size, /* (out) Length of this contiguous block */ + sf_offset, /* (out) File offset */ +#else /* JRM */ + max_depth, ioc_total, (int64_t *)source_data_offset, /* (out) Memory offset */ + (int64_t *)sf_data_size, /* (out) Length of this contiguous block */ + (int64_t *)sf_offset, /* (out) File offset */ +#endif /* JRM */ + &ioc_start, /* (out) IOC index corresponding to starting offset */ + &ioc_count, /* (out) number of actual IOCs used */ + offset, /* (in) Starting file offset */ +#if 0 /* JRM */ + size, /* (in) IO size */ +#else /* JRM */ + (int64_t)size, /* (in) IO size */ +#endif /* JRM */ + 1); /* (in) data extent of the 'type' assumes byte */ + + if (count > 0) { + int i, k; + + /* Set ASYNC MODE: + H5FD_class_aio_t *async_file_ptr = (H5FD_class_aio_t *)file_ptr->sf_file; + uint64_t op_code_begin = xxx; + uint64_t op_code_complete = zzz; + const void *input = NULL; + void *output = NULL; + (*async_file_ptr->h5fdctl)(file_ptr->sf_file, op_code_begin, flags, input, + &output); + */ + +#if 0 + printf("[%s] addr=%ld, size=%ld, depth=%d, ioc_count=%d, ioc_start=%d\n", + __func__, offset, size, count, ioc_count, ioc_start); + fflush(stdout); +#endif + + /* The 'count' variable captures the max number of IO requests to a single + * IOC whereas the ioc_count is the number of IOC requests per outer loop + * (i) and also represents the vector length being used in the call to + * H5FDread_vector. + */ + + for (i = 0; i < count; i++) { + H5FD_mem_t type_in[ioc_count]; + int64_t data_size[ioc_count]; + int64_t offset_in[ioc_count]; + void * data_in[ioc_count]; + char * databuf = (char *)buf; +#if 0 /* JRM */ + int vectorlen = ioc_count; +#else /* JRM */ + uint32_t vectorlen = (uint32_t)ioc_count; +#endif /* JRM */ + + /* + * Fill vector variables 'data_in' and 'type_in' + */ + for (next = ioc_start, k = 0; k < ioc_count; k++) { + offset_in[k] = sf_offset[next][i]; + type_in[k] = type; + data_in[k] = databuf + source_data_offset[next][i]; + if ((data_size[k] = sf_data_size[next][i]) == 0) { + vectorlen--; + } + next = (next + 1) % ioc_count; + } + + /* And make the read_vector call. 
Under normal circumstances this
+                 * should invoke H5FD__ioc_read_vector() (see H5FDioc.c)
+                 */
+#if 0
+                for (k=0; k < vectorlen; k++) {
+                    printf("%s (%d): v_len=%d, offset=%ld, data_size=%ld\n",
+                           __func__, k, vectorlen, offset_in[k], data_size[k]);
+                    fflush(stdout);
+                }
+#endif
+#if 1 /* JRM */
+                if (H5FDread_vector(file_ptr->sf_file, dxpl_id, vectorlen, type_in, (uint64_t *)offset_in,
+                                    (uint64_t *)data_size, data_in) < 0) {
+#else  /* JRM */
+                if (H5FDread_vector(file_ptr->sf_file, dxpl_id, vectorlen, type_in, offset_in, data_size,
+                                    data_in) < 0) {
+#endif /* JRM */
+                    HGOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL, "IOC file read failed")
+                }
+            }
+
+            /*
+            (*async_file_ptr->h5fdctl)(file_ptr->sf_file, op_code_complete, flags, input, &output);
+            */
+        }
+    }
+    else { /* NO STRIPING:: Just a single IOC */
+
+        /* Check for overflow conditions */
+        if (!H5F_addr_defined(addr))
+            HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "addr undefined, addr = %llu", (unsigned long long)addr)
+        if (REGION_OVERFLOW(addr, size))
+            HGOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, FAIL, "addr overflow, addr = %llu, size = %llu",
+                        (unsigned long long)addr, (unsigned long long)size)
+
+        addr += _file->base_addr;
+
+        /* Follow the example of read_vector (see H5FDint.c) */
+        addrs_cooked = TRUE;
+
+        offset = (HDoff_t)addr;
+#if 0 /* JRM */
+        if (H5FDread_vector(file_ptr->sf_file, dxpl_id, 1, &type, &offset, &size,
+                            &buf) < 0) {
+#else  /* JRM */
+        if (H5FDread_vector(file_ptr->sf_file, dxpl_id, 1, &type, &addr, &size, &buf) < 0) {
+#endif /* JRM */
+            HGOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL, "IOC file read failed")
+        }
+    }
+
+    addr += (haddr_t)size; /* Point to the end of the current IO */
+
+    if (addrs_cooked)
+        addr -= _file->base_addr;
+
+    /* Update current position and eof */
+    file_ptr->pos = addr;
+    file_ptr->op  = OP_READ;
+    if (file_ptr->pos > file_ptr->eof)
+        file_ptr->eof = file_ptr->pos;
+
+done:
+    if (ret_value < 0) {
+        /* Reset last file I/O information */
+        file_ptr->pos = HADDR_UNDEF;
+        file_ptr->op  = OP_UNKNOWN;
+    } /* end if */
+
+    FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5FD_subfiling_read() */
+
+/*-------------------------------------------------------------------------
+ * Function:    H5FD__subfiling_write
+ *
+ * Purpose:     Writes SIZE bytes of data to FILE beginning at address ADDR
+ *              from buffer BUF according to data transfer properties in
+ *              DXPL_ID.
+ *
+ * Return:      SUCCEED/FAIL
+ *
+ * Programmer:  Richard Warren
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5FD__subfiling_write(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t H5_ATTR_UNUSED dxpl_id,
+                      haddr_t addr, size_t size, const void *buf /*in*/)
+{
+    H5FD_subfiling_t *   file_ptr  = (H5FD_subfiling_t *)_file;
+    herr_t               ret_value = SUCCEED; /* Return value */
+    hbool_t              addrs_cooked = FALSE;
+    subfiling_context_t *sf_context   = NULL;
+    int                  ioc_total, count;
+    int64_t              blocksize;
+    HDoff_t              offset;
+
+    FUNC_ENTER_NOAPI_NOINIT
+
+    HDassert(file_ptr && file_ptr->pub.cls);
+    HDassert(buf);
+
+    sf_context = (subfiling_context_t *)get__subfiling_object(file_ptr->fa.common.context_id);
+    HDassert(sf_context);
+    HDassert(sf_context->topology);
+
+    /* Given the current IO and the IO concentrator info
+     * we can determine some IO transaction parameters.
+     * In particular, for large IO operations, each IOC
+     * may require multiple IOs to fulfill the user IO
+     * request.  The 'max_depth' variable and number of
+     * IOCs are used to size the vectors that will be
+     * used to invoke the underlying IO operations.
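+     *
+     * Illustrative numbers only (not normative): with 4 IOCs and a
+     * 1 MiB stripe, 'sf_blocksize_per_stripe' is 4 MiB, so a 10 MiB
+     * request sizes each per-IOC vector at
+     * max_depth = (10 MiB / 4 MiB) + 2 = 4 entries.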
+ */ + ioc_total = sf_context->topology->n_io_concentrators; + +#ifdef VERBOSE + if (sf_context->topology->rank_is_ioc) + printf("[%s %d] fd=%d\n", __func__, file_ptr->mpi_rank, sf_context->sf_fid); + else + printf("[%s %d] fd=*\n", __func__, file_ptr->mpi_rank); + fflush(stdout); +#endif + + if (ioc_total > 1) { + size_t max_depth; + blocksize = sf_context->sf_blocksize_per_stripe; +#if 0 /* JRM */ + size_t max_depth = (size_t)(size / blocksize) + 2; +#else /* JRM */ + max_depth = (size_t)(size / (size_t)blocksize) + 2; +#endif /* JRM */ + int next, ioc_count = 0, ioc_start = -1; + + int64_t source_data_offset[ioc_total][max_depth], sf_data_size[ioc_total][max_depth], + sf_offset[ioc_total][max_depth]; + + size_t varsize = sizeof(sf_offset); + + memset(source_data_offset, 0, varsize); + memset(sf_data_size, 0, varsize); + memset(sf_offset, 0, varsize); + + /* Check for overflow conditions */ + if (!H5F_addr_defined(addr)) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "addr undefined, addr = %llu", (unsigned long long)addr) + if (REGION_OVERFLOW(addr, size)) + HGOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, FAIL, "addr overflow, addr = %llu, size = %llu", + (unsigned long long)addr, (unsigned long long)size) + + addr += _file->base_addr; + +#ifdef VERBOSE + printf("[%s %d] addr=%ld, size=%ld\n", __func__, file_ptr->mpi_rank, addr, size); + fflush(stdout); +#endif + + /* Follow the example of read_vector (see H5FDint.c) */ + addrs_cooked = TRUE; + + offset = (HDoff_t)addr; + + /* Given the number of io concentrators, we allocate vectors (one per-ioc) + * to contain the translation of the IO request into a collection of io + * requests. The translation is accomplished in the init__indep_io function. + */ + + /* Get the potential set of ioc transactions, i.e. data sizes, + * offsets, and datatypes. These can all be used by either the + * underlying IOC or by sec2. + * + * For now, we assume we're dealing with contiguous datasets. 
+ * Vector IO will probably handle the non-contiguous condition + */ +#if 0 /* JRM */ + count = init__indep_io( + sf_context, /* We use the context to look up config info */ + max_depth, ioc_total, source_data_offset, /* (out) Memory offset */ + sf_data_size, /* (out) Length of this contiguous block */ + sf_offset, /* (out) File offset */ + &ioc_start, /* (out) IOC index corresponding to starting offset */ + &ioc_count, /* (out) number of actual IOCs used */ + offset, /* (in) Starting file offset */ + size, /* (in) IO size */ + 1); /* (in) data extent of the 'type' assumes byte */ +#else /* JRM */ + count = init__indep_io(sf_context, /* We use the context to look up config info */ + max_depth, ioc_total, (int64_t *)source_data_offset, /* (out) Memory offset */ + (int64_t *)sf_data_size, /* (out) Length of this contiguous block */ + (int64_t *)sf_offset, /* (out) File offset */ + &ioc_start, /* (out) IOC index corresponding to starting offset */ + &ioc_count, /* (out) number of actual IOCs used */ + offset, /* (in) Starting file offset */ + (int64_t)size, /* (in) IO size */ + 1); /* (in) data extent of the 'type' assumes byte */ +#endif /* JRM */ + + next = ioc_start; + if (count > 0) { + int i, k; + + /* Set ASYNC MODE: + H5FD_class_aio_t *async_file_ptr = (H5FD_class_aio_t *)file_ptr->sf_file; + uint64_t op_code_begin = xxx; + uint64_t op_code_complete = zzz; + const void *input = NULL; + void *output = NULL; + (*async_file_ptr->h5fdctl)(file_ptr->sf_file, op_code_begin, flags, input, + &output); + */ + +#if 0 + printf("[%s] addr=%ld, size=%ld, depth=%d, ioc_count=%d, ioc_start=%d\n", + __func__, offset, size, count, ioc_count, ioc_start); + fflush(stdout); +#endif + /* The 'count' variable captures the max number of IO requests to a single + * IOC whereas the ioc_count is the number of IOC requests per outer loop + * (i) and also represents the vector length being used in the call to + * H5FDwrite_vector. + */ + + for (i = 0; i < count; i++) { + H5FD_mem_t type_in[ioc_count]; + int64_t data_size[ioc_count]; + int64_t offset_in[ioc_count]; +#if 0 /* JRM */ + void *data_in[ioc_count]; +#else /* JRM */ + const void *data_in[ioc_count]; +#endif /* JRM */ + const char *databuf = buf; +#if 0 /* JRM */ + int vectorlen = ioc_count; +#else /* JRM */ + uint32_t vectorlen = (uint32_t)ioc_count; +#endif /* JRM */ + + /* + * Fill vector variables 'data_in' and 'type_in' + */ + for (next = ioc_start, k = 0; k < ioc_count; k++) { + offset_in[k] = sf_offset[next][i]; + type_in[k] = type; + data_in[k] = databuf + source_data_offset[next][i]; + if ((data_size[k] = sf_data_size[next][i]) == 0) { + vectorlen--; + } + next++; + if (next == ioc_total) + next = 0; + } + + /* And make the write_vector call. 
Under normal circumstances this
+                 * should invoke H5FD__ioc_write_vector() (see H5FDioc.c)
+                 */
+#if 0
+                for (k=0; k < vectorlen; k++) {
+                    printf("%s (%d): v_len=%d, offset=%ld, data_size=%ld\n",
+                           __func__, k, vectorlen, offset_in[k], data_size[k]);
+                    fflush(stdout);
+                }
+#endif
+#if 0 /* JRM */
+                if (H5FDwrite_vector(file_ptr->sf_file, dxpl_id, vectorlen, type_in,
+                                     offset_in, data_size, data_in) < 0) {
+                    HGOTO_ERROR(H5E_VFL, H5E_WRITEERROR, FAIL, "IOC file write failed")
+                }
+            }
+#else  /* JRM */
+
+                if (H5FDwrite_vector(file_ptr->sf_file, dxpl_id, vectorlen, type_in, (uint64_t *)offset_in,
+                                     (uint64_t *)data_size, data_in) < 0) {
+                    HGOTO_ERROR(H5E_VFL, H5E_WRITEERROR, FAIL, "IOC file write failed")
+                }
+            }
+#endif /* JRM */
+
+            /*
+            (*async_file_ptr->h5fdctl)(file_ptr->sf_file, op_code_complete, flags, input,
+            &output);
+            */
+        }
+    }
+    else { /* NO STRIPING:: Just a single IOC */
+
+        /* Check for overflow conditions */
+        if (!H5F_addr_defined(addr))
+            HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "addr undefined, addr = %llu",
+                        (unsigned long long)addr)
+        if (REGION_OVERFLOW(addr, size))
+            HGOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, FAIL, "addr overflow, addr = %llu, size = %llu",
+                        (unsigned long long)addr, (unsigned long long)size)
+
+        addr += _file->base_addr;
+
+        /* Follow the example of read_vector (see H5FDint.c) */
+        addrs_cooked = TRUE;
+
+        offset = (HDoff_t)addr;
+#if 0 /* JRM */
+        if (H5FDwrite_vector(file_ptr->sf_file, dxpl_id, 1, &type, &offset, &size,
+                             &buf) < 0) {
+#else  /* JRM */
+        if (H5FD_write_vector(file_ptr->sf_file, 1, &type, &addr, &size, &buf) < 0) {
+#endif /* JRM */
+            HGOTO_ERROR(H5E_VFL, H5E_WRITEERROR, FAIL, "IOC file write failed")
+        }
+    }
+
+    addr += (haddr_t)size; /* Point to the end of the current IO */
+
+    if (addrs_cooked)
+        addr -= _file->base_addr;
+
+    /* Update current position and eof */
+    file_ptr->pos = addr;
+    file_ptr->op  = OP_WRITE;
+    if (file_ptr->pos > file_ptr->eof)
+        file_ptr->eof = file_ptr->pos;
+
+done:
+    if (ret_value < 0) {
+        /* Reset last file I/O information */
+        file_ptr->pos = HADDR_UNDEF;
+        file_ptr->op  = OP_UNKNOWN;
+    } /* end if */
+
+    FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5FD_subfiling_write() */
+
+/*-------------------------------------------------------------------------
+ * Function:    H5FD__subfiling_read_vector (internal function)
+ *
+ * Purpose:     Vector Read function for the sub-filing VFD.
+ *
+ *              Perform count reads from the specified file at the offsets
+ *              provided in the addrs array, with the lengths and memory
+ *              types provided in the sizes and types arrays.  Data read
+ *              is returned in the buffers provided in the bufs array.
+ *
+ *              All reads are done according to the data transfer property
+ *              list dxpl_id (which may be the constant H5P_DEFAULT).
+ *
+ * Return:      Success:    SUCCEED
+ *                          All reads have completed successfully, and
+ *                          the results have been read into the supplied
+ *                          buffers.
+ *
+ *              Failure:    FAIL
+ *                          The contents of supplied buffers are undefined.
+ *
+ * Programmer:  RAW -- ??/??/21
+ *
+ * Changes:     None.
+ *
+ * Notes:       This function doesn't actually implement vector read.
+ *              Instead, it converts the vector read call into a series
+ *              of scalar read calls.  Fix this when time permits.
+ *
+ *              Also, it doesn't support the sizes and types optimization.
+ *              I implemented a version of this which is more generous
+ *              than that currently defined in the RFC.  This is good
+ *              enough for now, but the final version should follow
+ *              the RFC.
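+ *
+ *              (For reference: the "sizes and types optimization" lets a
+ *              caller terminate the sizes[] / types[] arrays early -- a 0
+ *              size or H5FD_MEM_NOLIST type at index k means that all
+ *              remaining entries repeat the value at index k-1, e.g.
+ *              sizes[] = {4096, 0} with count = 4 describes four
+ *              4096-byte reads.  The loop below implements this convention.)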
+ *              JRM -- 10/5/21
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5FD__subfiling_read_vector(H5FD_t *_file, hid_t dxpl_id, uint32_t count, H5FD_mem_t types[],
+                            haddr_t addrs[], size_t sizes[], void *bufs[] /* out */)
+{
+    H5FD_subfiling_t *file_ptr  = (H5FD_subfiling_t *)_file;
+    herr_t            ret_value = SUCCEED; /* Return value */
+
+    FUNC_ENTER_STATIC
+
+    /* Check arguments
+     * RAW - Do we really need to check arguments once again?
+     * These have already been checked in H5FD_subfiling_read_vector (see below)!
+     */
+    if (!file_ptr)
+        HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "file pointer cannot be NULL")
+
+    if ((!types) && (count > 0))
+        HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "types parameter can't be NULL if count is positive")
+
+    if ((!addrs) && (count > 0))
+        HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "addrs parameter can't be NULL if count is positive")
+
+    if ((!sizes) && (count > 0))
+        HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "sizes parameter can't be NULL if count is positive")
+
+    if ((!bufs) && (count > 0))
+        HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "bufs parameter can't be NULL if count is positive")
+
+    /* Get the default dataset transfer property list if the user didn't provide
+     * one */
+    if (H5P_DEFAULT == dxpl_id) {
+        dxpl_id = H5P_DATASET_XFER_DEFAULT;
+    }
+    else {
+        if (TRUE != H5P_isa_class(dxpl_id, H5P_DATASET_XFER))
+            HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a data transfer property list")
+    }
+
+    /* Set DXPL for operation */
+    H5CX_set_dxpl(dxpl_id);
+
+    /* TODO: setup real support for vector I/O */
+    if (file_ptr->fa.require_ioc) {
+
+        hbool_t    extend_sizes = FALSE;
+        hbool_t    extend_types = FALSE;
+        int        k;
+        size_t     size;
+        H5FD_mem_t type;
+        haddr_t    eoa;
+
+        HDassert((count == 0) || (sizes[0] != 0));
+        HDassert((count == 0) || (types[0] != H5FD_MEM_NOLIST));
+
+        /* Note that the following code does not let the sub-filing VFD participate
+         * in collective calls when there is no data to write.  This is not an issue
+         * now, as we don't do anything special with collective operations.  However
+         * this needs to be fixed.
+         */
+        for (k = 0; k < (int)count; k++) {
+
+            if (!extend_sizes) {
+
+                if (sizes[k] == 0) {
+
+                    extend_sizes = TRUE;
+                    size         = sizes[k - 1];
+                }
+                else {
+
+                    size = sizes[k];
+                }
+            }
+
+            if (!extend_types) {
+
+                if (types[k] == H5FD_MEM_NOLIST) {
+
+                    extend_types = TRUE;
+                    type         = types[k - 1];
+                }
+                else {
+
+                    type = types[k];
+                }
+            }
+
+            if (HADDR_UNDEF == (eoa = H5FD__subfiling_get_eoa(_file, type)))
+                HGOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "driver get_eoa request failed")
+
+            if ((addrs[k] + size) > eoa)
+
+                HGOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, FAIL,
+                            "addr overflow, addrs[%d] = %llu, sizes[%d] = %llu, eoa = %llu", (int)k,
+                            (unsigned long long)(addrs[k]), (int)k, (unsigned long long)size,
+                            (unsigned long long)eoa)
+
+            if (H5FD__subfiling_read(_file, type, dxpl_id, addrs[k], size, bufs[k]) != SUCCEED)
+                HGOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL, "file vector read request failed")
+        }
+    }
+    else {
+        /* sec2 driver:
+         * Call the subfiling 'direct read' version
+         * of subfiling.
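+         * (That is, forward the whole vector to the underlying driver
+         * via the H5FD_read_vector() call just below.)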
+         */
+        if (H5FD_read_vector(_file, count, types, addrs, sizes, bufs) != SUCCEED)
+            HGOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL, "file vector read request failed")
+    }
+
+done:
+    FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5FD__subfiling_read_vector() */
+
+/*-------------------------------------------------------------------------
+ * Function:    H5FD__subfiling_write_vector (internal function)
+ *
+ * Purpose:     Perform count writes to the specified file at the offsets
+ *              provided in the addrs array.  Lengths and memory
+ *              types provided in the sizes and types arrays.  Data to be
+ *              written is referenced by the bufs array.
+ *
+ *              All writes are done according to the data transfer property
+ *              list dxpl_id (which may be the constant H5P_DEFAULT).
+ *
+ * Return:      Success:    SUCCEED
+ *                          All writes have completed successfully.
+ *
+ *              Failure:    FAIL
+ *                          An internal error was encountered, e.g. the
+ *                          input arguments are not valid, or the actual
+ *                          subfiling writes have failed for some reason.
+ *
+ * Programmer:  RAW -- ??/??/21
+ *
+ * Changes:     None.
+ *
+ * Notes:       This function doesn't actually implement vector write.
+ *              Instead, it converts the vector write call into a series
+ *              of scalar write calls.  Fix this when time permits.
+ *
+ *              Also, it doesn't support the sizes and types optimization.
+ *              I implemented a version of this which is more generous
+ *              than that currently defined in the RFC.  This is good
+ *              enough for now, but the final version should follow
+ *              the RFC.
+ *              JRM -- 10/5/21
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5FD__subfiling_write_vector(H5FD_t *_file, hid_t dxpl_id, uint32_t count, H5FD_mem_t types[],
+                             haddr_t addrs[], size_t sizes[], const void *bufs[] /* in */)
+{
+    H5FD_subfiling_t *file_ptr  = (H5FD_subfiling_t *)_file;
+    herr_t            ret_value = SUCCEED; /* Return value */
+
+    FUNC_ENTER_STATIC
+
+    HDassert(file_ptr != NULL); /* sanity check */
+
+    /* Check arguments
+     * RAW - Do we really need to check arguments once again?
+     * These have already been checked in H5FD_subfiling_write_vector (see below)!
+     */
+    if (!file_ptr)
+        HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "file pointer cannot be NULL")
+
+    if ((!types) && (count > 0))
+        HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "types parameter can't be NULL if count is positive")
+
+    if ((!addrs) && (count > 0))
+        HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "addrs parameter can't be NULL if count is positive")
+
+    if ((!sizes) && (count > 0))
+        HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "sizes parameter can't be NULL if count is positive")
+
+    if ((!bufs) && (count > 0))
+        HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "bufs parameter can't be NULL if count is positive")
+
+    /* Get the default dataset transfer property list if the user didn't provide
+     * one */
+    if (H5P_DEFAULT == dxpl_id) {
+        dxpl_id = H5P_DATASET_XFER_DEFAULT;
+    }
+    else {
+        if (TRUE != H5P_isa_class(dxpl_id, H5P_DATASET_XFER))
+            HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a data transfer property list")
+    }
+    /* Call the subfiling IOC write */
+    if (file_ptr->fa.require_ioc) {
+
+        hbool_t    extend_sizes = FALSE;
+        hbool_t    extend_types = FALSE;
+        int        k;
+        size_t     size;
+        H5FD_mem_t type;
+        haddr_t    eoa;
+
+        HDassert((count == 0) || (sizes[0] != 0));
+        HDassert((count == 0) || (types[0] != H5FD_MEM_NOLIST));
+
+        /* Note that the following code does not let the sub-filing VFD participate
+         * in collective calls when there is no data to write.
This is not an issue + * now, as we don't do anything special with collective operations. However + * this needs to be fixed. + */ + for (k = 0; k < (int)count; k++) { + + if (!extend_sizes) { + + if (sizes[k] == 0) { + + extend_sizes = TRUE; + size = sizes[k - 1]; + } + else { + + size = sizes[k]; + } + } + + if (!extend_types) { + + if (types[k] == H5FD_MEM_NOLIST) { + + extend_types = TRUE; + type = types[k - 1]; + } + else { + + type = types[k]; + } + } + + if (HADDR_UNDEF == (eoa = H5FD__subfiling_get_eoa(_file, type))) + HGOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "driver get_eoa request failed") + + if ((addrs[k] + size) > eoa) + + HGOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, FAIL, + "addr overflow, addrs[%d] = %llu, sizes[%d] = %llu, eoa = %llu", (int)k, + (unsigned long long)(addrs[k]), (int)k, (unsigned long long)size, + (unsigned long long)eoa) + + if (H5FD__subfiling_write(_file, type, dxpl_id, addrs[k], size, bufs[k]) != SUCCEED) + HGOTO_ERROR(H5E_VFL, H5E_WRITEERROR, FAIL, "file vector write request failed") + } + } + else { + /* sec2 driver.. + * Call the subfiling 'direct write' version + * of subfiling. + */ + if (H5FD_write_vector(_file, count, types, addrs, sizes, bufs) != SUCCEED) + HGOTO_ERROR(H5E_VFL, H5E_WRITEERROR, FAIL, "file vector write request failed") + } +done: + FUNC_LEAVE_NOAPI(ret_value) + +} /* end H5FDsubfile__write_vector() */ + +/*------------------------------------------------------------------------- + * Function: H5FD_subfiling_truncate + * + * Purpose: Makes sure that the true file size is the same as + * the end-of-allocation. + * + * Return: SUCCEED/FAIL + * + * Programmer: Richard Warren + * + *------------------------------------------------------------------------- + */ +static herr_t +H5FD__subfiling_truncate(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, hbool_t H5_ATTR_UNUSED closing) +{ + H5FD_subfiling_t *file = (H5FD_subfiling_t *)_file; + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_NOAPI_NOINIT + + HDassert(file); + + /* Extend the file to make sure it's large enough */ + if (!H5F_addr_eq(file->eoa, file->eof)) { + + /* Update the eof value */ + file->eof = file->eoa; + + /* Reset last file I/O information */ + file->pos = HADDR_UNDEF; + file->op = OP_UNKNOWN; + } /* end if */ + + /* truncate sub-files */ + /* This is a hack. We should be doing the truncate of the sub-files via calls to + * H5FD_truncate() with the IOC. However, that system is messed up at present. + * thus the following hack. + * JRM -- 12/18/21 + */ +#if 1 /* JRM */ + if (H5FD__subfiling__truncate_sub_files(file->eof, file->fa.common.context_id) < 0) + HGOTO_ERROR(H5E_VFL, H5E_CANTUPDATE, FAIL, "sub-file truncate request failed") +#endif /* JRM */ + +done: + + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5FD_subfiling_truncate() */ + +/*------------------------------------------------------------------------- + * Function: H5FD_subfiling_lock + * + * Purpose: To place an advisory lock on a file. 
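+ *              (When the IOC is in use, subfiling currently has no locking
+ *              support and just says so on stdout; otherwise the request is
+ *              delegated to the underlying file via H5FD_lock().)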
+ *              The lock type to apply depends on the parameter "rw":
+ *                      TRUE--opens for write: an exclusive lock
+ *                      FALSE--opens for read: a shared lock
+ *
+ * Return:      SUCCEED/FAIL
+ *
+ * Programmer:  Vailin Choi; May 2013
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5FD__subfiling_lock(H5FD_t *_file, hbool_t rw)
+{
+    H5FD_subfiling_t *file      = (H5FD_subfiling_t *)_file; /* VFD file struct */
+    herr_t            ret_value = SUCCEED;                   /* Return value */
+
+    FUNC_ENTER_NOAPI_NOINIT
+
+    HDassert(file);
+    if (file->fa.require_ioc)
+        puts("Subfiling driver doesn't support file locking");
+    else {
+        if (H5FD_lock(file->sf_file, rw) < 0)
+            HSYS_GOTO_ERROR(H5E_FILE, H5E_BADFILE, FAIL, "unable to lock file")
+    } /* end if */
+done:
+    FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5FD_subfiling_lock() */
+
+/*-------------------------------------------------------------------------
+ * Function:    H5FD_subfiling_unlock
+ *
+ * Purpose:     To remove the existing lock on the file
+ *
+ * Return:      SUCCEED/FAIL
+ *
+ * Programmer:  Vailin Choi; May 2013
+ *
+ *-------------------------------------------------------------------------
+ */
+static herr_t
+H5FD__subfiling_unlock(H5FD_t *_file)
+{
+    H5FD_subfiling_t *file      = (H5FD_subfiling_t *)_file; /* VFD file struct */
+    herr_t            ret_value = SUCCEED;                   /* Return value */
+
+    FUNC_ENTER_NOAPI_NOINIT
+
+    HDassert(file);
+
+    if (H5FD_unlock(file->sf_file) < 0)
+        HSYS_GOTO_ERROR(H5E_FILE, H5E_BADFILE, FAIL, "unable to unlock file")
+
+done:
+    FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5FD_subfiling_unlock() */
+
+/*-------------------------------------------------------------------------
+ * Function:    H5FD__get_file_ino
+ *
+ * Purpose:     Given a filename input, we HDstat the file to retrieve
+ *              the inode value.  This was principally used for the VOL
+ *              implementation of subfiling.
+ *
+ * Return:      SUCCEED/FAIL
+ *
+ * Programmer:  Richard Warren
+ *
+ *-------------------------------------------------------------------------
+ */
+herr_t
+H5FD__get_file_ino(const char *name, uint64_t *st_ino)
+{
+    herr_t    ret_value = SUCCEED; /* Return value */
+    h5_stat_t sb;
+
+    FUNC_ENTER_PACKAGE
+
+    if (HDstat(name, &sb) < 0)
+        HSYS_GOTO_ERROR(H5E_FILE, H5E_BADFILE, FAIL, "unable to stat file")
+
+    *st_ino = sb.st_ino;
+
+done:
+
+    FUNC_LEAVE_NOAPI(ret_value)
+} /* end H5FD__get_file_ino() */
+
+/*-------------------------------------------------------------------------
+ * Function:    H5FD__subfiling_ctl
+ *
+ * Purpose:     Subfiling version of the ctl callback.
+ *
+ *              The desired operation is specified by the op_code
+ *              parameter.
+ *
+ *              The flags parameter controls management of op_codes that
+ *              are unknown to the callback.
+ *
+ *              The input and output parameters allow op_code specific
+ *              input and output.
+ *
+ *              At present, the supported op codes are:
+ *
+ *                  H5FD_CTL__GET_MPI_COMMUNICATOR_OPCODE
+ *                  H5FD_CTL__GET_MPI_RANK_OPCODE
+ *                  H5FD_CTL__GET_MPI_SIZE_OPCODE
+ *
+ *              Note that these opcodes must be supported by all VFDs that
+ *              support MPI.
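+ *
+ *              Example (an illustrative sketch, not a normative part of
+ *              this API): fetching the MPI rank through the public ctl
+ *              interface:
+ *
+ *                  int  mpi_rank = -1;
+ *                  int *out_ptr  = &mpi_rank;
+ *
+ *                  H5FDctl(file, H5FD_CTL__GET_MPI_RANK_OPCODE,
+ *                          H5FD_CTL__FAIL_IF_UNKNOWN_FLAG,
+ *                          NULL, (void **)&out_ptr);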
+ * + * Return: Non-negative on success/Negative on failure + * + * Programmer: JRM -- 8/3/21 + * + *------------------------------------------------------------------------- + */ +static herr_t +H5FD__subfiling_ctl(H5FD_t *_file, uint64_t op_code, uint64_t flags, const void H5_ATTR_UNUSED *input, + void **output) +{ + H5FD_subfiling_t *file = (H5FD_subfiling_t *)_file; + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_NOAPI(FAIL) + + /* Sanity checks */ + HDassert(file); + HDassert(H5FD_SUBFILING == file->pub.driver_id); + + switch (op_code) { + + case H5FD_CTL__GET_MPI_COMMUNICATOR_OPCODE: + HDassert(output); +#if 0 /* JRM */ /* remove eventually */ + if (*output == NULL) { + HDfprintf(stdout, + "H5FD__subfiling_ctl:H5FD_CTL__GET_MPI_COMMUNICATOR_OPCODE: *output is NULL\n"); + } +#endif /* JRM */ /* remove eventually */ + HDassert(*output); + **((MPI_Comm **)output) = file->comm; + break; + + case H5FD_CTL__GET_MPI_RANK_OPCODE: + HDassert(output); + HDassert(*output); +#if 0 /* JRM */ /* remove eventually */ + HDfprintf(stdout, "\nH5FD__subfiling_ctl: rank requested. rank = %d\n", (int)(file->mpi_rank)); + HDfflush(stdout); +#endif /* JRM */ /* remove eventually */ + **((int **)output) = file->mpi_rank; + break; + + case H5FD_CTL__GET_MPI_SIZE_OPCODE: + HDassert(output); + HDassert(*output); +#if 0 /* JRM */ /* remove eventually */ + HDfprintf(stdout, "\nH5FD__subfiling_ctl: size requested. size = %d\n", (int)(file->mpi_size)); + HDfflush(stdout); +#endif /* JRM */ /* remove eventually */ + **((int **)output) = file->mpi_size; + break; + + default: /* unknown op code */ + if (flags & H5FD_CTL__FAIL_IF_UNKNOWN_FLAG) { + + HGOTO_ERROR(H5E_VFL, H5E_FCNTL, FAIL, "unknown op_code and fail if unknown") + } + break; + } + +done: + + FUNC_LEAVE_NOAPI(ret_value) + +} /* end H5FD__subfiling_ctl() */ + +static herr_t +create__simple_vector(hid_t H5_ATTR_UNUSED file_space_id, void *memDataBuf, haddr_t addrBase, + hssize_t elements, size_t type_extent, hssize_t *vlen, haddr_t **_offsets, + hssize_t **_blocklens, void ***_bufs) +{ + haddr_t * offsets = *_offsets; + hssize_t *blocklens = *_blocklens; + void ** bufs = *_bufs; + void * nextBuf = memDataBuf; + + assert(vlen); + assert(_offsets); + assert(_blocklens); + assert(_bufs); + + if (*vlen < 0) { + offsets = (haddr_t *)malloc((sizeof(haddr_t))); + assert(offsets); + + blocklens = (hssize_t *)malloc((sizeof(hssize_t))); + assert(blocklens); + + bufs = (void **)malloc((sizeof(void **))); + assert(bufs); + + bufs[0] = nextBuf; + offsets[0] = addrBase; + blocklens[0] = (hssize_t)((hssize_t)elements * (hssize_t)type_extent); + + if (*vlen < 0) { + *_offsets = offsets; + *_blocklens = blocklens; + *_bufs = bufs; + } + *vlen = 1; + return 0; + } + return -1; +} + +static herr_t +create__vector_from_hyperslab(hid_t file_space_id, void *memDataBuf, haddr_t addrBase, size_t type_extent, + hssize_t *vlen, haddr_t **_offsets, hsize_t **_blocklens, void ***_bufs) +{ + herr_t ret_value = SUCCEED; + hssize_t k, n_blocks = H5Sget_select_hyper_nblocks(file_space_id); + + // USE THIS (when we get around to using calling here). 
+ // htri_t check = H5Sget_regular_hyperslab(file_space_id,) + char *nextBuf = memDataBuf; + + hsize_t stride[H5S_MAX_RANK]; + hsize_t count[H5S_MAX_RANK]; + + hsize_t *strides = stride; + hsize_t *counts = count; + + haddr_t *offsets = *_offsets; + hsize_t *blocklens = *_blocklens; + void ** bufs = *_bufs; + + assert(vlen); + assert(_offsets); + assert(_blocklens); + assert(_bufs); + assert(n_blocks > 0); + + if (n_blocks > H5S_MAX_RANK) { + /* Allocate a temp for the H5Sget_regular_hyperslab function call */ + if ((strides = (hsize_t *)malloc((size_t)n_blocks * sizeof(hsize_t))) == NULL) { + perror("unable to allocate storage for vector creation"); + return -1; + } + if ((counts = (hsize_t *)malloc((size_t)n_blocks * sizeof(hsize_t))) == NULL) { + perror("unable to allocate storage for vector creation"); + return -1; + } + } + + /* Allocate storage for the vector elements */ + if (*vlen < n_blocks) { + if (offsets) { + offsets = (haddr_t *)realloc(offsets, ((size_t)n_blocks * sizeof(haddr_t))); + } + else { + offsets = (haddr_t *)malloc(((size_t)n_blocks * sizeof(haddr_t))); + } + assert(offsets); + if (blocklens) { + blocklens = (hsize_t *)realloc(blocklens, ((size_t)n_blocks * sizeof(hsize_t))); + } + else { + blocklens = (hsize_t *)malloc(((size_t)n_blocks * sizeof(hsize_t))); + } + assert(blocklens); + if (bufs) { + bufs = (void **)realloc(bufs, ((size_t)n_blocks * sizeof(void **))); + } + else { + bufs = (void **)malloc(((size_t)n_blocks * sizeof(void **))); + } + assert(bufs); + *vlen = n_blocks; + } + /* Fill vector elements */ + if ((ret_value = + H5Sget_regular_hyperslab(file_space_id, (hsize_t *)offsets, strides, counts, blocklens)) < 0) { + puts("H5Sget_regular_hyperslab failed"); + return -1; + } + + for (k = 0; k < n_blocks; k++) { + bufs[k] = nextBuf; + offsets[k] *= type_extent; + offsets[k] += addrBase; + blocklens[k] *= type_extent; + nextBuf += (strides[k] * type_extent); + } + if (strides != stride) + free(strides); + if (counts != count) + free(counts); + + *_offsets = offsets; + *_blocklens = blocklens; + *_bufs = bufs; + + return ret_value; +} + +static herr_t +check__dims(int ndims, hsize_t *mem_dims, hsize_t *file_dims, int *diff_index) +{ + int i; + herr_t ret_value = SUCCEED; + for (i = 0; i < ndims; i++) { + if (mem_dims[i] != file_dims[i]) { + *diff_index = i; + return 0; + } + } + /* ndims +1 == no differences */ + *diff_index = i; + return ret_value; +} + +#ifdef UNUSED +static haddr_t +get__data_offset(int mpi_rank, int mpi_size, size_t dtype_extent, const H5S_t *mem_space, + const H5S_t *file_space) +{ + haddr_t this_base = 0; + return this_base; +} +#endif + +static haddr_t +get__base_offset(int mpi_rank, int mpi_size, size_t dtype_extent, hid_t mem_space_id, hid_t file_space_id) +{ + haddr_t this_base = 0; + int n_dims; + int is_simple = H5Sis_simple(file_space_id); + /* The 'is_simple' variable is actually a tri value type: + * -1 == failed + * 0 == NOT_SIMPLE + * 1 == SIMPLE + */ + if (is_simple > 0) { + n_dims = H5Sget_simple_extent_ndims(mem_space_id); + if (n_dims > 0) { + hsize_t mem_stride[n_dims]; + hsize_t mem_dims[n_dims]; + hsize_t file_stride[n_dims]; + hsize_t file_dims[n_dims]; + + if (H5Sget_simple_extent_dims(mem_space_id, mem_dims, mem_stride) < 0) + puts("H5Sget_simple_extent_dims returned an error"); + if (H5Sget_simple_extent_dims(file_space_id, file_dims, file_stride) < 0) + puts("H5Sget_simple_extent_dims returned an error"); + + if (n_dims == 1) { + if (mpi_rank == (mpi_size - 1)) + this_base = (file_dims[0] - mem_dims[0]) * 
dtype_extent; + else + this_base = (mem_dims[0] * dtype_extent * (hsize_t)mpi_rank); + } + else { + int diff_index = -1; + if (check__dims(n_dims, mem_dims, file_dims, &diff_index) < 0) + puts("check_dims returned an error"); + else { /* CHECK-THIS! What is the correct way? + * if the diff_index isn't 0, then we probably need + * to do the multiplication of the dimensions... + */ + this_base = (mem_dims[diff_index] * (hsize_t)mpi_rank); + } + } + } + } + + return this_base; +} + +herr_t +H5FD__dataset_write_contiguous(hid_t H5_ATTR_UNUSED h5_file_id, haddr_t dataset_baseAddr, size_t dtype_extent, + int mpi_rank, int mpi_size, void H5_ATTR_UNUSED *_dset, + hid_t H5_ATTR_UNUSED mem_type_id, hid_t mem_space_id, hid_t file_space_id, + hid_t H5_ATTR_UNUSED plist_id, const void *buf) +{ + herr_t ret_value = SUCCEED; /* Return value */ + hssize_t num_elem_file = (hssize_t)-1, num_elem_mem = (hssize_t)-1; + hssize_t s_dtype_extent = (hssize_t)dtype_extent; + H5S_sel_type sel_type; + hssize_t sf_vlen = -1; + + const H5S_t *mem_space; + const H5S_t *file_space; + + FUNC_ENTER_PACKAGE + + if ((num_elem_file = H5Sget_select_npoints(file_space_id)) < 0) + HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "can't get number of points in file selection") + + if ((num_elem_mem = H5Sget_select_npoints(mem_space_id)) < 0) + HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "can't get number of points in memory selection") + + if (num_elem_file != num_elem_mem) + HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, + "number of elements selected" + " in file and memory dataspaces is different") + + if (H5S_get_validated_dataspace(mem_space_id, &mem_space) < 0) + HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "could not get a validated dataspace from mem_space_id") + + if (H5S_get_validated_dataspace(file_space_id, &file_space) < 0) + HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "could not get a validated dataspace from file_space_id") + + if (num_elem_file > 0) { + sel_type = H5Sget_select_type(file_space_id); + switch (sel_type) { + case H5S_SEL_NONE: + printf("[%d] H5S_SEL_NONE\n", mpi_rank); + break; + case H5S_SEL_POINTS: { + haddr_t rank_baseAddr; + rank_baseAddr = + get__base_offset(mpi_rank, mpi_size, dtype_extent, mem_space_id, file_space_id); + rank_baseAddr += dataset_baseAddr; + printf("[%d] H5S_SEL_POINTS - num_elem_file: %lld: UNSUPPORTED (for now)\n", mpi_rank, + num_elem_file); + ret_value = -1; + goto done; + + break; + } + case H5S_SEL_HYPERSLABS: { + int status; + haddr_t rank_baseAddr; + rank_baseAddr = + get__base_offset(mpi_rank, mpi_size, dtype_extent, mem_space_id, file_space_id); + rank_baseAddr += dataset_baseAddr; + + if ((status = H5Sis_regular_hyperslab(file_space_id)) < 0) { + puts("H5Sis_regular_hyperslab returned an error"); + ret_value = -1; + goto done; + } + if (status > 0) { + if (sf_offsets == NULL) + sf_offsets = (haddr_t *)malloc(sizeof(haddr_t)); + if (sf_sizes == NULL) + sf_sizes = (hssize_t *)malloc(sizeof(hssize_t *)); + if (sf_bufs == NULL) + sf_bufs = (void **)malloc(sizeof(void *)); + sf_vlen = 1; + assert(sf_offsets); + assert(sf_sizes); + assert(sf_bufs); + + sf_offsets[0] = rank_baseAddr; + sf_sizes[0] = num_elem_mem * s_dtype_extent; + sf_bufs[0] = buf; + } + break; + } + case H5S_SEL_ALL: { + int status; + haddr_t rank_baseAddr; + rank_baseAddr = + get__base_offset(mpi_rank, mpi_size, dtype_extent, mem_space_id, file_space_id); + rank_baseAddr += dataset_baseAddr; + if (num_elem_mem > 0) { + status = H5Sis_simple(file_space_id); + if (status > 0) { + if (create__simple_vector(file_space_id, buf, 
rank_baseAddr, num_elem_mem, + dtype_extent, &sf_vlen, &sf_offsets, &sf_sizes, + &sf_bufs) < 0) { + puts("Unable to create simple vectors"); + goto done; + } + } + } + break; + } + default: + printf("[%d] UNSUPPORTED selection type\n", mpi_rank); + ret_value = -1; + } /* END switch (sel_type) */ + + } /* if (num_elem_file > 0) */ + +done: + + FUNC_LEAVE_NOAPI(ret_value) +} + +herr_t +H5FD__dataset_read_contiguous(hid_t H5_ATTR_UNUSED h5_file_id, haddr_t dataset_baseAddr, size_t dtype_extent, + int mpi_rank, int mpi_size, void H5_ATTR_UNUSED *_dset, + hid_t H5_ATTR_UNUSED mem_type_id, hid_t mem_space_id, hid_t file_space_id, + hid_t H5_ATTR_UNUSED plist_id, void *buf) +{ + herr_t ret_value = SUCCEED; /* Return value */ + hssize_t num_elem_file = -1, num_elem_mem = -1; + H5S_sel_type sel_type; + hssize_t sf_vlen = -1; + int status = 0; + + FUNC_ENTER_PACKAGE + if ((num_elem_file = H5Sget_select_npoints(file_space_id)) < 0) + HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "can't get number of points in file selection") + if ((num_elem_mem = H5Sget_select_npoints(mem_space_id)) < 0) + HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "can't get number of points in memory selection") + + if (num_elem_file != num_elem_mem) + HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, + "number of elements selected" + " in file and memory dataspaces is different") + + if (num_elem_file > 0) { + sel_type = H5Sget_select_type(file_space_id); + switch (sel_type) { + case H5S_SEL_NONE: + // printf("[%d] H5S_SEL_NONE\n", mpi_rank); + break; + case H5S_SEL_POINTS: { + haddr_t rank_baseAddr; + rank_baseAddr = + get__base_offset(mpi_rank, mpi_size, dtype_extent, mem_space_id, file_space_id); + rank_baseAddr += dataset_baseAddr; + // printf("[%d] H5S_SEL_POINTS - num_elem_file: %lld: UNSUPPORTED (for + // now)\n", mpi_rank, num_elem_file); + ret_value = -1; + goto done; + + break; + } + case H5S_SEL_HYPERSLABS: { + haddr_t rank_baseAddr; + const H5S_t *mem_space; + const H5S_t *file_space; + rank_baseAddr = + get__base_offset(mpi_rank, mpi_size, dtype_extent, mem_space_id, file_space_id); + rank_baseAddr += dataset_baseAddr; + if (H5S_get_validated_dataspace(mem_space_id, &mem_space) < 0) { + puts("could not get a validated dataspace from mem_space_id"); + } + if (H5S_get_validated_dataspace(file_space_id, &file_space) < 0) { + puts("could not get a validated dataspace from file_space_id"); + } + + if ((status = H5Sis_regular_hyperslab(file_space_id)) < 0) { + puts("H5Sis_regular_hyperslab returned an error"); + ret_value = -1; + goto done; + } + if (status > 0) { + if (sf_offsets == NULL) + sf_offsets = (haddr_t *)malloc(sizeof(haddr_t)); + if (sf_sizes == NULL) + sf_sizes = (hssize_t *)malloc(sizeof(hsize_t)); + if (sf_bufs == NULL) + sf_bufs = (void **)malloc(sizeof(void *)); + sf_vlen = 1; + assert(sf_offsets); + assert(sf_sizes); + assert(sf_bufs); + + sf_offsets[0] = rank_baseAddr; + sf_sizes[0] = (hssize_t)((hssize_t)num_elem_mem * (hssize_t)dtype_extent); + sf_bufs[0] = buf; + } + break; + } + case H5S_SEL_ALL: { + haddr_t rank_baseAddr; + rank_baseAddr = + get__base_offset(mpi_rank, mpi_size, dtype_extent, mem_space_id, file_space_id); + rank_baseAddr += dataset_baseAddr; + if (num_elem_mem > 0) { + status = H5Sis_simple(file_space_id); + if (status > 0) { + if (create__simple_vector(file_space_id, buf, rank_baseAddr, num_elem_mem, + dtype_extent, &sf_vlen, &sf_offsets, &sf_sizes, + &sf_bufs) < 0) { + puts("Unable to create simple vectors"); + goto done; + } + } + } + break; + } + default: + printf("[%d] UNSUPPORTED selection 
type\n", mpi_rank); + ret_value = -1; + } /* END switch (sel_type) */ + + } /* if (num_elem_file > 0) */ + +done: + + FUNC_LEAVE_NOAPI(ret_value) +} + +#if 0 /* JRM */ /* delete if all goes well */ +static int H5FD__subfiling_mpi_rank(const H5FD_t *_file) { + const H5FD_subfiling_t *file = (const H5FD_subfiling_t *)_file; + + FUNC_ENTER_STATIC_NOERR + + /* Sanity checks */ + HDassert(file); + + FUNC_LEAVE_NOAPI(file->mpi_rank) +} /* end H5FD__subfiling_mpi_rank() */ + +/*------------------------------------------------------------------------- + * Function: H5FD_subfiling_mpi_size + * + * Purpose: Returns the number of MPI processes + * + * Return: Success: non-negative + * Failure: negative + * + * Programmer: Quincey Koziol + * Thursday, May 16, 2002 + * + *------------------------------------------------------------------------- + */ +static int H5FD__subfiling_mpi_size(const H5FD_t *_file) { + const H5FD_subfiling_t *file = (const H5FD_subfiling_t *)_file; + + FUNC_ENTER_STATIC_NOERR + + /* Sanity checks */ + HDassert(file); + + FUNC_LEAVE_NOAPI(file->mpi_size) +} /* end H5FD__subfiling_mpi_size() */ + +/*------------------------------------------------------------------------- + * Function: H5FD_subfiling_communicator + * + * Purpose: Returns the MPI communicator for the file. + * + * Return: Success: The communicator + * Failure: Can't fail + * + * Programmer: Richard Warren + * + *------------------------------------------------------------------------- + */ +static MPI_Comm H5FD__subfiling_communicator(const H5FD_t *_file) { + const H5FD_subfiling_t *file = (const H5FD_subfiling_t *)_file; + + FUNC_ENTER_STATIC_NOERR + + /* Sanity checks */ + HDassert(file); + + FUNC_LEAVE_NOAPI(file->comm) +} /* end H5FD__subfiling_communicator() */ + +#endif /* JRM */ /* delete if all goes well */ + +#if 0 /* JRM */ /* unused?? delete if so */ +/*------------------------------------------------------------------------- + * Function: H5FD_subfiling_get_info + * + * Purpose: Returns the file info of SUBFILING file driver. + * + * Returns: Non-negative if succeed or negative if fails. + * + * Programmer: John Mainzer + * April 4, 2017 + * + *------------------------------------------------------------------------- + */ +static herr_t H5FD__subfiling_get_info(H5FD_t *_file, void **mpi_info) { + H5FD_subfiling_t *file = (H5FD_subfiling_t *)_file; + herr_t ret_value = SUCCEED; + + FUNC_ENTER_STATIC + + if (!mpi_info) + HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "mpi info not valid") + + *mpi_info = &(file->info); + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* H5FD__subfiling_get_info() */ + +#endif /* JRM */ + +void +manage_client_logfile(int H5_ATTR_UNUSED client_rank, int H5_ATTR_UNUSED flag_value) +{ +#ifndef NDEBUG + if (flag_value) { + char logname[64]; + sprintf(logname, "sf_client_%d.log", client_rank); + client_log = fopen(logname, "a+"); + } + else if (client_log) { + fclose(client_log); + client_log = 0; + } +#endif + return; +} diff --git a/src/H5FDsubfiling.h b/src/H5FDsubfiling.h new file mode 100644 index 00000000000..5b17d6e9bc7 --- /dev/null +++ b/src/H5FDsubfiling.h @@ -0,0 +1,281 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Copyright by The HDF Group. * + * Copyright by the Board of Trustees of the University of Illinois. * + * All rights reserved. * + * * + * This file is part of HDF5. 
The full HDF5 copyright notice, including * + * terms governing use, modification, and redistribution, is contained in * + * the COPYING file, which can be found at the root of the source code * + * distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases. * + * If you do not have access to either file, you may request a copy from * + * help@hdfgroup.org. * + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +/* + * Purpose: The public header file for the subfiling driver. + */ +#ifndef H5FDsubfiling_H +#define H5FDsubfiling_H + +#define H5FD_SUBFILING (H5FD_subfiling_init()) +#define H5FD_SUBFILING_VALUE H5_VFD_SUBFILING + +#if 1 /* JRM */ /* For now, H5FDsubfiling_priv.h needs mercury. Since the code that needs it will \ + * move to its own header, just hack it for now. \ + */ +#include "mercury_thread.h" +#include "mercury_thread_mutex.h" +#include "mercury_thread_pool.h" +#endif /* JRM */ + +#include "H5FDsubfiling_priv.h" + +#ifndef H5FD_SUBFILING_FAPL_T_MAGIC +#define H5FD_CURR_SUBFILING_FAPL_T_VERSION 1 +#define H5FD_SUBFILING_FAPL_T_MAGIC 0xFED01331 +#endif + +/**************************************************************************** + * + * Structure: H5FD_subfiling_fapl_t + * + * Purpose: + * + * H5FD_subfiling_fapl_t is a public structure that is used to pass + * subfiling configuration data to the appropriate subfiling VFD via + * the FAPL. A pointer to an instance of this structure is a parameter + * to H5Pset_fapl_subfiling() and H5Pget_fapl_subfiling(). + * + * `magic` (uint32_t) + * + * Magic is a somewhat unique number which distinguishes this VFD from + * other VFDs. Used in combination with a version number, we can + * validate a user-generated file access property list (fapl). + * This field should be set to H5FD_SUBFILING_FAPL_T_MAGIC. + * + * `version` (uint32_t) + * + * Version number of the H5FD_subfiling_fapl_t structure. Any instance + * passed to the above calls must have a recognized version number, or + * an error will be flagged. + * + * This field should be set to H5FD_CURR_SUBFILING_FAPL_T_VERSION. + * + *** IO Concentrator Info *** + *** These fields will be replicated in the stacked IOC VFD which + *** provides the extended support for aggregating reads and writes + *** and allows global file access to node-local storage containers. + * + * `stripe_count` (int32_t) + * + * The integer value which identifies the total number of + * subfiles that have been algorithmically selected to + * contain the segments of raw data which make up an HDF5 + * file. This value is used to implement the RAID-0 functionality + * when reading or writing datasets. + * + * `stripe_depth` (int64_t) + * + * The stripe depth defines a limit on the maximum number of contiguous + * bytes that can be read or written in a single operation on any + * selected subfile. Larger IO operations can exceed this limit + * by utilizing MPI derived types to construct an IO request which + * gathers additional data segments from memory for the IO request. + * + * `ioc_selection` (enum io_selection datatype) + * + * The io_selection_t defines a specific algorithm by which IO + * concentrators (IOCs) and sub-files are identified. The available + * algorithms are: SELECT_IOC_ONE_PER_NODE, SELECT_IOC_EVERY_NTH_RANK, + * SELECT_IOC_WITH_CONFIG, and SELECT_IOC_TOTAL. + * + *** STACKING and other VFD support + *** i.e.
FAPL caching + *** + * + * `ioc_fapl_id` (hid_t) + * + * A valid file access property list (fapl) is cached on each + * process and thus enables selection of an alternative provider + * for subsequent file operations. + * By default, Sub-filing employs an additional support VFD that + * provides file IO proxy capabilities to all MPI ranks in a + * distributed parallel application. This IO indirection + * thus allows applications to access all sub-files even when + * these are node-local and thus not directly + * accessible to remote ranks. + * + *** Subfiling file Info + * + * `subfile_dir` char[] + * + * A file directory name where subfiling files should be + * placed. Under normal circumstances, this directory name + * should match the directory path of the user-defined HDF5 + * file. + * + * `subfile_path` char[] + * + * The full pathname of the user HDF5 file. + * + +WARNING -- this code is commented out + +#define H5FD_SUBFILING_PATH_MAX 4096 + +typedef struct config_common_t { + uint32_t magic; + uint32_t version; + int32_t stripe_count; + int64_t stripe_depth; + ioc_selection_t ioc_selection; + hid_t ioc_fapl_id; + char subfile_dir[H5FD_SUBFILING_PATH_MAX +1]; + char subfile_path[H5FD_SUBFILING_PATH_MAX +1]; + char h5_filename[H5FD_SUBFILING_PATH_MAX +1]; +} config_common_t; + + ****************************************************************************/ + +/* + * In addition to the common configuration fields, we can have + * VFD specific fields. Here's one for the subfiling VFD. + * + * `require_ioc` (hbool_t) + * + * Require_IOC is a boolean flag with a default value of TRUE. + * This flag indicates that the stacked H5FDioc VFD should be + * employed for sub-filing operations. The default flag can be + * overridden with an environment variable: H5_REQUIRE_IOC=0 + * + */ + +//! +/** + * Configuration struct for H5Pget_fapl_subfiling() / H5Pset_fapl_subfiling() + */ +typedef struct H5FD_subfiling_config_t { + config_common_t common; + hbool_t require_ioc; +} H5FD_subfiling_config_t; +//! + +#ifdef __cplusplus +extern "C" { +#endif + +extern FILE *sf_logfile; +extern FILE *client_log; + +H5_DLL hid_t H5FD_subfiling_init(void); +/** + * \ingroup FAPL + * + * \brief Queries subfiling file driver properties + * + * \fapl_id + * \param[out] config_out The subfiling fapl data. + * + * \returns \herr_t + * + * \details H5Pget_fapl_subfiling() queries the #H5FD_SUBFILING driver properties as set + * by H5Pset_fapl_subfiling(). + * + * \since 1.14.0 + * + */ +H5_DLL herr_t H5Pget_fapl_subfiling(hid_t fapl_id, H5FD_subfiling_config_t *config_out); +/** + * \ingroup FAPL + * + * \brief Modifies the file access property list to use the #H5FD_SUBFILING driver + * + * \fapl_id + * \param[in] vfd_config #H5FD_SUBFILING driver specific properties. If NULL, then + * the IO concentrator VFD will be used. + * \returns \herr_t + * + * \details H5Pset_fapl_subfiling() modifies the file access property list to use the + * #H5FD_SUBFILING driver. + * + * \todo Expand details!
+ * + * \since 1.14.0 + * + */ +H5_DLL herr_t H5Pset_fapl_subfiling(hid_t fapl_id, H5FD_subfiling_config_t *vfd_config); +H5_DLL herr_t H5FD__get_file_ino(const char *name, uint64_t *st_ino); +H5_DLL char * H5FD__get_file_directory(void *h5file); +H5_DLL herr_t H5FD__dataset_write_contiguous(hid_t h5_file_id, haddr_t dataset_baseAddr, size_t dtype_extent, + int mpi_rank, int mpi_size, void *_dset, hid_t mem_type_id, + hid_t mem_space_id, hid_t file_space_id, hid_t plist_id, + const void *buf); +H5_DLL herr_t H5FD__dataset_read_contiguous(hid_t h5_file_id, haddr_t dataset_baseAddr, size_t dtype_extent, + int mpi_rank, int mpi_size, void *_dset, hid_t mem_type_id, + hid_t mem_space_id, hid_t file_space_id, hid_t plist_id, + void *buf); + +H5_DLL char *get_ioc_selection_criteria(ioc_selection_t *); +H5_DLL void *get__subfiling_object(int64_t object_id); +H5_DLL hid_t fid_map_to_context(uint64_t h5_fid); + +/* return arguments are vector of vectors - function return is the length + * (depth) of the sub vectors. Note that we don't need to include the + * MPI_Datatype return argument! + */ +H5_DLL int subfiling_open_file(sf_work_request_t *msg, int subfile_rank, int flags); + +H5_DLL int init__indep_io(void *_sf_context, size_t depth, int ioc_total, int64_t *sf_source_data_offset, + int64_t *sf_datasize, int64_t *f_offset, int *first_index, int *n_containers, + int64_t offset, int64_t elements, int dtype_extent); + +H5_DLL int H5FD__open_subfiles(void *_config_info, uint64_t inode_id, int flags); +H5_DLL int H5FD__close_subfiles(hid_t context_id); +H5_DLL int H5FD__read_independent(hid_t H5FD__fid, int64_t offset, int64_t elements, int dtype_extent, + void *data); +H5_DLL int H5FD__write_independent(hid_t H5FD__fid, int64_t offset, int64_t elements, int dtype_extent, + const void *data); +H5_DLL herr_t H5FD__read_vector(hid_t h5_fid, hssize_t count, haddr_t *addrs, hsize_t sizes[], + void *bufs[] /* in */); +H5_DLL herr_t H5FD__write_vector(hid_t h5_fid, hssize_t count, haddr_t *addrs, hsize_t sizes[], + void *bufs[] /* in */); +H5_DLL int H5FD__truncate(hid_t h5_fid, haddr_t addr); +H5_DLL int H5FD__shutdown_local_ioc(hid_t fid); +H5_DLL void manage_client_logfile(int client_rank, int flag_value); +#if 0 /* JRM */ +H5_DLL int initialize_ioc_threads(void *sf_context); +#endif /* JRM */ +H5_DLL herr_t H5FD__write_vector_internal(hid_t h5_fid, hssize_t count, haddr_t addrs[], size_t sizes[], + const void *bufs[] /* data_in */); + +H5_DLL herr_t H5FD__read_vector_internal(hid_t h5_fid, hssize_t count, haddr_t addrs[], size_t sizes[], + void *bufs[] /* data_out */); +#if 0 /* JRM */ +H5_DLL int queue_write_indep(sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm); +#else /* JRM */ +H5_DLL int queue_write_indep(sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm, + int counter); +#endif /* JRM */ + +H5_DLL int queue_read_indep(sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm); + +H5_DLL int sf_read_data(int fd, int64_t file_offset, void *data_buffer, int64_t data_size, int subfile_rank); + +H5_DLL int sf_write_data(int fd, int64_t file_offset, void *data_buffer, int64_t data_size, int subfile_rank); + +H5_DLL int sf_truncate(int fd, int64_t length, int subfile_rank); + +H5_DLL herr_t H5FD__subfiling__truncate_sub_files(int64_t logical_file_eof, hid_t context_id); + +H5_DLL int report_sf_eof(sf_work_request_t *msg, int subfile_rank, int source, MPI_Comm comm); + +H5_DLL herr_t H5FD__subfiling__get_real_eof(int64_t *logical_eof_ptr, hid_t context_id); 
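+
+/* Example usage of the subfiling FAPL (an illustrative sketch only, not a
+ * tested program -- it assumes an MPI application, the commented-out
+ * config_common_t layout documented above, and a hypothetical
+ * handle_error() helper):
+ *
+ *     H5FD_subfiling_config_t cfg;
+ *     hid_t                   fapl_id, file_id;
+ *
+ *     fapl_id = H5Pcreate(H5P_FILE_ACCESS);
+ *
+ *     cfg.common.magic         = H5FD_SUBFILING_FAPL_T_MAGIC;
+ *     cfg.common.version       = H5FD_CURR_SUBFILING_FAPL_T_VERSION;
+ *     cfg.common.stripe_count  = 2;
+ *     cfg.common.stripe_depth  = H5FD_DEFAULT_STRIPE_DEPTH;
+ *     cfg.common.ioc_selection = SELECT_IOC_ONE_PER_NODE;
+ *     cfg.common.ioc_fapl_id   = H5P_DEFAULT;
+ *     cfg.require_ioc          = TRUE;
+ *
+ *     if (H5Pset_fapl_subfiling(fapl_id, &cfg) < 0)
+ *         handle_error();
+ *
+ *     file_id = H5Fcreate("example.h5", H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id);
+ */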
+ +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/H5FDsubfiling_priv.h b/src/H5FDsubfiling_priv.h new file mode 100644 index 00000000000..b28b58e5487 --- /dev/null +++ b/src/H5FDsubfiling_priv.h @@ -0,0 +1,772 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Copyright by The HDF Group. * + * All rights reserved. * + * * + * This file is part of HDF5. The full HDF5 copyright notice, including * + * terms governing use, modification, and redistribution, is contained in * + * the COPYING file, which can be found at the root of the source code * + * distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases. * + * If you do not have access to either file, you may request a copy from * + * help@hdfgroup.org. * + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +/* + * Purpose: Private, shared definitions for the subfiling VFD and the + * stacked IOC (IO concentrator) VFD. + */ + +#ifndef H5FDsubfiling_priv_H +#define H5FDsubfiling_priv_H + +/********************/ +/* Standard Headers */ +/********************/ + +#include <assert.h> +#include <stdatomic.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> +#include <unistd.h> + +/**************/ +/* H5 Headers */ +/**************/ +#include "H5CXprivate.h" /* API Contexts */ +#include "H5Dprivate.h" /* Datasets */ +#include "H5Eprivate.h" /* Error handling */ +#include "H5Iprivate.h" /* IDs */ +#include "H5Ipublic.h" +#include "H5MMprivate.h" /* Memory management */ +#include "H5Pprivate.h" /* Property lists */ +#include "H5private.h" /* Generic Functions */ +#include "H5FDioc.h" + +#include "mpi.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/**************************************************************************** + * + * Structure: H5FD_subfiling_fapl_t + * + * Purpose: + * + * H5FD_subfiling_fapl_t is a public structure that is used to pass + * subfiling configuration data to the appropriate subfiling VFD via + * the FAPL. A pointer to an instance of this structure is a parameter + * to H5Pset_fapl_subfiling() and H5Pget_fapl_subfiling(). + * + * `magic` (uint32_t) + * + * Magic is a somewhat unique number which distinguishes this VFD from + * other VFDs. Used in combination with a version number, we can + * validate a user-generated file access property list (fapl). + * This field should be set to H5FD_SUBFILING_FAPL_T_MAGIC. + * + * `version` (uint32_t) + * + * Version number of the H5FD_subfiling_fapl_t structure. Any instance + * passed to the above calls must have a recognized version number, or + * an error will be flagged. + * + * This field should be set to H5FD_CURR_SUBFILING_FAPL_T_VERSION. + * + *** IO Concentrator Info *** + *** These fields will be replicated in the stacked IOC VFD which + *** provides the extended support for aggregating reads and writes + *** and allows global file access to node-local storage containers. + * + * `stripe_count` (int32_t) + * + * The integer value which identifies the total number of + * subfiles that have been algorithmically selected to + * contain the segments of raw data which make up an HDF5 + * file. This value is used to implement the RAID-0 functionality + * when reading or writing datasets. + * + * `stripe_depth` (int64_t) + * + * The stripe depth defines a limit on the maximum number of contiguous + * bytes that can be read or written in a single operation on any + * selected subfile.
Larger IO operations can exceed this limit + * by utilizing MPI derived types to construct an IO request which + * gathers additional data segments from memory for the IO request. + * + * `ioc_selection` (enum io_selection datatype) + * + * The io_selection_t defines a specific algorithm by which IO + * concentrators (IOCs) and sub-files are identified. The available + * algorithms are: SELECT_IOC_ONE_PER_NODE, SELECT_IOC_EVERY_NTH_RANK, + * SELECT_IOC_WITH_CONFIG, and SELECT_IOC_TOTAL. + * + *** STACKING and other VFD support + *** i.e. FAPL caching + *** + * + * `ioc_fapl_id` (hid_t) + * + * A valid file access property list (fapl) is cached on each + * process and thus enables selection of an alternative provider + * for subsequent file operations. + * By default, Sub-filing employs an additional support VFD that + * provides file IO proxy capabilities to all MPI ranks in a + * distributed parallel application. This IO indirection + * thus allows applications to access all sub-files even when + * these are node-local and thus not directly + * accessible to remote ranks. + * + *** Subfiling file Info + * + * `subfile_dir` char[] + * + * A file directory name where subfiling files should be + * placed. Under normal circumstances, this directory name + * should match the directory path of the user-defined HDF5 + * file. + * + * `subfile_path` char[] + * + * The full pathname of the user HDF5 file. + * + ****************************************************************************/ + +#ifndef H5FD_SUBFILING_FAPL_T_MAGIC +#define H5FD_CURR_SUBFILING_FAPL_T_VERSION 1 +#define H5FD_SUBFILING_FAPL_T_MAGIC 0xFED01331 +#endif + +#ifndef H5FD_IOC_FAPL_T_MAGIC +#define H5FD_CURR_IOC_FAPL_T_VERSION 1 +#define H5FD_IOC_FAPL_T_MAGIC 0xFED21331 +#endif + +#define DRIVER_INFO_MESSAGE_MAX_INFO 65536 +#define DRIVER_INFO_MESSAGE_MAX_LENGTH 65552 /* MAX_INFO + sizeof(info_header_t) */ + +#define K(n) ((n)*1024) +#define M(n) ((n) * (1024 * 1024)) +#define H5FD_DEFAULT_STRIPE_DEPTH M(32) + +typedef struct stat_record { + int64_t op_count; /* How many ops in total */ + double min; /* minimum (time) */ + double max; /* maximum (time) */ + double total; /* total (time) */ +} stat_record_t; + +typedef enum stat_category { /* Stat (OP) Categories */ + WRITE_STAT = 0, + WRITE_WAIT, + READ_STAT, + READ_WAIT, + FOPEN_STAT, + FCLOSE_STAT, + QUEUE_STAT, + TOTAL_STAT_COUNT +} stat_category_t; + +typedef struct _info_header { /* Header for a driver info message */ + uint8_t version; + uint8_t unused_1; + uint8_t unused_2; + uint8_t unused_3; + int32_t info_length; /* Actual info message length; CANNOT exceed 64k (65552) bytes */ + char vfd_key[8]; /* 's' 'u' 'b' 'f' 'i' 'l' 'i' 'n' */ +} info_header_t; + +/* The following definitions are used between H5FDsubfile_mpi.c + * and H5FDioc_threads.c + * + * MPI tags are 32 bits; we treat them as unsigned + * to allow the use of the available bits for RPC + * selections, i.e. a message from the VFD read or write functions + * to an IO Concentrator. The messages themselves are in general + * ONLY 3 int64_t values which define a) the data size to be read + * or written, b) the file offset where the data will be read from + * or stored, and c) the context_id, which allows the IO concentrator to + * locate the IO context for the new IO transaction.
+ * + * 0000 + * 0001 READ_OP (Independent) + * 0010 WRITE_OP (Independent) + * 0011 ///////// + * 0100 CLOSE_OP (Independent) + * ----- + * 1000 + * 1001 COLLECTIVE_READ + * 1010 COLLECTIVE_WRITE + * 1011 ///////// + * 1100 COLLECTIVE_CLOSE + * + * 31 28 24 20 16 12 8 4 0| + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * | | | ACKS | OP | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + */ + +/* Bit 3 SET indicates collectives */ +#define COLL_FUNC (0x1 << 3) + +#if 0 /* JRM */ /* original version */ + +#define ACK_PART (0x0acc << 8) +#define DATA_PART (0xd8da << 8) +#define READY (0xfeed << 8) +#define COMPLETED (0xfed1 << 8) + +#else /* JRM */ /* reduce size to make space for counters to disambiguate multiple concurrent requests from \ + same rank */ + +#define ACK_PART (0x01 << 8) +#define DATA_PART (0x02 << 8) +#define READY (0x04 << 8) +#define COMPLETED (0x08 << 8) + +#endif /* JRM */ /* reduce size to make space for counters to disambiguate multiple concurrent requests from \ + same rank */ + +#define READ_INDEP (READ_OP) +#define READ_COLL (COLL_FUNC | READ_OP) +#define WRITE_INDEP (WRITE_OP) +#define WRITE_COLL (COLL_FUNC | WRITE_OP) + +#define WRITE_INDEP_ACK (ACK_PART | WRITE_OP) +#define WRITE_INDEP_DATA (DATA_PART | WRITE_OP) + +#define READ_INDEP_DATA (DATA_PART | READ_OP) + +#define GET_EOF_COMPLETED (COMPLETED | GET_EOF_OP) + +#define SET_LOGGING (LOGGING_OP) + +#define INT32_MASK 0x07FFFFFFFFFFFFFFF + +/* The following are the basic 'op codes' used when + * constructing a RPC message for IO Concentrators. + * These are defined in the low 8 bits of the + * message. + * + * We currently ONLY use READ_OP and WRITE_OP + * + * Added TRUNC_OP 12/15/21 -- JRM + * + * Added GET_EOF_OP 12/28/21 -- JRM + */ +typedef enum io_ops { + READ_OP = 1, + WRITE_OP = 2, + OPEN_OP = 3, + CLOSE_OP = 4, + TRUNC_OP = 5, + GET_EOF_OP = 6, + FINI_OP = 8, + LOGGING_OP = 16 +} io_op_t; + +/* Here are the basic key values to be used when accessing + * the cache of stored topologies or contexts. + */ +typedef enum { + SF_BADID = (-1), + SF_TOPOLOGY = 1, + SF_CONTEXT = 2, + SF_NTYPES /* number of subfiling object types, MUST BE LAST */ +} sf_obj_type_t; + +/* Every application rank will record their MPI rank + * and hostid as a structure. These eventually get + * communicated to MPI rank zero(0) and sorted before + * being broadcast. The resulting sorted vector + * provides a basis for determining which MPI ranks + * will host an IO Concentrator (IOC), e.g. For + * default behavior, we choose the first vector entry + * associated with a "new" hostid. 
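+ *
+ * For example (an illustrative sketch, not an actual run): with four
+ * ranks spread over two hosts, the sorted {rank,hostid} vector might be
+ * {0,A}, {1,A}, {2,B}, {3,B}; the first entry seen for each new hostid
+ * -- here ranks 0 and 2 -- would then be selected to host an IOC.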
*/ +typedef struct { + long rank; + long hostid; +} layout_t; + +/* This typedef defines a fixed process layout which + * can be reused for any number of file open operations + */ +typedef struct app_layout_t { + long hostid; /* value returned by gethostid() */ + layout_t *layout; /* Vector of {rank,hostid} values */ + int * node_ranks; /* ranks extracted from sorted layout */ + int node_count; /* Total nodes (different hostids) */ + int node_index; /* My node: index into node_ranks */ + int local_peers; /* How many local peers on my node */ + int world_rank; /* My MPI rank */ + int world_size; /* Total number of MPI ranks */ +} app_layout_t; + +/* This typedef defines things related to IOC selections */ +typedef struct topology { + app_layout_t * app_layout; /* Pointer to our layout struct */ + bool rank_is_ioc; /* Indicates that we host an IOC */ + int subfile_rank; /* Valid only if rank_is_ioc */ + int n_io_concentrators; /* Number of IO concentrators */ + int * io_concentrator; /* Vector of ranks which are IOCs */ + int * subfile_fd; /* file descriptor (if IOC) */ + ioc_selection_t selection_type; /* Cache our IOC selection criteria */ +} sf_topology_t; + +typedef struct { + hid_t sf_context_id; /* Generated context ID which embeds the cache index */ + uint64_t h5_file_id; /* GUID (basically the inode value) */ + int sf_fid; /* value returned by open(file,..) */ + size_t sf_write_count; /* Statistics: write_count */ + size_t sf_read_count; /* Statistics: read_count */ + haddr_t sf_eof; /* File eof */ + int64_t sf_stripe_size; /* Stripe-depth */ + int64_t sf_blocksize_per_stripe; /* Stripe-depth X n_IOCs */ + int64_t sf_base_addr; /* For an IOC, our base address */ + MPI_Comm sf_msg_comm; /* MPI comm used to send RPC msg */ + MPI_Comm sf_data_comm; /* MPI comm used to move data */ + MPI_Comm sf_group_comm; /* Not used: for IOC collectives */ + MPI_Comm sf_intercomm; /* Not used: for msgs to all IOC */ + int sf_group_size; /* IOC count (in sf_group_comm) */ + int sf_group_rank; /* IOC rank (in sf_group_comm) */ + int sf_intercomm_root; /* Not used: for IOC comms */ + char * subfile_prefix; /* If subfiles are node-local */ + char * sf_filename; /* A generated subfile name */ + char * h5_filename; /* The user supplied file name */ + sf_topology_t *topology; /* pointer to our topology */ + +} subfiling_context_t; + +/* The following is a somewhat augmented input (by the IOC) which captures + * the basic RPC from a 'source'. The fields are filled out to allow + * an easy gathering of statistics by the IO Concentrator. + */ +typedef struct { + /* {Datasize, Offset, FileID} */ + int64_t header[3]; /* The basic RPC input plus */ + int tag; /* the supplied OPCODE tag */ + int source; /* Rank of who sent the message */ + int subfile_rank; /* The IOC rank */ + hid_t context_id; /* context to be used to complete */ + double start_time; /* the request, + time of receipt */ + /* from which we calc Time(queued) */ + void *buffer; /* for writes, we keep the buffer */ + /* around for awhile... */ + volatile int in_progress; /* Not used!
*/ + volatile int serialize; /* worker thread needs to wait while true */ + volatile int dependents; /* If current work item has dependents */ + int depend_id; /* work queue index of the dependent */ +} sf_work_request_t; + +typedef struct { /* Format of a context map entry */ + uint64_t h5_file_id; /* key value (linear search of the cache) */ + hid_t sf_context_id; /* The return value if matching h5_file_id */ +} file_map_to_context_t; + +/* + * CAUTION: + * Do we want or need this? + * Unfortunately, this structure is ONLY defined + * in the H5FDsec2.c source file... + * I'm only using it to access the file descriptor to + * allow me to get the inode info. + */ +typedef struct H5FD_sec2_t { + H5FD_t pub; /* public stuff, must be first */ + int fd; /* the filesystem file descriptor */ +} H5FD_sec2_t; + +extern int sf_verbose_flag; +extern atomic_int sf_work_pending; +extern atomic_int sf_file_open_count; +extern atomic_int sf_file_close_count; +extern atomic_int sf_shutdown_flag; +extern atomic_int sf_io_ops_pending; +extern atomic_int sf_ioc_ready; + +#if 1 /* JRM */ /* this belongs in an IOC private header file */ + +#define H5FD_IOC__COLLECT_STATS TRUE + +/**************************************************************************** + * + * IOC I/O Queue management macros: + * + * The following macros perform the necessary operations on the IOC I/O + * Queue, which is implemented as a doubly linked list of instances of + * H5FD_ioc_io_queue_entry_t. + * + * WARNING: q_ptr->q_mutex must be held when these macros are executed. + * + * At present, the necessary operations are append (insert an entry at the + * end of the queue), and delete (remove an entry from the queue). + * + * At least initially, all sanity checking is done with asserts, as the + * existing I/O concentrator code is not well integrated into the HDF5 + * error reporting system. This will have to be revisited for a production + * version, but it should be sufficient for now.
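+ *
+ * Illustrative call pattern (a sketch only -- q_ptr and entry_ptr are
+ * assumed to point at a properly initialized H5FD_ioc_io_queue_t and
+ * H5FD_ioc_io_queue_entry_t, per the structure definitions below):
+ *
+ *     hg_thread_mutex_lock(&(q_ptr->q_mutex));
+ *     H5FD_IOC__Q_APPEND(q_ptr, entry_ptr);
+ *     hg_thread_mutex_unlock(&(q_ptr->q_mutex));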
+ * + * JRM -- 11/2/21 + * + ****************************************************************************/ + +/* clang-format off */ + +#define H5FD_IOC__Q_APPEND(q_ptr, entry_ptr) \ +do { \ + HDassert(q_ptr); \ + HDassert((q_ptr)->magic == H5FD_IOC__IO_Q_MAGIC); \ + HDassert((((q_ptr)->q_len == 0) && ((q_ptr)->q_head == NULL) && ((q_ptr)->q_tail == NULL)) || \ + (((q_ptr)->q_len > 0) && ((q_ptr)->q_head != NULL) && ((q_ptr)->q_tail != NULL))); \ + HDassert(entry_ptr); \ + HDassert((entry_ptr)->magic == H5FD_IOC__IO_Q_ENTRY_MAGIC); \ + HDassert((entry_ptr)->next == NULL); \ + HDassert((entry_ptr)->prev == NULL); \ + HDassert((entry_ptr)->in_progress == FALSE); \ + \ + if ( ((q_ptr)->q_head) == NULL ) \ + { \ + ((q_ptr)->q_head) = (entry_ptr); \ + ((q_ptr)->q_tail) = (entry_ptr); \ + } \ + else \ + { \ + ((q_ptr)->q_tail)->next = (entry_ptr); \ + (entry_ptr)->prev = ((q_ptr)->q_tail); \ + ((q_ptr)->q_tail) = (entry_ptr); \ + } \ + ((q_ptr)->q_len)++; \ +} while ( FALSE ) /* H5FD_IOC__Q_APPEND() */ + +#define H5FD_IOC__Q_REMOVE(q_ptr, entry_ptr) \ +do { \ + HDassert(q_ptr); \ + HDassert((q_ptr)->magic == H5FD_IOC__IO_Q_MAGIC); \ + HDassert((((q_ptr)->q_len == 1) && ((q_ptr)->q_head ==((q_ptr)->q_tail)) && ((q_ptr)->q_head == (entry_ptr))) || \ + (((q_ptr)->q_len > 0) && ((q_ptr)->q_head != NULL) && ((q_ptr)->q_tail != NULL))); \ + HDassert(entry_ptr); \ + HDassert((entry_ptr)->magic == H5FD_IOC__IO_Q_ENTRY_MAGIC); \ + HDassert((((q_ptr)->q_len == 1) && ((entry_ptr)->next == NULL) && ((entry_ptr)->prev == NULL)) || \ + (((q_ptr)->q_len > 1) && (((entry_ptr)->next != NULL) || ((entry_ptr)->prev != NULL)))); \ + HDassert((entry_ptr)->in_progress == TRUE); \ + \ + { \ + if ( (((q_ptr)->q_head)) == (entry_ptr) ) \ + { \ + (((q_ptr)->q_head)) = (entry_ptr)->next; \ + if ( (((q_ptr)->q_head)) != NULL ) \ + (((q_ptr)->q_head))->prev = NULL; \ + } \ + else \ + { \ + (entry_ptr)->prev->next = (entry_ptr)->next; \ + } \ + if (((q_ptr)->q_tail) == (entry_ptr) ) \ + { \ + ((q_ptr)->q_tail) = (entry_ptr)->prev; \ + if ( ((q_ptr)->q_tail) != NULL ) \ + ((q_ptr)->q_tail)->next = NULL; \ + } \ + else \ + { \ + (entry_ptr)->next->prev = (entry_ptr)->prev; \ + } \ + (entry_ptr)->next = NULL; \ + (entry_ptr)->prev = NULL; \ + ((q_ptr)->q_len)--; \ + } \ +} while ( FALSE ) /* H5FD_IOC__Q_REMOVE() */ + +/* clang-format on */ + +/**************************************************************************** + * + * structure H5FD_ioc_io_queue_entry + * + * magic: Unsigned 32 bit integer always set to H5FD_IOC__IO_Q_ENTRY_MAGIC. + * This field is used to validate pointers to instances of + * H5FD_ioc_io_queue_entry_t. + * + * next: Next pointer in the doubly linked list used to implement + * the IOC I/O Queue. This field points to the next entry + * in the queue, or NULL if there is no next entry. + * + * prev: Prev pointer in the doubly linked list used to implement + * the IOC I/O Queue. This field points to the previous entry + * in the queue, or NULL if there is no previous entry. + * + * in_progress: Boolean flag that must be FALSE when the entry is inserted + * into the IOC I/O Queue, and set to TRUE when the entry is dispatched + * to the worker thread pool for execution. + * + * When in_progress is FALS, the entry is said to be pending. + * + * counter: uint32_t containing a serial number assigned to this IOC + * I/O Queue entry. Note that this will roll over on long + * computations, and thus is not in general unique. 
+ * + * The counter field is used to construct a tag to distinguish + * multiple concurrent I/O requests from a given rank, and thus + * this should not be a problem as long as there is sufficient + * time between roll overs. As only the lower bits of the counter + * are used in tag construction, this is more frequent than the + * size of the counter field would suggest -- albeit hopefully + * still infrequent enough. + * + * wk_req: Instance of sf_work_request_t. Replace with individual + * fields when convenient. + * + * + * Statistics: + * + * The following fields are only defined if H5FD_IOC__COLLECT_STATS is TRUE. + * They are intended to allow collection of basic statistics on the + * behaviour of the IOC I/O Queue for purposes of debugging and performance + * optimization. + * + * q_time: uint64_t containing the time the entry was placed on the + * IOC I/O Queue in usec after the UNIX epoch. + * + * This value is used to compute the queue wait time, and the + * total processing time for the entry. + * + * dispatch_time: uint64_t containing the time the entry is dispatched in + * usec after the UNIX epoch. This field is undefined if the + * entry is pending. + * + * This value is used to compute the execution time for the + * entry. + * + ****************************************************************************/ + +#define H5FD_IOC__IO_Q_ENTRY_MAGIC 0x1357 + +typedef struct H5FD_ioc_io_queue_entry { + + uint32_t magic; + struct H5FD_ioc_io_queue_entry *next; + struct H5FD_ioc_io_queue_entry *prev; + hbool_t in_progress; + uint32_t counter; + + /* rework these fields */ /* JRM */ + sf_work_request_t wk_req; + struct hg_thread_work thread_wk; + + /* statistics */ +#if H5FD_IOC__COLLECT_STATS + + uint64_t q_time; + uint64_t dispatch_time; + +#endif /* H5FD_IOC__COLLECT_STATS */ + +} H5FD_ioc_io_queue_entry_t; + +#if 0 /* JRM */ /* keep this copy for convenience for now */ +typedef struct { + /* {Datasize, Offset, FileID} */ + int64_t header[3]; /* The basic RPC input plus */ + int tag; /* the supplied OPCODE tag */ + int source; /* Rank of who sent the message */ + int subfile_rank; /* The IOC rank */ + hid_t context_id; /* context to be used to complete */ + double start_time; /* the request, + time of receipt */ + /* from which we calc Time(queued) */ + void *buffer; /* for writes, we keep the buffer */ + /* around for awhile... */ + volatile int in_progress; /* Not used! */ + volatile int serialize; /* worker thread needs to wait while true */ + volatile int dependents; /* If current work item has dependents */ + int depend_id; /* work queue index of the dependent */ +} sf_work_request_t; + +struct hg_thread_work { + hg_thread_func_t func; + void * args; + HG_QUEUE_ENTRY(hg_thread_work) entry; /* Internal */ +}; + +#endif /* JRM */ + +/**************************************************************************** + * + * structure H5FD_ioc_io_queue + * + * This is a temporary structure -- its fields should be moved to an I/O + * concentrator Catchall structure eventually. + * + * The fields of this structure support the io queue used to receive and + * sequence I/O requests for execution by the worker threads. The rules + * for sequencing are as follows: + * + * 1) Non-overlapping I/O requests must be fed to the worker threads in + * the order received, and may execute concurrently. + * + * 2) Overlapping read requests must be fed to the worker threads in + * the order received, but may execute concurrently.
+ * + * 3) If any pair of I/O requests overlap, and at least one is a write + * request, they must be executed in strict arrival order, and the + * first must complete before the second starts. + * + * Due to the strict ordering requirement in rule 3, entries must be + * inserted at the tail of the queue in receipt order, and retained on + * the queue until completed. Entries in the queue are marked pending + * when inserted on the queue, in progress when handed to a worker + * thread, and deleted from the queue when completed. + * + * The dispatch algorithm is as follows: + * + * 1) Set X equal to the element at the head of the queue. + * + * 2) If X is pending, and there exists no prior element (i.e. between X + * and the head of the queue) that intersects with X, goto 5). + * + * 3) If X is pending, X is a read, and all prior intersecting elements + * are reads, goto 5). + * + * 4) If X is in progress, or if any prior intersecting element is a + * write, or if X is a write, set X equal to its successor in the + * queue (i.e. the next element further down the queue from the head) + * and goto 2). If there is no next element, exit without dispatching + * any I/O request. + * + * 5) If we get to 5, X must be pending. Mark it in progress, and + * dispatch it. If the number of in progress entries is less than + * the number of worker threads, and X has a successor in the queue, + * set X equal to its successor, and goto 2). Otherwise exit without + * dispatching further I/O requests. + * + * Note that the above dispatch algorithm doesn't address collective + * I/O requests -- this should be OK for now, but it will have to be + * addressed prior to production release. + * + * On I/O request completion, worker threads must delete their assigned + * I/O requests from the queue, check to see if there are any pending + * requests, and trigger the dispatch algorithm if there are. + * + * The fields in the structure are discussed individually below. + * + * magic: Unsigned 32 bit integer always set to H5FD_IOC__IO_Q_MAGIC. + * This field is used to validate pointers to instances of + * H5FD_ioc_io_queue_t. + * + * q_head: Pointer to the head of the doubly linked list of entries in + * the I/O queue. + * + * This field is NULL if the I/O queue is empty. + * + * q_tail: Pointer to the tail of the doubly linked list of entries in + * the I/O queue. + * + * This field is NULL if the I/O queue is empty. + * + * num_pending: Number of I/O requests pending on the I/O queue. + * + * num_in_progress: Number of I/O requests in progress on the I/O queue. + * + * q_len: Number of I/O requests on the I/O queue. Observe that q_len + * must equal (num_pending + num_in_progress). + * + * req_counter: unsigned 32 bit integer used to provide a "unique" tag for + * each I/O request. This value is incremented by 1, and then + * passed to the worker thread where its lower bits are incorporated + * into the tag used to disambiguate multiple, concurrent I/O + * requests from a single rank. The value is 32 bits, as MPI tags + * are limited to 32 bits. The value is unsigned as it is expected + * to wrap around once its maximum value is reached. + * + * q_mutex: Mutex used to ensure that only one thread accesses the IOC I/O + * Queue at once. This mutex must be held to access or modify + * all fields of this structure. + * + * + * Statistics: + * + * The following fields are only defined if H5FD_IOC__COLLECT_STATS is TRUE.
+ * They are intended to allow collection of basic statistics on the + * behaviour of the IOC I/O Queue for purposes of debugging and performance + * optimization. + * + * max_q_len: Maximum number of requests residing on the IOC I/O Queue at + * any point in time in the current run. + * + * max_num_pending: Maximum number of pending requests residing on the IOC + * I/O Queue at any point in time in the current run. + * + * max_num_in_progress: Maximum number of in progress requests residing on + * the IOC I/O Queue at any point in time in the current run. + * + * ind_read_requests: Number of independent read requests received by the + * IOC to date. + * + * ind_write_requests: Number of independent write requests received by the + * IOC to date. + * + * truncate_requests: Number of truncate requests received by the IOC to + * date. + * + * get_eof_requests: Number of get EOF requests received by the IOC to date. + * + * requests_queued: Number of I/O requests received and placed on the IOC + * I/O queue. + * + * requests_dispatched: Number of I/O requests dispatched for execution by + * the worker threads. + * + * requests_completed: Number of I/O requests completed by the worker threads. + * Observe that on file close, requests_queued, requests_dispatched, + * and requests_completed should be equal. + * + ****************************************************************************/ + +#define H5FD_IOC__IO_Q_MAGIC 0x2468 + +typedef struct H5FD_ioc_io_queue { + + uint32_t magic; + H5FD_ioc_io_queue_entry_t *q_head; + H5FD_ioc_io_queue_entry_t *q_tail; + int32_t num_pending; + int32_t num_in_progress; + int32_t q_len; + uint32_t req_counter; + hg_thread_mutex_t q_mutex; + + /* statistics */ +#if H5FD_IOC__COLLECT_STATS + int32_t max_q_len; + int32_t max_num_pending; + int32_t max_num_in_progress; + int64_t ind_read_requests; + int64_t ind_write_requests; + int64_t truncate_requests; + int64_t get_eof_requests; + int64_t requests_queued; + int64_t requests_dispatched; + int64_t requests_completed; +#endif /* H5FD_IOC__COLLECT_STATS */ + +} H5FD_ioc_io_queue_t; + +H5_DLL void H5FD_ioc_take_down_thread_pool(void); + +H5_DLL H5FD_ioc_io_queue_entry_t *H5FD_ioc__alloc_io_q_entry(void); +H5_DLL void H5FD_ioc__complete_io_q_entry(H5FD_ioc_io_queue_entry_t *entry_ptr); +H5_DLL void H5FD_ioc__dispatch_elegible_io_q_entries(void); +H5_DLL void H5FD_ioc__free_io_q_entry(H5FD_ioc_io_queue_entry_t *q_entry_ptr); +H5_DLL void H5FD_ioc__queue_io_q_entry(sf_work_request_t *wk_req_ptr); + +#endif /* JRM */ + +#ifdef __cplusplus +} +#endif + +#endif /* H5FDsubfiling_priv_H */ diff --git a/src/H5Pfapl.c b/src/H5Pfapl.c index 2c3caa88151..71f2e850781 100644 --- a/src/H5Pfapl.c +++ b/src/H5Pfapl.c @@ -1185,7 +1185,7 @@ H5P_set_driver(H5P_genplist_t *plist, hid_t new_driver_id, const void *new_drive * * Purpose: Set the file driver (DRIVER_ID) for a file access * property list (PLIST_ID) and supply an optional - * struct containing the driver-specific properites + * struct containing the driver-specific properties * (DRIVER_INFO).
The driver properties will be copied into the * property list and the reference count on the driver will be * incremented, allowing the caller to close the driver ID but diff --git a/src/H5S.c b/src/H5S.c index 75d3399cbbf..009d8706b8f 100644 --- a/src/H5S.c +++ b/src/H5S.c @@ -230,6 +230,59 @@ H5S__close_cb(void *_space, void H5_ATTR_UNUSED **request) FUNC_LEAVE_NOAPI(ret_value) } /* end H5S__close_cb() */ +#if 1 /* JRM */ /* restore this function for now */ + +/*-------------------------------------------------------------------------- + NAME + H5S_get_validated_dataspace + PURPOSE + Get a validated pointer to an H5S_t + USAGE + herr_t H5S_get_validated_dataspace(dataspace_id, space) + hid_t space_id; IN: The ID of the dataspace + const H5S_t * space; OUT: A pointer to the dataspace + RETURNS + SUCCEED/FAIL + DESCRIPTION + Gets a pointer to a dataspace struct after validating it. The pointer + can be NULL (if the ID is H5S_ALL, for example). + GLOBAL VARIABLES + COMMENTS, BUGS, ASSUMPTIONS + EXAMPLES + REVISION LOG +--------------------------------------------------------------------------*/ +herr_t +H5S_get_validated_dataspace(hid_t space_id, const H5S_t **space) +{ + herr_t ret_value = SUCCEED; /* Return value */ + + FUNC_ENTER_NOAPI(FAIL) + + HDassert(space); + + /* Check for invalid ID */ + if (space_id < 0) + HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "invalid space_id (ID cannot be a negative number)") + + /* No special dataspace struct for H5S_ALL */ + if (H5S_ALL == space_id) + *space = NULL; + else { + /* Get the dataspace pointer */ + if (NULL == (*space = (const H5S_t *)H5I_object_verify(space_id, H5I_DATASPACE))) + HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "space_id is not a dataspace ID") + + /* Check for valid selection */ + if (H5S_SELECT_VALID(*space) != TRUE) + HGOTO_ERROR(H5E_DATASPACE, H5E_BADRANGE, FAIL, "selection + offset not within extent") + } + +done: + FUNC_LEAVE_NOAPI(ret_value) +} /* end H5S_get_validated_dataspace() */ + +#endif /* JRM */ /* restore this function for now */ + /*-------------------------------------------------------------------------- NAME H5S_create diff --git a/src/H5Spoint.c b/src/H5Spoint.c index 240b72261ac..bc667b1e1d7 100644 --- a/src/H5Spoint.c +++ b/src/H5Spoint.c @@ -1060,7 +1060,7 @@ H5S__point_get_version_enc_size(const H5S_t *space, uint32_t *version, uint8_t * hsize_t bounds_start[H5S_MAX_RANK]; /* Starting coordinate of bounding box */ hsize_t bounds_end[H5S_MAX_RANK]; /* Opposite coordinate of bounding box */ hsize_t max_size = 0; /* Maximum selection size */ - unsigned u; /* Local index veriable */ + unsigned u; /* Local index variable */ herr_t ret_value = SUCCEED; /* Return value */ FUNC_ENTER_STATIC diff --git a/src/H5Sprivate.h b/src/H5Sprivate.h index 8a1456385c2..e3c7119c678 100644 --- a/src/H5Sprivate.h +++ b/src/H5Sprivate.h @@ -217,6 +217,9 @@ H5_DLL htri_t H5S_set_extent(H5S_t *space, const hsize_t *size); H5_DLL herr_t H5S_set_extent_real(H5S_t *space, const hsize_t *size); H5_DLL herr_t H5S_set_extent_simple(H5S_t *space, unsigned rank, const hsize_t *dims, const hsize_t *max); H5_DLL H5S_t *H5S_create(H5S_class_t type); +#if 1 /* JRM */ /* restore this for now */ +H5_DLL herr_t H5S_get_validated_dataspace(hid_t space_id, const H5S_t **space /*out*/); +#endif /* JRM */ H5_DLL H5S_t *H5S_create_simple(unsigned rank, const hsize_t dims[/*rank*/], const hsize_t maxdims[/*rank*/]); H5_DLL herr_t H5S_set_version(H5F_t *f, H5S_t *ds); H5_DLL herr_t H5S_encode(H5S_t *obj, unsigned char **p, size_t *nalloc); diff --git
a/src/Makefile.am b/src/Makefile.am index c4023ae84c2..5374eb08722 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -119,6 +119,11 @@ if BUILD_PARALLEL_CONDITIONAL libhdf5_la_SOURCES += H5mpi.c H5ACmpio.c H5Cmpio.c H5Dmpio.c H5Fmpi.c H5FDmpi.c H5FDmpio.c H5Smpio.c endif +# Only compile the subfiling VFD if necessary +if SUBFILING_VFD_CONDITIONAL + libhdf5_la_SOURCES += H5FDsubfiling.c H5FDsubfile_int.c H5FDsubfile_mpi.c H5FDioc.c H5FDioc_threads.c +endif + # Only compile the direct VFD if necessary if DIRECT_VFD_CONDITIONAL libhdf5_la_SOURCES += H5FDdirect.c @@ -144,9 +149,10 @@ include_HEADERS = hdf5.h H5api_adpt.h H5overflow.h H5pubconf.h H5public.h H5vers H5Apublic.h H5ACpublic.h \ H5Cpublic.h H5Dpublic.h \ H5Epubgen.h H5Epublic.h H5ESpublic.h H5Fpublic.h \ - H5FDpublic.h H5FDcore.h H5FDdirect.h H5FDfamily.h H5FDhdfs.h \ + H5FDpublic.h H5FDcore.h H5FDdirect.h H5FDfamily.h H5FDhdfs.h H5FDioc.h \ H5FDlog.h H5FDmirror.h H5FDmpi.h H5FDmpio.h H5FDmulti.h H5FDros3.h \ - H5FDsec2.h H5FDsplitter.h H5FDstdio.h H5FDwindows.h \ + H5FDsec2.h H5FDsplitter.h H5FDsubfiling.h H5FDsubfiling_priv.h \ + H5FDstdio.h H5FDwindows.h \ H5Gpublic.h H5Ipublic.h H5Lpublic.h \ H5Mpublic.h H5MMpublic.h H5Opublic.h H5Ppublic.h \ H5PLextern.h H5PLpublic.h \ @@ -159,6 +165,21 @@ include_HEADERS = hdf5.h H5api_adpt.h H5overflow.h H5pubconf.h H5public.h H5vers include_HEADERS += H5ESdevelop.h H5FDdevelop.h H5Idevelop.h H5Ldevelop.h \ H5Tdevelop.h H5TSdevelop.h H5Zdevelop.h +if HAVE_MERCURY_CONDITIONAL + include_HEADERS += mercury/src/util/mercury_thread.h \ + mercury/src/util/mercury_thread_mutex.h mercury/src/util/mercury_thread_pool.h + + libhdf5_la_SOURCES += mercury/src/util/mercury_atomic_queue.c \ + mercury/src/util/mercury_dlog.c mercury/src/util/mercury_event.c \ + mercury/src/util/mercury_hash_table.c mercury/src/util/mercury_log.c \ + mercury/src/util/mercury_mem.c mercury/src/util/mercury_mem_pool.c \ + mercury/src/util/mercury_poll.c mercury/src/util/mercury_request.c \ + mercury/src/util/mercury_thread.c mercury/src/util/mercury_thread_condition.c \ + mercury/src/util/mercury_thread_pool.c mercury/src/util/mercury_thread_mutex.c \ + mercury/src/util/mercury_thread_rwlock.c mercury/src/util/mercury_thread_spin.c \ + mercury/src/util/mercury_util.c +endif + # install libhdf5.settings in lib directory settingsdir=$(libdir) settings_DATA=libhdf5.settings diff --git a/src/mercury/COPYING b/src/mercury/COPYING new file mode 100644 index 00000000000..42095c5e28e --- /dev/null +++ b/src/mercury/COPYING @@ -0,0 +1,39 @@ +Copyright (C) 2013-2020, Argonne National Laboratory, Department of Energy, + UChicago Argonne, LLC and The HDF Group. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted for any purpose (including commercial purposes) +provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, + this list of conditions, and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions, and the following disclaimer in the documentation + and/or materials provided with the distribution. + +3. In addition, redistributions of modified forms of the source or binary + code must carry prominent notices stating that the original code was + changed and the date of the change. + +4. 
All publications or advertising materials mentioning features or use of + this software are asked, but not required, to acknowledge that it was + developed by ANL / the university of Chicago / The HDF Group and credit + the contributors. + +5. Neither the name of ANL / the university of Chicago / The HDF Group, nor + the name of any Contributor may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/src/mercury/README.md b/src/mercury/README.md new file mode 100644 index 00000000000..0e7e6e58834 --- /dev/null +++ b/src/mercury/README.md @@ -0,0 +1,221 @@ +Mercury +======= +[![Build status][travis-ci-svg]][travis-ci-link] +[![Latest version][mercury-release-svg]][mercury-release-link] + + Mercury is an RPC framework specifically designed for use in HPC systems + that allows asynchronous transfer of parameters and execution requests, + as well as direct support of large data arguments. The network implementation + is abstracted, allowing easy porting to future systems and efficient use + of existing native transport mechanisms. Mercury's interface is generic + and allows any function call to be serialized. + + Please see the accompanying COPYING file for license details. + + Contributions and patches are welcomed but require a Contributor License + Agreement (CLA) to be filled out. Please contact us if you are interested + in contributing to Mercury by subscribing to the [mailing lists][mailing-lists]. + +Architectures supported +======================= + + Architectures supported by MPI implementations are generally supported by the + network abstraction layer. The OFI libfabric plugin as well as the SM plugin + are stable and provide the best performance in most workloads. Libfabric + providers currently supported are: `tcp`, `verbs`, `psm2`, `gni`. + MPI and BMI (tcp) plugins are still supported but gradually being moved as + deprecated, therefore should only be used as fallback methods. + The CCI plugin is deprecated and underlying CCI transport plugins + (`tcp`, `sm`, `verbs`, `gni`) are no longer supported. + + See the [plugin requirements](#plugin-requirements) section for + plugin requirement details. + +Documentation +============= + + Please see the documentation available on the mercury [website][documentation] + for a quick introduction to Mercury. + +Software requirements +===================== + + Compiling and running Mercury requires up-to-date versions of various + software packages. Beware that using excessively old versions of these + packages can cause indirect errors that are very difficult to track down. 
+ +Plugin requirements +------------------- + +To make use of the libfabric/OFI plugin, please refer to the libfabric build +instructions available on this [page][libfabric]. + +To make use of the native NA SM (shared-memory) plugin on Linux, +the cross-memory attach (CMA) feature introduced in kernel v3.2 is required. +The yama security module must also be configured to allow remote process memory +to be accessed (see this [page][yama]). On MacOS, code signing with inclusion of +the na_sm.plist file into the binary is currently required to allow process +memory to be accessed. + +To make use of the BMI plugin, the most convenient way is to install it through +spack or one can also do: + + git clone https://xgitlab.cels.anl.gov/sds/bmi.git && cd bmi + ./prepare && ./configure --enable-shared --enable-bmi-only + make && make install + +To make use of the MPI plugin, Mercury requires a _well-configured_ MPI +implementation (MPICH2 v1.4.1 or higher / OpenMPI v1.6 or higher) with +`MPI_THREAD_MULTIPLE` available on targets that will accept remote +connections. Processes that are _not_ accepting incoming connections are +_not_ required to have a multithreaded level of execution. + +To make use of the CCI plugin, please refer to the CCI build instructions +available on this [page][cci]. + +Optional requirements +--------------------- + +For optional automatic code generation features (which are used for generating +serialization and deserialization routines), the preprocessor subset of the +BOOST library must be included (Boost v1.48 or higher is recommended). +The library itself is therefore not necessary since only the header is used. +Mercury includes those headers if one does not have BOOST installed and +wants to make use of this feature. + +On Linux OpenPA v1.0.3 or higher is required (the version that is included +with MPICH can also be used) for systems that do not have `stdatomic.h` +(GCC version less than 4.9). + +Building +======== + +If you install the full sources, put the tarball in a directory where you +have permissions (e.g., your home directory) and unpack it: + + gzip -cd mercury-X.tar.gz | tar xvf - + + or + + bzip2 -dc mercury-X.tar.bz2 | tar xvf - + +Replace `'X'` with the version number of the package. + +(Optional) If you checked out the sources using git (without the `--recursive` +option) and want to build the testing suite (which requires the kwsys +submodule) or use checksums (which requires the mchecksum submodule), you need +to issue from the root of the source directory the following command: + + git submodule update --init + +Mercury makes use of the CMake build-system and requires that you do an +out-of-source build. In order to do that, you must create a new build +directory and run the `ccmake` command from it: + + cd mercury-X + mkdir build + cd build + ccmake .. (where ".." is the relative path to the mercury-X directory) + +Type `'c'` multiple times and choose suitable options. 
Recommended options are: + + BUILD_SHARED_LIBS ON (or OFF if the library you link + against requires static libraries) + BUILD_TESTING ON + Boost_INCLUDE_DIR /path/to/include/directory + CMAKE_INSTALL_PREFIX /path/to/install/directory + MERCURY_ENABLE_DEBUG ON/OFF + MERCURY_ENABLE_PARALLEL_TESTING ON/OFF + MERCURY_USE_BOOST_PP ON + MERCURY_USE_CHECKSUMS ON + MERCURY_USE_SYSTEM_BOOST ON/OFF + MERCURY_USE_SYSTEM_MCHECKSUM ON/OFF + MERCURY_USE_XDR OFF + NA_USE_BMI ON/OFF + NA_USE_MPI ON/OFF + NA_USE_CCI ON/OFF + NA_USE_OFI ON/OFF + NA_USE_SM ON/OFF + +Setting include directory and library paths may require you to toggle to +the advanced mode by typing `'t'`. Once you are done and do not see any +errors, type `'g'` to generate makefiles. Once you exit the CMake +configuration screen and are ready to build the targets, do: + + make + +(Optional) Verbose compile/build output: + +This is done by inserting `VERBOSE=1` in the `make` command. E.g.: + + make VERBOSE=1 + +Installing +========== + +Assuming that the `CMAKE_INSTALL_PREFIX` has been set (see previous step) +and that you have write permissions to the destination directory, do +from the build directory: + + make install + +Testing +======= + +Tests can be run to check that basic RPC functionality (requests and bulk +data transfers) is properly working. CTest is used to run the tests, +simply run from the build directory: + + ctest . + +(Optional) Verbose testing: + +This is done by inserting `-V` in the `ctest` command. E.g.: + + ctest -V . + +Extra verbose information can be displayed by inserting `-VV`. E.g.: + + ctest -VV . + +Some tests run with one server process and X client processes. To change the +number of client processes that are being used, the `MPIEXEC_MAX_NUMPROCS` +variable needs to be modified (toggle to advanced mode if you do not see +it). The default value is automatically detected by CMake based on the number +of cores that are available. +Note that you need to run `make` again after the makefile generation +to use the new value. + +FAQ +=== + +Below is a list of the most common questions. + +- _Q: Why am I getting undefined references to libfabric symbols?_ + + A: In rare occasions, multiple copies of the libfabric library are installed + on the same system. To make sure that you are using the correct copy of the + libfabric library, do: + + ldconfig -p | grep libfabric + + If the library returned is not the one that you would expect, make sure to + either set `LD_LIBRARY_PATH` or add an entry in your `/etc/ld.so.conf.d` + directory. + +- _Q: Is there any logging mechanism?_ + + A: To turn on error/warning/debug logs, the `HG_LOG_LEVEL` or + `HG_NA_LOG_LEVEL` environment variables can be set to either `error`, + `warning` or `debug` values. Note that for debugging output to be printed, + the CMake variable `MERCURY_ENABLE_DEBUG` must also be set at compile time. 
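+
+  For example, to enable debug output at runtime for a (hypothetical)
+  Mercury-based application:
+
+      HG_LOG_LEVEL=debug HG_NA_LOG_LEVEL=debug ./my_mercury_app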
+ +[mailing-lists]: http://mercury-hpc.github.io/help#mailing-lists +[documentation]: http://mercury-hpc.github.io/documentation/ +[cci]: http://cci-forum.com/?page_id=46 +[libfabric]: https://github.com/ofiwg/libfabric +[travis-ci-svg]: https://travis-ci.org/mercury-hpc/mercury.svg +[travis-ci-link]: https://travis-ci.org/mercury-hpc/mercury +[mercury-release-svg]: https://img.shields.io/github/release/mercury-hpc/mercury.svg +[mercury-release-link]: https://github.com/mercury-hpc/mercury/releases/latest +[yama]: https://www.kernel.org/doc/Documentation/security/Yama.txt diff --git a/src/mercury/include/mercury.h b/src/mercury/include/mercury.h new file mode 100644 index 00000000000..9f44012bac9 --- /dev/null +++ b/src/mercury/include/mercury.h @@ -0,0 +1,1060 @@ +/* + * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy, + * UChicago Argonne, LLC and The HDF Group. + * All rights reserved. + * + * The full copyright notice, including terms governing use, modification, + * and redistribution, is contained in the COPYING file that can be + * found at the root of the source code distribution tree. + */ + +#ifndef MERCURY_H +#define MERCURY_H + +#include "mercury_header.h" +#include "mercury_types.h" + +#include "mercury_core.h" + +/*************************************/ +/* Public Type and Struct Definition */ +/*************************************/ + +/* See mercury_types.h */ + +/*****************/ +/* Public Macros */ +/*****************/ + +/* See mercury_types.h */ + +/*********************/ +/* Public Prototypes */ +/*********************/ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Get Mercury version number. + * + * \param major [OUT] pointer to unsigned integer + * \param minor [OUT] pointer to unsigned integer + * \param patch [OUT] pointer to unsigned integer + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t HG_Version_get(unsigned int *major, unsigned int *minor, unsigned int *patch); + +/** + * Convert error return code to string (null terminated). + * + * \param errnum [IN] error return code + * + * \return String + */ +HG_PUBLIC const char *HG_Error_to_string(hg_return_t errnum); + +/** + * Initialize the Mercury layer. + * Must be finalized with HG_Finalize(). + * + * \param na_info_string [IN] host address with port number (e.g., + * "tcp://localhost:3344" or + * "bmi+tcp://localhost:3344") + * \param na_listen [IN] listen for incoming connections + * + * \return Pointer to HG class or NULL in case of failure + */ +HG_PUBLIC hg_class_t *HG_Init(const char *na_info_string, hg_bool_t na_listen); + +/** + * Initialize the Mercury layer with options provided by init_info. + * Must be finalized with HG_Finalize(). + * \remark HG_Init_opt() may become HG_Init() in the future. + * + * \param na_info_string [IN] host address with port number (e.g., + * "tcp://localhost:3344" or + * "bmi+tcp://localhost:3344") + * \param na_listen [IN] listen for incoming connections + * \param hg_init_info [IN] (Optional) HG init info, NULL if no info + * + * \return Pointer to HG class or NULL in case of failure + */ +HG_PUBLIC hg_class_t *HG_Init_opt(const char *na_info_string, hg_bool_t na_listen, + const struct hg_init_info *hg_init_info); + +/** + * Finalize the Mercury layer. 
+ *
+ * \param hg_class [IN] pointer to HG class
+ *
+ * \return HG_SUCCESS or corresponding HG error code
+ */
+HG_PUBLIC hg_return_t HG_Finalize(hg_class_t *hg_class);
+
+/**
+ * Clean up all temporary files that were created in previous HG instances.
+ * While temporary resources (e.g., tmp files) are cleaned up on a call
+ * to HG_Finalize(), this routine gives programs that terminated abnormally
+ * a chance to easily clean up those resources.
+ */
+HG_PUBLIC void HG_Cleanup(void);
+
+/**
+ * Set the log level for HG. That setting is valid for all HG classes.
+ *
+ * \param level [IN] level string, valid values are:
+ *                   "none", "error", "warning", "debug"
+ */
+HG_PUBLIC void HG_Set_log_level(const char *level);
+
+/**
+ * Set the log sub-system for HG. That setting is valid for all HG classes.
+ *
+ * \param subsys [IN] string of subsystems, format is:
+ *                    subsys1,subsys2,subsys3,etc
+ *                    a subsystem can be turned off, e.g.:
+ *                    ~subsys1
+ */
+HG_PUBLIC void HG_Set_log_subsys(const char *subsys);
+
+/**
+ * Obtain the name of the given class.
+ *
+ * \param hg_class [IN] pointer to HG class
+ *
+ * \return the name of the class, or NULL if not a valid class
+ */
+static HG_INLINE const char *HG_Class_get_name(const hg_class_t *hg_class);
+
+/**
+ * Obtain the protocol of the given class.
+ *
+ * \param hg_class [IN] pointer to HG class
+ *
+ * \return the name of the class's transport, or NULL if not a valid class
+ */
+static HG_INLINE const char *HG_Class_get_protocol(const hg_class_t *hg_class);
+
+/**
+ * Test whether class is listening or not.
+ *
+ * \param hg_class [IN] pointer to HG class
+ *
+ * \return HG_TRUE if listening or HG_FALSE if not, or not a valid class
+ */
+static HG_INLINE hg_bool_t HG_Class_is_listening(const hg_class_t *hg_class);
+
+/**
+ * Obtain the maximum eager size for sending RPC inputs, for a given class.
+ * NOTE: This doesn't currently work when using XDR encoding.
+ *
+ * \param hg_class [IN] pointer to HG class
+ *
+ * \return the maximum size, or 0 if hg_class is not a valid class or XDR is
+ * being used
+ */
+static HG_INLINE hg_size_t HG_Class_get_input_eager_size(const hg_class_t *hg_class);
+
+/**
+ * Obtain the maximum eager size for sending RPC outputs, for a given class.
+ * NOTE: This doesn't currently work when using XDR encoding.
+ *
+ * \param hg_class [IN] pointer to HG class
+ *
+ * \return the maximum size, or 0 if hg_class is not a valid class or XDR is
+ * being used
+ */
+static HG_INLINE hg_size_t HG_Class_get_output_eager_size(const hg_class_t *hg_class);
+
+/**
+ * Set offset used for serializing / deserializing input. This allows upper
+ * layers to manually define a reserved space that can be used for the
+ * definition of custom headers. The actual input is encoded / decoded
+ * using the defined offset. By default, no offset is set.
+ *
+ * \param hg_class [IN] pointer to HG class
+ * \param offset [IN] offset size
+ *
+ * \return HG_SUCCESS or corresponding HG error code
+ */
+static HG_INLINE hg_return_t HG_Class_set_input_offset(hg_class_t *hg_class, hg_size_t offset);
+
+/**
+ * Set offset used for serializing / deserializing output. This allows upper
+ * layers to manually define a reserved space that can be used for the
+ * definition of custom headers. The actual output is encoded / decoded
+ * using the defined offset. By default, no offset is set.
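+ *
+ * A sketch of reserving header space (offset of 64 bytes illustrative; the
+ * offset must not exceed HG_Class_get_output_eager_size()):
+ * \verbatim
+ * hg_return_t ret = HG_Class_set_output_offset(hg_class, 64);
+ * \endverbatim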
+ * + * \param hg_class [IN] pointer to HG class + * \param offset [IN] offset size + * + * \return HG_SUCCESS or corresponding HG error code + */ +static HG_INLINE hg_return_t HG_Class_set_output_offset(hg_class_t *hg_class, hg_size_t offset); + +/** + * Associate user data to class. When HG_Finalize() is called, + * free_callback (if defined) is called to free the associated data. + * + * \param hg_class [IN] pointer to HG class + * \param data [IN] pointer to user data + * \param free_callback [IN] pointer to function + * + * \return HG_SUCCESS or corresponding HG error code + */ +static HG_INLINE hg_return_t HG_Class_set_data(hg_class_t *hg_class, void *data, + void (*free_callback)(void *)); + +/** + * Retrieve previously associated data from a given class. + * + * \param hg_class [IN] pointer to HG class + * + * \return Pointer to user data or NULL if not set or any error has occurred + */ +static HG_INLINE void *HG_Class_get_data(const hg_class_t *hg_class); + +/** + * Set callback to be called on HG handle creation. Handles are created + * both on HG_Create() and HG_Context_create() calls. This allows upper layers + * to create and attach data to a handle (using HG_Set_data()) and later + * retrieve it using HG_Get_data(). + * + * \param hg_class [IN] pointer to HG class + * \param callback [IN] pointer to function callback + * \param arg [IN] pointer to data passed to callback + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t HG_Class_set_handle_create_callback(hg_class_t *hg_class, + hg_return_t (*callback)(hg_handle_t, void *), + void *arg); + +/** + * Create a new context. Must be destroyed by calling HG_Context_destroy(). + * + * \remark This routine is internally equivalent to: + * - HG_Core_context_create() + * - If listening + * - HG_Core_context_post() with repost set to HG_TRUE + * + * \param hg_class [IN] pointer to HG class + * + * \return Pointer to HG context or NULL in case of failure + */ +HG_PUBLIC hg_context_t *HG_Context_create(hg_class_t *hg_class); + +/** + * Create a new context with a user-defined context identifier. The context + * identifier can be used to route RPC requests to specific contexts by using + * HG_Set_target_id(). + * Context must be destroyed by calling HG_Context_destroy(). + * + * \remark This routine is internally equivalent to: + * - HG_Core_context_create_id() with specified context ID + * - If listening + * - HG_Core_context_post() with repost set to HG_TRUE + * + * \param hg_class [IN] pointer to HG class + * \param id [IN] user-defined context ID + * + * \return Pointer to HG context or NULL in case of failure + */ +HG_PUBLIC hg_context_t *HG_Context_create_id(hg_class_t *hg_class, hg_uint8_t id); + +/** + * Destroy a context created by HG_Context_create(). + * + * \param context [IN] pointer to HG context + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t HG_Context_destroy(hg_context_t *context); + +/** + * Retrieve the class used to create the given context. + * + * \param context [IN] pointer to HG context + * + * \return Pointer to associated HG class or NULL if not a valid context + */ +static HG_INLINE hg_class_t *HG_Context_get_class(const hg_context_t *context); + +/** + * Retrieve context ID from context (max value of 255). 
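+ *
+ * A sketch pairing this with HG_Context_create_id() (ID value illustrative):
+ * \verbatim
+ * hg_context_t *context = HG_Context_create_id(hg_class, 1);
+ * hg_uint8_t id = HG_Context_get_id(context);
+ * \endverbatim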
+ *
+ * \param context [IN] pointer to HG context
+ *
+ * \return Non-negative integer (max value of 255) or 0 if no ID has been set
+ */
+static HG_INLINE hg_uint8_t HG_Context_get_id(const hg_context_t *context);
+
+/**
+ * Associate user data to context. When HG_Context_destroy() is called,
+ * free_callback (if defined) is called to free the associated data.
+ *
+ * \param context [IN] pointer to HG context
+ * \param data [IN] pointer to user data
+ * \param free_callback [IN] pointer to function
+ *
+ * \return HG_SUCCESS or corresponding HG error code
+ */
+static HG_INLINE hg_return_t HG_Context_set_data(hg_context_t *context, void *data,
+                                                 void (*free_callback)(void *));
+
+/**
+ * Retrieve previously associated data from a given context.
+ *
+ * \param context [IN] pointer to HG context
+ *
+ * \return Pointer to user data or NULL if not set or any error has occurred
+ */
+static HG_INLINE void *HG_Context_get_data(const hg_context_t *context);
+
+/**
+ * Dynamically register a function func_name as an RPC as well as the
+ * RPC callback executed when the RPC request ID associated to func_name is
+ * received. Associate input and output proc to function ID, so that they can
+ * be used to serialize and deserialize function parameters.
+ *
+ * \param hg_class [IN] pointer to HG class
+ * \param func_name [IN] unique name associated to function
+ * \param in_proc_cb [IN] pointer to input proc callback
+ * \param out_proc_cb [IN] pointer to output proc callback
+ * \param rpc_cb [IN] RPC callback
+ *
+ * \return unique ID associated to the registered function
+ */
+HG_PUBLIC hg_id_t HG_Register_name(hg_class_t *hg_class, const char *func_name, hg_proc_cb_t in_proc_cb,
+                                   hg_proc_cb_t out_proc_cb, hg_rpc_cb_t rpc_cb);
+
+/**
+ * Indicate whether HG_Register_name() has been called for the RPC specified by
+ * func_name.
+ *
+ * \param hg_class [IN] pointer to HG class
+ * \param func_name [IN] function name
+ * \param id [OUT] registered RPC ID
+ * \param flag [OUT] pointer to boolean
+ *
+ * \return HG_SUCCESS or corresponding HG error code
+ */
+HG_PUBLIC hg_return_t HG_Registered_name(hg_class_t *hg_class, const char *func_name, hg_id_t *id,
+                                         hg_bool_t *flag);
+
+/**
+ * Dynamically register an RPC ID as well as the RPC callback executed when the
+ * RPC request ID is received. Associate input and output proc to id, so that
+ * they can be used to serialize and deserialize function parameters.
+ *
+ * \param hg_class [IN] pointer to HG class
+ * \param id [IN] ID to use to register RPC
+ * \param in_proc_cb [IN] pointer to input proc callback
+ * \param out_proc_cb [IN] pointer to output proc callback
+ * \param rpc_cb [IN] RPC callback
+ *
+ * \return HG_SUCCESS or corresponding HG error code
+ */
+HG_PUBLIC hg_return_t HG_Register(hg_class_t *hg_class, hg_id_t id, hg_proc_cb_t in_proc_cb,
+                                  hg_proc_cb_t out_proc_cb, hg_rpc_cb_t rpc_cb);
+
+/**
+ * Deregister RPC ID. Further requests with RPC ID will return an error; it
+ * is therefore up to the user to make sure that all requests for that RPC ID
+ * have been processed before it is deregistered.
+ *
+ * \param hg_class [IN] pointer to HG class
+ * \param id [IN] registered function ID
+ *
+ * \return HG_SUCCESS or corresponding HG error code
+ */
+HG_PUBLIC hg_return_t HG_Deregister(hg_class_t *hg_class, hg_id_t id);
+
+/**
+ * Indicate whether HG_Register() has been called.
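+ *
+ * A sketch (rpc_id assumed to come from an earlier HG_Register() call):
+ * \verbatim
+ * hg_bool_t registered;
+ * hg_return_t ret = HG_Registered(hg_class, rpc_id, &registered);
+ * \endverbatim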
+ * + * \param hg_class [IN] pointer to HG class + * \param id [IN] function ID + * \param flag [OUT] pointer to boolean + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t HG_Registered(hg_class_t *hg_class, hg_id_t id, hg_bool_t *flag); + +/** + * Indicate whether HG_Register() has been called, and if so return pointers + * to proc callback functions for the RPC. + * + * \param hg_class [IN] pointer to HG class + * \param id [IN] function ID + * \param flag [OUT] pointer to boolean + * \param in_proc_cb [OUT] pointer to input encoder cb + * \param out_proc_cb [OUT] pointer to output encoder cb + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t HG_Registered_proc_cb(hg_class_t *hg_class, hg_id_t id, hg_bool_t *flag, + hg_proc_cb_t *in_proc_cb, hg_proc_cb_t *out_proc_cb); + +/** + * Register and associate user data to registered function. When HG_Finalize() + * is called, free_callback (if defined) is called to free the registered + * data. + * + * \param hg_class [IN] pointer to HG class + * \param id [IN] registered function ID + * \param data [IN] pointer to data + * \param free_callback [IN] pointer to function + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t HG_Register_data(hg_class_t *hg_class, hg_id_t id, void *data, + void (*free_callback)(void *)); + +/** + * Indicate whether HG_Register_data() has been called and return associated + * data. + * + * \param hg_class [IN] pointer to HG class + * \param id [IN] registered function ID + * + * \return Pointer to data or NULL + */ +HG_PUBLIC void *HG_Registered_data(hg_class_t *hg_class, hg_id_t id); + +/** + * Disable response for a given RPC ID. This allows an origin process to send an + * RPC to a target without waiting for a response. The RPC completes locally and + * the callback on the origin is therefore pushed to the completion queue once + * the RPC send is completed. By default, all RPCs expect a response to + * be sent back. + * + * \param hg_class [IN] pointer to HG class + * \param id [IN] registered function ID + * \param disable [IN] boolean (HG_TRUE to disable + * HG_FALSE to re-enable) + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t HG_Registered_disable_response(hg_class_t *hg_class, hg_id_t id, hg_bool_t disable); + +/** + * Check if response is disabled for a given RPC ID + * (i.e., HG_Registered_disable_response() has been called for this RPC ID). + * + * \param hg_class [IN] pointer to HG class + * \param id [IN] registered function ID + * \param disabled [OUT] boolean (HG_TRUE if disabled + * HG_FALSE if enabled) + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t HG_Registered_disabled_response(hg_class_t *hg_class, hg_id_t id, hg_bool_t *disabled); + +/** + * Lookup an addr from a peer address/name. Addresses need to be + * freed by calling HG_Addr_free(). After completion, user callback is + * placed into a completion queue and can be triggered using HG_Trigger(). 
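+ *
+ * A sketch (callback body and address string illustrative; assumes the
+ * hg_cb_t callback signature from mercury_types.h):
+ * \verbatim
+ * static hg_return_t
+ * lookup_cb(const struct hg_cb_info *callback_info)
+ * {
+ *     return HG_SUCCESS;
+ * }
+ *
+ * HG_Addr_lookup1(context, lookup_cb, NULL, "tcp://localhost:3344", NULL);
+ * \endverbatim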
+ *
+ * \param context [IN] pointer to context of execution
+ * \param callback [IN] pointer to function callback
+ * \param arg [IN] pointer to data passed to callback
+ * \param name [IN] lookup name
+ * \param op_id [OUT] pointer to returned operation ID (unused)
+ *
+ * \return HG_SUCCESS or corresponding HG error code
+ */
+HG_PUBLIC hg_return_t HG_Addr_lookup1(hg_context_t *context, hg_cb_t callback, void *arg, const char *name,
+                                      hg_op_id_t *op_id);
+
+/* This will map to HG_Addr_lookup2() in the future */
+#ifndef HG_Addr_lookup
+#define HG_Addr_lookup HG_Addr_lookup1
+#endif
+
+/**
+ * Lookup an addr from a peer address/name. Addresses need to be
+ * freed by calling HG_Addr_free().
+ *
+ * \remark This is the immediate version of HG_Addr_lookup1().
+ *
+ * \param hg_class [IN/OUT] pointer to HG class
+ * \param name [IN] lookup name
+ * \param addr [OUT] pointer to abstract address
+ *
+ * \return HG_SUCCESS or corresponding HG error code
+ */
+HG_PUBLIC hg_return_t HG_Addr_lookup2(hg_class_t *hg_class, const char *name, hg_addr_t *addr);
+
+/**
+ * Free the addr.
+ *
+ * \param hg_class [IN] pointer to HG class
+ * \param addr [IN] abstract address
+ *
+ * \return HG_SUCCESS or corresponding HG error code
+ */
+HG_PUBLIC hg_return_t HG_Addr_free(hg_class_t *hg_class, hg_addr_t addr);
+
+/**
+ * Hint that the address is no longer valid. This may happen if the peer is
+ * no longer responding. This can be used to force removal of the
+ * peer address from the list of peers before freeing it and reclaiming
+ * resources.
+ *
+ * \param hg_class [IN] pointer to HG class
+ * \param addr [IN] abstract address
+ *
+ * \return HG_SUCCESS or corresponding HG error code
+ */
+HG_PUBLIC hg_return_t HG_Addr_set_remove(hg_class_t *hg_class, hg_addr_t addr);
+
+/**
+ * Access self address. Address must be freed with HG_Addr_free().
+ *
+ * \param hg_class [IN] pointer to HG class
+ * \param addr [OUT] pointer to abstract address
+ *
+ * \return HG_SUCCESS or corresponding HG error code
+ */
+HG_PUBLIC hg_return_t HG_Addr_self(hg_class_t *hg_class, hg_addr_t *addr);
+
+/**
+ * Duplicate an existing HG abstract address. The duplicated address can be
+ * stored for later use and the origin address can then be freed safely. The
+ * duplicated address must be freed with HG_Addr_free().
+ *
+ * \param hg_class [IN] pointer to HG class
+ * \param addr [IN] abstract address
+ * \param new_addr [OUT] pointer to abstract address
+ *
+ * \return HG_SUCCESS or corresponding HG error code
+ */
+HG_PUBLIC hg_return_t HG_Addr_dup(hg_class_t *hg_class, hg_addr_t addr, hg_addr_t *new_addr);
+
+/**
+ * Compare two addresses.
+ *
+ * \param hg_class [IN] pointer to HG class
+ * \param addr1 [IN] abstract address
+ * \param addr2 [IN] abstract address
+ *
+ * \return HG_TRUE if addresses are determined to be equal, HG_FALSE otherwise
+ */
+HG_PUBLIC hg_bool_t HG_Addr_cmp(hg_class_t *hg_class, hg_addr_t addr1, hg_addr_t addr2);
+
+/**
+ * Convert an addr to a string (returned string includes the terminating
+ * null byte '\0'). If buf is NULL, the address is not converted and only
+ * the required size of the buffer is returned. If the input value passed
+ * through buf_size is too small, HG_SIZE_ERROR is returned and the buf_size
+ * output is set to the minimum size required.
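+ *
+ * The usual two-call pattern, as a sketch (error handling elided):
+ * \verbatim
+ * hg_size_t buf_size = 0;
+ * HG_Addr_to_string(hg_class, NULL, &buf_size, addr);
+ * char *buf = (char *) malloc(buf_size);
+ * HG_Addr_to_string(hg_class, buf, &buf_size, addr);
+ * \endverbatim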
+ *
+ * \param hg_class [IN] pointer to HG class
+ * \param buf [IN/OUT] pointer to destination buffer
+ * \param buf_size [IN/OUT] pointer to buffer size
+ * \param addr [IN] abstract address
+ *
+ * \return HG_SUCCESS or corresponding HG error code
+ */
+HG_PUBLIC hg_return_t HG_Addr_to_string(hg_class_t *hg_class, char *buf, hg_size_t *buf_size, hg_addr_t addr);
+
+/**
+ * Initiate a new HG RPC using the specified function ID and the local/remote
+ * target defined by addr. The HG handle created can be used to query input
+ * and output, as well as to issue the RPC by calling HG_Forward().
+ * After completion the handle must be freed using HG_Destroy().
+ *
+ * \param context [IN] pointer to HG context
+ * \param addr [IN] abstract network address of destination
+ * \param id [IN] registered function ID
+ * \param handle [OUT] pointer to HG handle
+ *
+ * \return HG_SUCCESS or corresponding HG error code
+ */
+HG_PUBLIC hg_return_t HG_Create(hg_context_t *context, hg_addr_t addr, hg_id_t id, hg_handle_t *handle);
+
+/**
+ * Destroy HG handle. Decrement the reference count; resources associated to
+ * the handle are freed when the reference count reaches zero.
+ *
+ * \param handle [IN] HG handle
+ *
+ * \return HG_SUCCESS or corresponding HG error code
+ */
+HG_PUBLIC hg_return_t HG_Destroy(hg_handle_t handle);
+
+/**
+ * Reset an existing HG handle to make it reusable for RPC forwarding.
+ * Both target address and RPC ID can be modified at this time.
+ * Operations on that handle must be completed in order to reset that handle
+ * safely.
+ *
+ * \param handle [IN] HG handle
+ * \param addr [IN] abstract network address of destination
+ * \param id [IN] registered function ID
+ *
+ * \return HG_SUCCESS or corresponding HG error code
+ */
+HG_PUBLIC hg_return_t HG_Reset(hg_handle_t handle, hg_addr_t addr, hg_id_t id);
+
+/**
+ * Increment ref count on handle.
+ *
+ * \param handle [IN] HG handle
+ *
+ * \return HG_SUCCESS or corresponding HG error code
+ */
+static HG_INLINE hg_return_t HG_Ref_incr(hg_handle_t handle);
+
+/**
+ * Retrieve ref count from handle.
+ *
+ * \param handle [IN] HG handle
+ *
+ * \return Non-negative value or negative if the handle is not valid
+ */
+static HG_INLINE hg_int32_t HG_Ref_get(hg_handle_t handle);
+
+/**
+ * Get info from handle.
+ *
+ * \remark Users must call HG_Addr_dup() to safely re-use the addr field.
+ *
+ * \param handle [IN] HG handle
+ *
+ * \return Pointer to info or NULL in case of failure
+ */
+static HG_INLINE const struct hg_info *HG_Get_info(hg_handle_t handle);
+
+/**
+ * Associate user data to handle. When HG_Destroy() is called,
+ * free_callback (if defined) is called to free the associated data.
+ *
+ * \param handle [IN] HG handle
+ * \param data [IN] pointer to user data
+ * \param free_callback [IN] pointer to function
+ *
+ * \return HG_SUCCESS or corresponding HG error code
+ */
+static HG_INLINE hg_return_t HG_Set_data(hg_handle_t handle, void *data, void (*free_callback)(void *));
+
+/**
+ * Retrieve previously associated data from a given handle.
+ *
+ * \param handle [IN] HG handle
+ *
+ * \return Pointer to user data or NULL if not set or any error has occurred
+ */
+static HG_INLINE void *HG_Get_data(hg_handle_t handle);
+
+/**
+ * Get input from handle (requires registration of input proc to deserialize
+ * parameters). Input must be freed using HG_Free_input().
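+ *
+ * A typical decode/use/free sequence inside an RPC callback, as a sketch
+ * (my_rpc_in_t is a hypothetical input structure):
+ * \verbatim
+ * my_rpc_in_t in_struct;
+ * HG_Get_input(handle, &in_struct);
+ * (use or copy fields of in_struct here)
+ * HG_Free_input(handle, &in_struct);
+ * \endverbatim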
+ *
+ * \remark This is equivalent to:
+ *         - HG_Core_get_input()
+ *         - Call hg_proc to deserialize parameters
+ *
+ * \param handle [IN] HG handle
+ * \param in_struct [IN/OUT] pointer to input structure
+ *
+ * \return HG_SUCCESS or corresponding HG error code
+ */
+HG_PUBLIC hg_return_t HG_Get_input(hg_handle_t handle, void *in_struct);
+
+/**
+ * Free resources allocated when deserializing the input.
+ * User may copy parameters contained in the input structure before calling
+ * HG_Free_input().
+ *
+ * \param handle [IN] HG handle
+ * \param in_struct [IN/OUT] pointer to input structure
+ *
+ * \return HG_SUCCESS or corresponding HG error code
+ */
+HG_PUBLIC hg_return_t HG_Free_input(hg_handle_t handle, void *in_struct);
+
+/**
+ * Get output from handle (requires registration of output proc to deserialize
+ * parameters). Output must be freed using HG_Free_output().
+ *
+ * \remark This is equivalent to:
+ *         - HG_Core_get_output()
+ *         - Call hg_proc to deserialize parameters
+ *
+ * \param handle [IN] HG handle
+ * \param out_struct [IN/OUT] pointer to output structure
+ *
+ * \return HG_SUCCESS or corresponding HG error code
+ */
+HG_PUBLIC hg_return_t HG_Get_output(hg_handle_t handle, void *out_struct);
+
+/**
+ * Free resources allocated when deserializing the output.
+ * User may copy parameters contained in the output structure before calling
+ * HG_Free_output().
+ *
+ * \param handle [IN] HG handle
+ * \param out_struct [IN/OUT] pointer to output structure
+ *
+ * \return HG_SUCCESS or corresponding HG error code
+ */
+HG_PUBLIC hg_return_t HG_Free_output(hg_handle_t handle, void *out_struct);
+
+/**
+ * Get raw input buffer from handle that can be used for encoding and decoding
+ * parameters.
+ *
+ * \remark Can be used for manual encoding / decoding when HG proc routines
+ * cannot be automatically used or there is a need for special handling before
+ * HG_Get_input() can be called, for instance when using a custom header.
+ * To use proc routines in conjunction, HG_Class_set_input_offset() can be used
+ * to define the offset at which HG_Forward() / HG_Get_input() will start
+ * encoding / decoding the input parameters.
+ *
+ * \remark in_buf_size argument will be ignored if NULL
+ *
+ * \param handle [IN] HG handle
+ * \param in_buf [OUT] pointer to input buffer
+ * \param in_buf_size [OUT] pointer to input buffer size
+ *
+ * \return HG_SUCCESS or corresponding HG error code
+ */
+HG_PUBLIC hg_return_t HG_Get_input_buf(hg_handle_t handle, void **in_buf, hg_size_t *in_buf_size);
+
+/**
+ * Get raw output buffer from handle that can be used for encoding and decoding
+ * parameters.
+ *
+ * \remark Can be used for manual encoding / decoding when HG proc routines
+ * cannot be automatically used or there is a need for special handling before
+ * HG_Get_output() can be called, for instance when using a custom header.
+ * To use proc routines in conjunction, HG_Class_set_output_offset() can be used
+ * to define the offset at which HG_Respond() / HG_Get_output() will start
+ * encoding / decoding the output parameters.
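+ *
+ * A sketch of writing a custom header into the raw buffer (header type and
+ * layout hypothetical):
+ * \verbatim
+ * void *out_buf;
+ * hg_size_t out_buf_size;
+ * HG_Get_output_buf(handle, &out_buf, &out_buf_size);
+ * memcpy(out_buf, &my_header, sizeof(my_header));
+ * \endverbatim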
+ * + * \remark out_buf_size argument will be ignored if NULL + * + * \param handle [IN] HG handle + * \param out_buf [OUT] pointer to output buffer + * \param out_buf_size [OUT] pointer to output buffer size + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t HG_Get_output_buf(hg_handle_t handle, void **out_buf, hg_size_t *out_buf_size); + +/** + * Get raw extra input buffer from handle that can be used for encoding and + * decoding parameters. This buffer is only valid if the input payload is large + * enough that it cannot fit into an eager buffer. + * + * \remark NULL pointer will be returned if there is no associated buffer. + * + * \remark in_buf_size argument will be ignored if NULL. + * + * \param handle [IN] HG handle + * \param in_buf [OUT] pointer to input buffer + * \param in_buf_size [OUT] pointer to input buffer size + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t HG_Get_input_extra_buf(hg_handle_t handle, void **in_buf, hg_size_t *in_buf_size); + +/** + * Get raw extra output buffer from handle that can be used for encoding and + * decoding parameters. This buffer is only valid if the output payload is large + * enough that it cannot fit into an eager buffer. + * + * \remark NULL pointer will be returned if there is no associated buffer. + * + * \remark out_buf_size argument will be ignored if NULL. + * + * \param handle [IN] HG handle + * \param out_buf [OUT] pointer to output buffer + * \param out_buf_size [OUT] pointer to output buffer size + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t HG_Get_output_extra_buf(hg_handle_t handle, void **out_buf, hg_size_t *out_buf_size); + +/** + * Set target context ID that will receive and process the RPC request + * (ID is defined on target context creation, see HG_Context_create_id()). + * + * \param handle [IN] HG handle + * \param id [IN] user-defined target context ID + * + * \return HG_SUCCESS or corresponding HG error code + */ +static HG_INLINE hg_return_t HG_Set_target_id(hg_handle_t handle, hg_uint8_t id); + +/** + * Forward a call to a local/remote target using an existing HG handle. + * Input structure can be passed and parameters serialized using a previously + * registered input proc. After completion, user callback is placed into a + * completion queue and can be triggered using HG_Trigger(). RPC output can + * be queried using HG_Get_output() and freed using HG_Free_output(). + * + * \remark This routine is internally equivalent to: + * - HG_Core_get_input() + * - Call hg_proc to serialize parameters + * - HG_Core_forward() + * + * \param handle [IN] HG handle + * \param callback [IN] pointer to function callback + * \param arg [IN] pointer to data passed to callback + * \param in_struct [IN] pointer to input structure + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t HG_Forward(hg_handle_t handle, hg_cb_t callback, void *arg, void *in_struct); + +/** + * Respond back to origin using an existing HG handle. + * Output structure can be passed and parameters serialized using a previously + * registered output proc. After completion, user callback is placed into a + * completion queue and can be triggered using HG_Trigger(). 
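+ *
+ * Completion is typically driven by a progress/trigger loop, as in this
+ * sketch (timeout values illustrative):
+ * \verbatim
+ * unsigned int count = 0;
+ * do {
+ *     HG_Trigger(context, 0, 1, &count);
+ * } while (count > 0);
+ * HG_Progress(context, 100);
+ * \endverbatim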
+ *
+ * \remark This routine is internally equivalent to:
+ *         - HG_Core_get_output()
+ *         - Call hg_proc to serialize parameters
+ *         - HG_Core_respond()
+ *
+ * \param handle [IN] HG handle
+ * \param callback [IN] pointer to function callback
+ * \param arg [IN] pointer to data passed to callback
+ * \param out_struct [IN] pointer to output structure
+ *
+ * \return HG_SUCCESS or corresponding HG error code
+ */
+HG_PUBLIC hg_return_t HG_Respond(hg_handle_t handle, hg_cb_t callback, void *arg, void *out_struct);
+
+/**
+ * Try to progress RPC execution until timeout is reached or any completion
+ * has occurred.
+ * Progress should not be considered as a wait, in the sense that it cannot be
+ * assumed that completion of a specific operation will occur only when
+ * progress is called.
+ *
+ * \param context [IN] pointer to HG context
+ * \param timeout [IN] timeout (in milliseconds)
+ *
+ * \return HG_SUCCESS if any completion has occurred / HG error code otherwise
+ */
+HG_PUBLIC hg_return_t HG_Progress(hg_context_t *context, unsigned int timeout);
+
+/**
+ * Execute at most max_count callbacks. If timeout is non-zero, wait up to
+ * timeout before returning. Function can return when one or more
+ * callbacks are triggered (at most max_count).
+ *
+ * \param context [IN] pointer to HG context
+ * \param timeout [IN] timeout (in milliseconds)
+ * \param max_count [IN] maximum number of callbacks triggered
+ * \param actual_count [OUT] actual number of callbacks triggered
+ *
+ * \return HG_SUCCESS or corresponding HG error code
+ */
+HG_PUBLIC hg_return_t HG_Trigger(hg_context_t *context, unsigned int timeout, unsigned int max_count,
+                                 unsigned int *actual_count);
+
+/**
+ * Cancel an ongoing operation.
+ *
+ * \param handle [IN] HG handle
+ *
+ * \return HG_SUCCESS or HG_CANCEL_ERROR or corresponding HG error code
+ */
+HG_PUBLIC hg_return_t HG_Cancel(hg_handle_t handle);
+
+/************************************/
+/* Local Type and Struct Definition */
+/************************************/
+
+/* HG class */
+struct hg_class {
+    hg_core_class_t *core_class; /* Core class */
+    hg_size_t        in_offset;  /* Input offset */
+    hg_size_t        out_offset; /* Output offset */
+};
+
+/* HG context */
+struct hg_context {
+    hg_core_context_t *core_context; /* Core context */
+    hg_class_t *       hg_class;     /* HG class */
+};
+
+/* HG handle */
+struct hg_handle {
+    struct hg_info   info;                  /* HG info */
+    hg_core_handle_t core_handle;           /* Core handle */
+    void *           data;                  /* User data */
+    void (*data_free_callback)(void *);     /* User data free callback */
+};
+
+/*---------------------------------------------------------------------------*/
+static HG_INLINE const char *
+HG_Class_get_name(const hg_class_t *hg_class)
+{
+    return HG_Core_class_get_name(hg_class->core_class);
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_INLINE const char *
+HG_Class_get_protocol(const hg_class_t *hg_class)
+{
+    return HG_Core_class_get_protocol(hg_class->core_class);
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_INLINE hg_bool_t
+HG_Class_is_listening(const hg_class_t *hg_class)
+{
+    return HG_Core_class_is_listening(hg_class->core_class);
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_INLINE hg_size_t
+HG_Class_get_input_eager_size(const hg_class_t *hg_class)
+{
+    hg_size_t core   = HG_Core_class_get_input_eager_size(hg_class->core_class),
+              header =
hg_header_get_size(HG_INPUT); + + return (core > header) ? core - header : 0; +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE hg_size_t +HG_Class_get_output_eager_size(const hg_class_t *hg_class) +{ + hg_size_t core = HG_Core_class_get_output_eager_size(hg_class->core_class), + header = hg_header_get_size(HG_OUTPUT); + + return (core > header) ? core - header : 0; +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE hg_return_t +HG_Class_set_input_offset(hg_class_t *hg_class, hg_size_t offset) +{ + /* Extra input header must not be larger than eager size */ + if (offset > HG_Class_get_input_eager_size(hg_class)) + return HG_INVALID_ARG; + + hg_class->in_offset = offset; + + return HG_SUCCESS; +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE hg_return_t +HG_Class_set_output_offset(hg_class_t *hg_class, hg_size_t offset) +{ + /* Extra output header must not be larger than eager size */ + if (offset > HG_Class_get_output_eager_size(hg_class)) + return HG_INVALID_ARG; + + hg_class->out_offset = offset; + + return HG_SUCCESS; +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE hg_return_t +HG_Class_set_data(hg_class_t *hg_class, void *data, void (*free_callback)(void *)) +{ + return HG_Core_class_set_data(hg_class->core_class, data, free_callback); +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE void * +HG_Class_get_data(const hg_class_t *hg_class) +{ + return HG_Core_class_get_data(hg_class->core_class); +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE hg_class_t * + HG_Context_get_class(const hg_context_t *context) +{ + return context->hg_class; +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE hg_uint8_t +HG_Context_get_id(const hg_context_t *context) +{ + return HG_Core_context_get_id(context->core_context); +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE hg_return_t +HG_Context_set_data(hg_context_t *context, void *data, void (*free_callback)(void *)) +{ + return HG_Core_context_set_data(context->core_context, data, free_callback); +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE void * +HG_Context_get_data(const hg_context_t *context) +{ + return HG_Core_context_get_data(context->core_context); +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE hg_return_t +HG_Ref_incr(hg_handle_t handle) +{ + return HG_Core_ref_incr(handle->core_handle); +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE hg_int32_t +HG_Ref_get(hg_handle_t handle) +{ + return HG_Core_ref_get(handle->core_handle); +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE const struct hg_info * +HG_Get_info(hg_handle_t handle) +{ + return &handle->info; +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE hg_return_t +HG_Set_data(hg_handle_t handle, void *data, void (*free_callback)(void *)) +{ + handle->data = data; + handle->data_free_callback = free_callback; + + return HG_SUCCESS; +} + 
+/*---------------------------------------------------------------------------*/
+static HG_INLINE void *
+HG_Get_data(hg_handle_t handle)
+{
+    return handle->data;
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_INLINE hg_return_t
+HG_Set_target_id(hg_handle_t handle, hg_uint8_t id)
+{
+    handle->info.context_id = id;
+
+    return HG_Core_set_target_id(handle->core_handle, id);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* MERCURY_H */
diff --git a/src/mercury/include/mercury_atomic.h b/src/mercury/include/mercury_atomic.h
new file mode 100644
index 00000000000..d5a14171b28
--- /dev/null
+++ b/src/mercury/include/mercury_atomic.h
@@ -0,0 +1,625 @@
+/*
+ * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy,
+ * UChicago Argonne, LLC and The HDF Group.
+ * All rights reserved.
+ *
+ * The full copyright notice, including terms governing use, modification,
+ * and redistribution, is contained in the COPYING file that can be
+ * found at the root of the source code distribution tree.
+ */
+
+#ifndef MERCURY_ATOMIC_H
+#define MERCURY_ATOMIC_H
+
+#include "mercury_util_config.h"
+
+#if defined(_WIN32)
+#include <windows.h>
+typedef struct {
+    volatile LONG value;
+} hg_atomic_int32_t;
+typedef struct {
+    volatile LONGLONG value;
+} hg_atomic_int64_t;
+#define HG_ATOMIC_VAR_INIT(x)                                                \
+    {                                                                        \
+        (x)                                                                  \
+    }
+#elif defined(HG_UTIL_HAS_OPA_PRIMITIVES_H)
+#include <opa_primitives.h>
+typedef OPA_int_t hg_atomic_int32_t;
+typedef OPA_ptr_t hg_atomic_int64_t; /* OPA has only limited 64-bit support */
+#define HG_ATOMIC_VAR_INIT(x) OPA_PTR_T_INITIALIZER(x)
+#elif defined(HG_UTIL_HAS_STDATOMIC_H)
+#ifndef __cplusplus
+#include <stdatomic.h>
+typedef atomic_int hg_atomic_int32_t;
+#if (HG_UTIL_ATOMIC_LONG_WIDTH == 8) && !defined(__APPLE__)
+typedef atomic_long hg_atomic_int64_t;
+#else
+typedef atomic_llong hg_atomic_int64_t;
+#endif
+#else
+#include <atomic>
+typedef std::atomic_int hg_atomic_int32_t;
+#if (HG_UTIL_ATOMIC_LONG_WIDTH == 8) && !defined(__APPLE__)
+typedef std::atomic_long hg_atomic_int64_t;
+#else
+typedef std::atomic_llong hg_atomic_int64_t;
+#endif
+using std::atomic_fetch_add_explicit;
+using std::atomic_thread_fence;
+using std::memory_order_acq_rel;
+using std::memory_order_acquire;
+using std::memory_order_release;
+#endif
+#define HG_ATOMIC_VAR_INIT(x) ATOMIC_VAR_INIT(x)
+#elif defined(__APPLE__)
+#include <libkern/OSAtomic.h>
+typedef struct {
+    volatile hg_util_int32_t value;
+} hg_atomic_int32_t;
+typedef struct {
+    volatile hg_util_int64_t value;
+} hg_atomic_int64_t;
+#define HG_ATOMIC_VAR_INIT(x)                                                \
+    {                                                                        \
+        (x)                                                                  \
+    }
+#else
+#error "Not supported on this platform."
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Init atomic value (32-bit integer).
+ *
+ * \param ptr [OUT] pointer to an atomic32 integer
+ * \param value [IN] value
+ */
+static HG_UTIL_INLINE void hg_atomic_init32(hg_atomic_int32_t *ptr, hg_util_int32_t value);
+
+/**
+ * Set atomic value (32-bit integer).
+ *
+ * \param ptr [OUT] pointer to an atomic32 integer
+ * \param value [IN] value
+ */
+static HG_UTIL_INLINE void hg_atomic_set32(hg_atomic_int32_t *ptr, hg_util_int32_t value);
+
+/**
+ * Get atomic value (32-bit integer).
+ *
+ * \param ptr [OUT] pointer to an atomic32 integer
+ *
+ * \return Value of the atomic integer
+ */
+static HG_UTIL_INLINE hg_util_int32_t hg_atomic_get32(hg_atomic_int32_t *ptr);
+
+/**
+ * Increment atomic value (32-bit integer).
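+ *
+ * A reference-count sketch (initial value illustrative):
+ * \verbatim
+ * hg_atomic_int32_t refcount = HG_ATOMIC_VAR_INIT(1);
+ * hg_util_int32_t new_count = hg_atomic_incr32(&refcount);
+ * \endverbatim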
+ *
+ * \param ptr [IN/OUT] pointer to an atomic32 integer
+ *
+ * \return Incremented value
+ */
+static HG_UTIL_INLINE hg_util_int32_t hg_atomic_incr32(hg_atomic_int32_t *ptr);
+
+/**
+ * Decrement atomic value (32-bit integer).
+ *
+ * \param ptr [IN/OUT] pointer to an atomic32 integer
+ *
+ * \return Decremented value
+ */
+static HG_UTIL_INLINE hg_util_int32_t hg_atomic_decr32(hg_atomic_int32_t *ptr);
+
+/**
+ * OR atomic value (32-bit integer).
+ *
+ * \param ptr [IN/OUT] pointer to an atomic32 integer
+ * \param value [IN] value to OR with
+ *
+ * \return Original value
+ */
+static HG_UTIL_INLINE hg_util_int32_t hg_atomic_or32(hg_atomic_int32_t *ptr, hg_util_int32_t value);
+
+/**
+ * XOR atomic value (32-bit integer).
+ *
+ * \param ptr [IN/OUT] pointer to an atomic32 integer
+ * \param value [IN] value to XOR with
+ *
+ * \return Original value
+ */
+static HG_UTIL_INLINE hg_util_int32_t hg_atomic_xor32(hg_atomic_int32_t *ptr, hg_util_int32_t value);
+
+/**
+ * AND atomic value (32-bit integer).
+ *
+ * \param ptr [IN/OUT] pointer to an atomic32 integer
+ * \param value [IN] value to AND with
+ *
+ * \return Original value
+ */
+static HG_UTIL_INLINE hg_util_int32_t hg_atomic_and32(hg_atomic_int32_t *ptr, hg_util_int32_t value);
+
+/**
+ * Compare and swap values (32-bit integer).
+ *
+ * \param ptr [IN/OUT] pointer to an atomic32 integer
+ * \param compare_value [IN] value to compare to
+ * \param swap_value [IN] value to swap with if ptr value is equal to
+ *                        compare value
+ *
+ * \return HG_UTIL_TRUE if swapped or HG_UTIL_FALSE
+ */
+static HG_UTIL_INLINE hg_util_bool_t hg_atomic_cas32(hg_atomic_int32_t *ptr, hg_util_int32_t compare_value,
+                                                     hg_util_int32_t swap_value);
+
+/**
+ * Init atomic value (64-bit integer).
+ *
+ * \param ptr [OUT] pointer to an atomic64 integer
+ * \param value [IN] value
+ */
+static HG_UTIL_INLINE void hg_atomic_init64(hg_atomic_int64_t *ptr, hg_util_int64_t value);
+
+/**
+ * Set atomic value (64-bit integer).
+ *
+ * \param ptr [OUT] pointer to an atomic64 integer
+ * \param value [IN] value
+ */
+static HG_UTIL_INLINE void hg_atomic_set64(hg_atomic_int64_t *ptr, hg_util_int64_t value);
+
+/**
+ * Get atomic value (64-bit integer).
+ *
+ * \param ptr [OUT] pointer to an atomic64 integer
+ *
+ * \return Value of the atomic integer
+ */
+static HG_UTIL_INLINE hg_util_int64_t hg_atomic_get64(hg_atomic_int64_t *ptr);
+
+/**
+ * Increment atomic value (64-bit integer).
+ *
+ * \param ptr [IN/OUT] pointer to an atomic64 integer
+ *
+ * \return Incremented value
+ */
+static HG_UTIL_INLINE hg_util_int64_t hg_atomic_incr64(hg_atomic_int64_t *ptr);
+
+/**
+ * Decrement atomic value (64-bit integer).
+ *
+ * \param ptr [IN/OUT] pointer to an atomic64 integer
+ *
+ * \return Decremented value
+ */
+static HG_UTIL_INLINE hg_util_int64_t hg_atomic_decr64(hg_atomic_int64_t *ptr);
+
+/**
+ * OR atomic value (64-bit integer).
+ *
+ * \param ptr [IN/OUT] pointer to an atomic64 integer
+ * \param value [IN] value to OR with
+ *
+ * \return Original value
+ */
+static HG_UTIL_INLINE hg_util_int64_t hg_atomic_or64(hg_atomic_int64_t *ptr, hg_util_int64_t value);
+
+/**
+ * XOR atomic value (64-bit integer).
+ *
+ * \param ptr [IN/OUT] pointer to an atomic64 integer
+ * \param value [IN] value to XOR with
+ *
+ * \return Original value
+ */
+static HG_UTIL_INLINE hg_util_int64_t hg_atomic_xor64(hg_atomic_int64_t *ptr, hg_util_int64_t value);
+
+/**
+ * AND atomic value (64-bit integer).
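+ *
+ * On platforms without a native 64-bit fetch-and, the fallback below
+ * emulates it with a compare-and-swap retry loop, equivalent to this sketch:
+ * \verbatim
+ * do {
+ *     ret = hg_atomic_get64(ptr);
+ * } while (!hg_atomic_cas64(ptr, ret, ret & value));
+ * \endverbatim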
+ * + * \param ptr [IN/OUT] pointer to an atomic64 integer + * \param value [IN] value to AND with + * + * \return Original value + */ +static HG_UTIL_INLINE hg_util_int64_t hg_atomic_and64(hg_atomic_int64_t *ptr, hg_util_int64_t value); + +/** + * Compare and swap values (64-bit integer). + * + * \param ptr [IN/OUT] pointer to an atomic64 integer + * \param compare_value [IN] value to compare to + * \param swap_value [IN] value to swap with if ptr value is equal to + * compare value + * + * \return HG_UTIL_TRUE if swapped or HG_UTIL_FALSE + */ +static HG_UTIL_INLINE hg_util_bool_t hg_atomic_cas64(hg_atomic_int64_t *ptr, hg_util_int64_t compare_value, + hg_util_int64_t swap_value); + +/** + * Memory barrier. + * + */ +static HG_UTIL_INLINE void hg_atomic_fence(void); + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE void +hg_atomic_init32(hg_atomic_int32_t *ptr, hg_util_int32_t value) +{ +#if defined(HG_UTIL_HAS_STDATOMIC_H) && !defined(HG_UTIL_HAS_OPA_PRIMITIVES_H) + atomic_init(ptr, value); +#else + hg_atomic_set32(ptr, value); +#endif +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE void +hg_atomic_set32(hg_atomic_int32_t *ptr, hg_util_int32_t value) +{ +#if defined(_WIN32) + ptr->value = value; +#elif defined(HG_UTIL_HAS_OPA_PRIMITIVES_H) + OPA_store_int(ptr, value); +#elif defined(HG_UTIL_HAS_STDATOMIC_H) + atomic_store_explicit(ptr, value, memory_order_release); +#elif defined(__APPLE__) + ptr->value = value; +#else +#error "Not supported on this platform." +#endif +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE hg_util_int32_t +hg_atomic_get32(hg_atomic_int32_t *ptr) +{ + hg_util_int32_t ret; + +#if defined(_WIN32) + ret = ptr->value; +#elif defined(HG_UTIL_HAS_OPA_PRIMITIVES_H) + ret = OPA_load_int(ptr); +#elif defined(HG_UTIL_HAS_STDATOMIC_H) + ret = atomic_load_explicit(ptr, memory_order_acquire); +#elif defined(__APPLE__) + ret = ptr->value; +#else +#error "Not supported on this platform." +#endif + + return ret; +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE hg_util_int32_t +hg_atomic_incr32(hg_atomic_int32_t *ptr) +{ + hg_util_int32_t ret; + +#if defined(_WIN32) + ret = InterlockedIncrementNoFence(&ptr->value); +#elif defined(HG_UTIL_HAS_OPA_PRIMITIVES_H) + ret = OPA_fetch_and_incr_int(ptr) + 1; +#elif defined(HG_UTIL_HAS_STDATOMIC_H) + ret = atomic_fetch_add_explicit(ptr, 1, memory_order_acq_rel) + 1; +#elif defined(__APPLE__) + ret = OSAtomicIncrement32(&ptr->value); +#else +#error "Not supported on this platform." +#endif + + return ret; +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE hg_util_int32_t +hg_atomic_decr32(hg_atomic_int32_t *ptr) +{ + hg_util_int32_t ret; + +#if defined(_WIN32) + ret = InterlockedDecrementNoFence(&ptr->value); +#elif defined(HG_UTIL_HAS_OPA_PRIMITIVES_H) + ret = OPA_fetch_and_decr_int(ptr) - 1; +#elif defined(HG_UTIL_HAS_STDATOMIC_H) + ret = atomic_fetch_sub_explicit(ptr, 1, memory_order_acq_rel) - 1; +#elif defined(__APPLE__) + ret = OSAtomicDecrement32(&ptr->value); +#else +#error "Not supported on this platform." 
+#endif + + return ret; +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE hg_util_int32_t +hg_atomic_or32(hg_atomic_int32_t *ptr, hg_util_int32_t value) +{ + hg_util_int32_t ret; + +#if defined(_WIN32) + ret = InterlockedOrNoFence(&ptr->value, value); +#elif defined(HG_UTIL_HAS_STDATOMIC_H) && !defined(HG_UTIL_HAS_OPA_PRIMITIVES_H) + ret = atomic_fetch_or_explicit(ptr, value, memory_order_acq_rel); +#elif defined(__APPLE__) + ret = OSAtomicOr32Orig((uint32_t)value, (volatile uint32_t *)&ptr->value); +#else + do { + ret = hg_atomic_get32(ptr); + } while (!hg_atomic_cas32(ptr, ret, (ret | value))); +#endif + + return ret; +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE hg_util_int32_t +hg_atomic_xor32(hg_atomic_int32_t *ptr, hg_util_int32_t value) +{ + hg_util_int32_t ret; + +#if defined(_WIN32) + ret = InterlockedXorNoFence(&ptr->value, value); +#elif defined(HG_UTIL_HAS_STDATOMIC_H) && !defined(HG_UTIL_HAS_OPA_PRIMITIVES_H) + ret = atomic_fetch_xor_explicit(ptr, value, memory_order_acq_rel); +#elif defined(__APPLE__) + ret = OSAtomicXor32Orig((uint32_t)value, (volatile uint32_t *)&ptr->value); +#else + do { + ret = hg_atomic_get32(ptr); + } while (!hg_atomic_cas32(ptr, ret, (ret ^ value))); +#endif + + return ret; +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE hg_util_int32_t +hg_atomic_and32(hg_atomic_int32_t *ptr, hg_util_int32_t value) +{ + hg_util_int32_t ret; + +#if defined(_WIN32) + ret = InterlockedAndNoFence(&ptr->value, value); +#elif defined(HG_UTIL_HAS_STDATOMIC_H) && !defined(HG_UTIL_HAS_OPA_PRIMITIVES_H) + ret = atomic_fetch_and_explicit(ptr, value, memory_order_acq_rel); +#elif defined(__APPLE__) + ret = OSAtomicAnd32Orig((uint32_t)value, (volatile uint32_t *)&ptr->value); +#else + do { + ret = hg_atomic_get32(ptr); + } while (!hg_atomic_cas32(ptr, ret, (ret & value))); +#endif + + return ret; +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE hg_util_bool_t +hg_atomic_cas32(hg_atomic_int32_t *ptr, hg_util_int32_t compare_value, hg_util_int32_t swap_value) +{ + hg_util_bool_t ret; + +#if defined(_WIN32) + ret = (compare_value == InterlockedCompareExchangeNoFence(&ptr->value, swap_value, compare_value)); +#elif defined(HG_UTIL_HAS_OPA_PRIMITIVES_H) + ret = (hg_util_bool_t)(compare_value == OPA_cas_int(ptr, compare_value, swap_value)); +#elif defined(HG_UTIL_HAS_STDATOMIC_H) + ret = atomic_compare_exchange_strong_explicit(ptr, &compare_value, swap_value, memory_order_acq_rel, + memory_order_acquire); +#elif defined(__APPLE__) + ret = OSAtomicCompareAndSwap32(compare_value, swap_value, &ptr->value); +#else +#error "Not supported on this platform." 
+#endif
+
+    return ret;
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE void
+hg_atomic_init64(hg_atomic_int64_t *ptr, hg_util_int64_t value)
+{
+#if defined(HG_UTIL_HAS_STDATOMIC_H) && !defined(HG_UTIL_HAS_OPA_PRIMITIVES_H)
+    atomic_init(ptr, value);
+#else
+    hg_atomic_set64(ptr, value);
+#endif
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE void
+hg_atomic_set64(hg_atomic_int64_t *ptr, hg_util_int64_t value)
+{
+#if defined(_WIN32)
+    ptr->value = value;
+#elif defined(HG_UTIL_HAS_OPA_PRIMITIVES_H)
+    OPA_store_ptr(ptr, (void *)value);
+#elif defined(HG_UTIL_HAS_STDATOMIC_H)
+    atomic_store_explicit(ptr, value, memory_order_release);
+#elif defined(__APPLE__)
+    ptr->value = value;
+#else
+#error "Not supported on this platform."
+#endif
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE hg_util_int64_t
+hg_atomic_get64(hg_atomic_int64_t *ptr)
+{
+    hg_util_int64_t ret;
+
+#if defined(_WIN32)
+    ret = ptr->value;
+#elif defined(HG_UTIL_HAS_OPA_PRIMITIVES_H)
+    ret = (hg_util_int64_t)OPA_load_ptr(ptr);
+#elif defined(HG_UTIL_HAS_STDATOMIC_H)
+    ret = atomic_load_explicit(ptr, memory_order_acquire);
+#elif defined(__APPLE__)
+    ret = ptr->value;
+#else
+#error "Not supported on this platform."
+#endif
+
+    return ret;
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE hg_util_int64_t
+hg_atomic_incr64(hg_atomic_int64_t *ptr)
+{
+    hg_util_int64_t ret;
+
+#if defined(_WIN32)
+    ret = InterlockedIncrementNoFence64(&ptr->value);
+#elif defined(HG_UTIL_HAS_STDATOMIC_H) && !defined(HG_UTIL_HAS_OPA_PRIMITIVES_H)
+    ret = atomic_fetch_add_explicit(ptr, 1L, memory_order_acq_rel) + 1;
+#elif defined(__APPLE__)
+    ret = OSAtomicIncrement64(&ptr->value);
+#else
+    do {
+        ret = hg_atomic_get64(ptr);
+    } while (!hg_atomic_cas64(ptr, ret, ret + 1));
+    ret++;
+#endif
+
+    return ret;
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE hg_util_int64_t
+hg_atomic_decr64(hg_atomic_int64_t *ptr)
+{
+    hg_util_int64_t ret;
+
+#if defined(_WIN32)
+    ret = InterlockedDecrementNoFence64(&ptr->value);
+#elif defined(HG_UTIL_HAS_STDATOMIC_H) && !defined(HG_UTIL_HAS_OPA_PRIMITIVES_H)
+    ret = atomic_fetch_sub_explicit(ptr, 1L, memory_order_acq_rel) - 1;
+#elif defined(__APPLE__)
+    ret = OSAtomicDecrement64(&ptr->value);
+#else
+    do {
+        ret = hg_atomic_get64(ptr);
+    } while (!hg_atomic_cas64(ptr, ret, ret - 1));
+    ret--;
+#endif
+
+    return ret;
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE hg_util_int64_t
+hg_atomic_or64(hg_atomic_int64_t *ptr, hg_util_int64_t value)
+{
+    hg_util_int64_t ret;
+
+#if defined(_WIN32)
+    ret = InterlockedOr64NoFence(&ptr->value, value);
+#elif defined(HG_UTIL_HAS_STDATOMIC_H) && !defined(HG_UTIL_HAS_OPA_PRIMITIVES_H)
+    ret = atomic_fetch_or_explicit(ptr, value, memory_order_acq_rel);
+#else
+    do {
+        ret = hg_atomic_get64(ptr);
+    } while (!hg_atomic_cas64(ptr, ret, (ret | value)));
+#endif
+
+    return ret;
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE hg_util_int64_t
+hg_atomic_xor64(hg_atomic_int64_t *ptr, hg_util_int64_t value)
+{
+    hg_util_int64_t ret;
+
+#if defined(_WIN32)
+    ret = InterlockedXor64NoFence(&ptr->value, value);
+#elif defined(HG_UTIL_HAS_STDATOMIC_H) &&
!defined(HG_UTIL_HAS_OPA_PRIMITIVES_H) + ret = atomic_fetch_xor_explicit(ptr, value, memory_order_acq_rel); +#else + do { + ret = hg_atomic_get64(ptr); + } while (!hg_atomic_cas64(ptr, ret, (ret ^ value))); +#endif + + return ret; +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE hg_util_int64_t +hg_atomic_and64(hg_atomic_int64_t *ptr, hg_util_int64_t value) +{ + hg_util_int64_t ret; + +#if defined(_WIN32) + ret = InterlockedAnd64NoFence(&ptr->value, value); +#elif defined(HG_UTIL_HAS_STDATOMIC_H) && !defined(HG_UTIL_HAS_OPA_PRIMITIVES_H) + ret = atomic_fetch_and_explicit(ptr, value, memory_order_acq_rel); +#else + do { + ret = hg_atomic_get64(ptr); + } while (!hg_atomic_cas64(ptr, ret, (ret & value))); +#endif + + return ret; +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE hg_util_bool_t +hg_atomic_cas64(hg_atomic_int64_t *ptr, hg_util_int64_t compare_value, hg_util_int64_t swap_value) +{ + hg_util_bool_t ret; + +#if defined(_WIN32) + ret = (compare_value == InterlockedCompareExchangeNoFence64(&ptr->value, swap_value, compare_value)); +#elif defined(HG_UTIL_HAS_OPA_PRIMITIVES_H) + ret = (hg_util_bool_t)(compare_value == + (hg_util_int64_t)OPA_cas_ptr(ptr, (void *)compare_value, (void *)swap_value)); +#elif defined(HG_UTIL_HAS_STDATOMIC_H) + ret = atomic_compare_exchange_strong_explicit(ptr, &compare_value, swap_value, memory_order_acq_rel, + memory_order_acquire); +#elif defined(__APPLE__) + ret = OSAtomicCompareAndSwap64(compare_value, swap_value, &ptr->value); +#else +#error "Not supported on this platform." +#endif + + return ret; +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE void +hg_atomic_fence() +{ +#if defined(_WIN32) + MemoryBarrier(); +#elif defined(HG_UTIL_HAS_OPA_PRIMITIVES_H) + OPA_read_write_barrier(); +#elif defined(HG_UTIL_HAS_STDATOMIC_H) + atomic_thread_fence(memory_order_acq_rel); +#elif defined(__APPLE__) + OSMemoryBarrier(); +#else +#error "Not supported on this platform." +#endif +} + +#ifdef __cplusplus +} +#endif + +#endif /* MERCURY_ATOMIC_H */ diff --git a/src/mercury/include/mercury_atomic_queue.h b/src/mercury/include/mercury_atomic_queue.h new file mode 100644 index 00000000000..61b5128df1c --- /dev/null +++ b/src/mercury/include/mercury_atomic_queue.h @@ -0,0 +1,266 @@ +/* + * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy, + * UChicago Argonne, LLC and The HDF Group. + * All rights reserved. + * + * The full copyright notice, including terms governing use, modification, + * and redistribution, is contained in the COPYING file that can be + * found at the root of the source code distribution tree. + */ + +/* Implementation derived from: + * https://github.com/freebsd/freebsd/blob/master/sys/sys/buf_ring.h + * + * - + * Copyright (c) 2007-2009 Kip Macy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#ifndef MERCURY_ATOMIC_QUEUE_H
+#define MERCURY_ATOMIC_QUEUE_H
+
+#include "mercury_atomic.h"
+#include "mercury_mem.h"
+
+/* For busy loop spinning */
+#ifndef cpu_spinwait
+#if defined(_WIN32)
+#define cpu_spinwait YieldProcessor
+#elif defined(__x86_64__) || defined(__i386__)
+#include <immintrin.h>
+#define cpu_spinwait _mm_pause
+#elif defined(__arm__)
+#define cpu_spinwait() __asm__ __volatile__("yield")
+#else
+#warning "Processor yield is not supported on this architecture."
+#define cpu_spinwait(x)
+#endif
+#endif
+
+/*************************************/
+/* Public Type and Struct Definition */
+/*************************************/
+
+struct hg_atomic_queue {
+    hg_atomic_int32_t prod_head;
+    hg_atomic_int32_t prod_tail;
+    unsigned int      prod_size;
+    unsigned int      prod_mask;
+    hg_util_uint64_t  drops;
+    hg_atomic_int32_t cons_head __attribute__((aligned(HG_MEM_CACHE_LINE_SIZE)));
+    hg_atomic_int32_t cons_tail;
+    unsigned int      cons_size;
+    unsigned int      cons_mask;
+    hg_atomic_int64_t ring[] __attribute__((aligned(HG_MEM_CACHE_LINE_SIZE)));
+};
+
+/*****************/
+/* Public Macros */
+/*****************/
+
+/*********************/
+/* Public Prototypes */
+/*********************/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Allocate a new queue that can hold count elements.
+ *
+ * \param count [IN] maximum number of elements
+ *
+ * \return pointer to allocated queue or NULL on failure
+ */
+HG_UTIL_PUBLIC struct hg_atomic_queue *hg_atomic_queue_alloc(unsigned int count);
+
+/**
+ * Free an existing queue.
+ *
+ * \param hg_atomic_queue [IN] pointer to queue
+ */
+HG_UTIL_PUBLIC void hg_atomic_queue_free(struct hg_atomic_queue *hg_atomic_queue);
+
+/**
+ * Push an entry to the queue.
+ *
+ * \param hg_atomic_queue [IN/OUT] pointer to queue
+ * \param entry [IN] pointer to object
+ *
+ * \return Non-negative on success or negative on failure
+ */
+static HG_UTIL_INLINE int hg_atomic_queue_push(struct hg_atomic_queue *hg_atomic_queue, void *entry);
+
+/**
+ * Pop an entry from the queue (multi-consumer).
+ *
+ * \param hg_atomic_queue [IN/OUT] pointer to queue
+ *
+ * \return Pointer to popped object or NULL if queue is empty
+ */
+static HG_UTIL_INLINE void *hg_atomic_queue_pop_mc(struct hg_atomic_queue *hg_atomic_queue);
+
+/**
+ * Pop an entry from the queue (single consumer).
+ *
+ * \param hg_atomic_queue [IN/OUT] pointer to queue
+ *
+ * \return Pointer to popped object or NULL if queue is empty
+ */
+static HG_UTIL_INLINE void *hg_atomic_queue_pop_sc(struct hg_atomic_queue *hg_atomic_queue);
+
+/**
+ * Determine whether queue is empty.
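+ *
+ * A single-producer/single-consumer usage sketch (count and entry
+ * illustrative):
+ * \verbatim
+ * struct hg_atomic_queue *queue = hg_atomic_queue_alloc(16);
+ * hg_atomic_queue_push(queue, entry);
+ * void *popped = hg_atomic_queue_pop_sc(queue);
+ * hg_atomic_queue_free(queue);
+ * \endverbatim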
+ * + * \param hg_atomic_queue [IN/OUT] pointer to queue + * + * \return HG_UTIL_TRUE if empty, HG_UTIL_FALSE if not + */ +static HG_UTIL_INLINE hg_util_bool_t hg_atomic_queue_is_empty(struct hg_atomic_queue *hg_atomic_queue); + +/** + * Determine number of entries in a queue. + * + * \param hg_atomic_queue [IN/OUT] pointer to queue + * + * \return Number of entries queued or 0 if none + */ +static HG_UTIL_INLINE unsigned int hg_atomic_queue_count(struct hg_atomic_queue *hg_atomic_queue); + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE int +hg_atomic_queue_push(struct hg_atomic_queue *hg_atomic_queue, void *entry) +{ + hg_util_int32_t prod_head, prod_next, cons_tail; + + do { + prod_head = hg_atomic_get32(&hg_atomic_queue->prod_head); + prod_next = (prod_head + 1) & (int)hg_atomic_queue->prod_mask; + cons_tail = hg_atomic_get32(&hg_atomic_queue->cons_tail); + + if (prod_next == cons_tail) { + hg_atomic_fence(); + if (prod_head == hg_atomic_get32(&hg_atomic_queue->prod_head) && + cons_tail == hg_atomic_get32(&hg_atomic_queue->cons_tail)) { + hg_atomic_queue->drops++; + /* Full */ + return HG_UTIL_FAIL; + } + continue; + } + } while (!hg_atomic_cas32(&hg_atomic_queue->prod_head, prod_head, prod_next)); + + hg_atomic_set64(&hg_atomic_queue->ring[prod_head], (hg_util_int64_t)entry); + + /* + * If there are other enqueues in progress + * that preceded us, we need to wait for them + * to complete + */ + while (hg_atomic_get32(&hg_atomic_queue->prod_tail) != prod_head) + cpu_spinwait(); + + hg_atomic_set32(&hg_atomic_queue->prod_tail, prod_next); + + return HG_UTIL_SUCCESS; +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE void * +hg_atomic_queue_pop_mc(struct hg_atomic_queue *hg_atomic_queue) +{ + hg_util_int32_t cons_head, cons_next; + void * entry = NULL; + + do { + cons_head = hg_atomic_get32(&hg_atomic_queue->cons_head); + cons_next = (cons_head + 1) & (int)hg_atomic_queue->cons_mask; + + if (cons_head == hg_atomic_get32(&hg_atomic_queue->prod_tail)) + return NULL; + } while (!hg_atomic_cas32(&hg_atomic_queue->cons_head, cons_head, cons_next)); + + entry = (void *)hg_atomic_get64(&hg_atomic_queue->ring[cons_head]); + + /* + * If there are other dequeues in progress + * that preceded us, we need to wait for them + * to complete + */ + while (hg_atomic_get32(&hg_atomic_queue->cons_tail) != cons_head) + cpu_spinwait(); + + hg_atomic_set32(&hg_atomic_queue->cons_tail, cons_next); + + return entry; +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE void * +hg_atomic_queue_pop_sc(struct hg_atomic_queue *hg_atomic_queue) +{ + hg_util_int32_t cons_head, cons_next; + hg_util_int32_t prod_tail; + void * entry = NULL; + + cons_head = hg_atomic_get32(&hg_atomic_queue->cons_head); + prod_tail = hg_atomic_get32(&hg_atomic_queue->prod_tail); + cons_next = (cons_head + 1) & (int)hg_atomic_queue->cons_mask; + + if (cons_head == prod_tail) + /* Empty */ + return NULL; + + hg_atomic_set32(&hg_atomic_queue->cons_head, cons_next); + + entry = (void *)hg_atomic_get64(&hg_atomic_queue->ring[cons_head]); + + hg_atomic_set32(&hg_atomic_queue->cons_tail, cons_next); + + return entry; +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE hg_util_bool_t +hg_atomic_queue_is_empty(struct hg_atomic_queue *hg_atomic_queue) +{ + return (hg_atomic_get32(&hg_atomic_queue->cons_head) == 
hg_atomic_get32(&hg_atomic_queue->prod_tail)); +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE unsigned int +hg_atomic_queue_count(struct hg_atomic_queue *hg_atomic_queue) +{ + return ((hg_atomic_queue->prod_size + (unsigned int)hg_atomic_get32(&hg_atomic_queue->prod_tail) - + (unsigned int)hg_atomic_get32(&hg_atomic_queue->cons_tail)) & + hg_atomic_queue->prod_mask); +} + +#ifdef __cplusplus +} +#endif + +#endif /* MERCURY_ATOMIC_QUEUE_H */ diff --git a/src/mercury/include/mercury_bulk.h b/src/mercury/include/mercury_bulk.h new file mode 100644 index 00000000000..598a842be13 --- /dev/null +++ b/src/mercury/include/mercury_bulk.h @@ -0,0 +1,336 @@ +/* + * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy, + * UChicago Argonne, LLC and The HDF Group. + * All rights reserved. + * + * The full copyright notice, including terms governing use, modification, + * and redistribution, is contained in the COPYING file that can be + * found at the root of the source code distribution tree. + */ + +#ifndef MERCURY_BULK_H +#define MERCURY_BULK_H + +#include "mercury_types.h" + +/*************************************/ +/* Public Type and Struct Definition */ +/*************************************/ + +/*****************/ +/* Public Macros */ +/*****************/ + +/* The memory attributes associated with the bulk handle + * can be defined as read only, write only or read-write */ +#define HG_BULK_READ_ONLY (1 << 0) +#define HG_BULK_WRITE_ONLY (1 << 1) +#define HG_BULK_READWRITE (HG_BULK_READ_ONLY | HG_BULK_WRITE_ONLY) + +/*********************/ +/* Public Prototypes */ +/*********************/ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Create an abstract bulk handle from specified memory segments. + * Memory allocated is then freed when HG_Bulk_free() is called. + * \remark If NULL is passed to buf_ptrs, i.e., + * \verbatim HG_Bulk_create(count, NULL, buf_sizes, flags, &handle) \endverbatim + * memory for the missing buf_ptrs array will be internally allocated. + * + * \param hg_class [IN] pointer to HG class + * \param count [IN] number of segments + * \param buf_ptrs [IN] array of pointers + * \param buf_sizes [IN] array of sizes + * \param flags [IN] permission flag: + * - HG_BULK_READWRITE + * - HG_BULK_READ_ONLY + * - HG_BULK_WRITE_ONLY + * \param handle [OUT] pointer to returned abstract bulk handle + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t HG_Bulk_create(hg_class_t *hg_class, hg_uint32_t count, void **buf_ptrs, + const hg_size_t *buf_sizes, hg_uint8_t flags, hg_bulk_t *handle); + +/** + * Free bulk handle. + * + * \param handle [IN/OUT] abstract bulk handle + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t HG_Bulk_free(hg_bulk_t handle); + +/** + * Increment ref count on bulk handle. + * + * \param handle [IN] abstract bulk handle + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t HG_Bulk_ref_incr(hg_bulk_t handle); + +/** + * Bind an existing bulk handle to a local HG context and associate its local + * address. This function can be used to forward and share a bulk handle + * between targets, which would not have direct access to the origin without + * extra RPCs. In that case, the origin address of the bulk handle is embedded + * and serialized/deserialized with HG_Bulk_serialize()/HG_Bulk_deserialize(). 
+ * Users should note that binding a handle adds extra serialization overhead;
+ * it is therefore recommended to use it with care.
+ * When binding a handle on the origin, HG_Bulk_bind_transfer() can be used
+ * since origin information is embedded in the handle.
+ *
+ * Usage example:
+ * Origin sends an RPC request with a bulk handle attached to target A, target A
+ * forwards the origin's bulk handle to another target B. When target B receives
+ * the deserialized bulk handle, it has the address/info required to initiate a
+ * bulk transfer to/from the origin.
+ * For that usage, the origin will have called this function to bind the bulk
+ * handle to its local context, prior to sending the RPC request to target A.
+ *
+ * \param context [IN]          pointer to HG context
+ * \param handle [IN]           abstract bulk handle
+ *
+ * \return HG_SUCCESS or corresponding HG error code
+ */
+HG_PUBLIC hg_return_t HG_Bulk_bind(hg_bulk_t handle, hg_context_t *context);
+
+/**
+ * Return attached addressing information from a handle that was previously
+ * bound to a context using HG_Bulk_bind().
+ *
+ * \param handle [IN]           abstract bulk handle
+ *
+ * \return abstract HG address or HG_ADDR_NULL in case of error
+ */
+HG_PUBLIC hg_addr_t HG_Bulk_get_addr(hg_bulk_t handle);
+
+/**
+ * Return attached context ID from a handle that was previously bound to a
+ * context using HG_Bulk_bind().
+ *
+ * \param handle [IN]           abstract bulk handle
+ *
+ * \return valid context ID or 0 by default
+ */
+HG_PUBLIC hg_uint8_t HG_Bulk_get_context_id(hg_bulk_t handle);
+
+/**
+ * Access bulk handle to retrieve memory segments abstracted by handle.
+ * \remark When using Mercury in co-resident mode (i.e., when the addr passed
+ * is the self addr), this function makes it possible to avoid copying bulk
+ * data by directly accessing pointers from an existing HG bulk handle.
+ *
+ * \param handle [IN]           abstract bulk handle
+ * \param offset [IN]           bulk offset
+ * \param size [IN]             bulk size
+ * \param flags [IN]            permission flag:
+ *                                - HG_BULK_READWRITE
+ *                                - HG_BULK_READ_ONLY
+ * \param max_count [IN]        maximum number of segments to be returned
+ * \param buf_ptrs [IN/OUT]     array of buffer pointers
+ * \param buf_sizes [IN/OUT]    array of buffer sizes
+ * \param actual_count [OUT]    actual number of segments returned
+ *
+ * \return HG_SUCCESS or corresponding HG error code
+ */
+HG_PUBLIC hg_return_t HG_Bulk_access(hg_bulk_t handle, hg_size_t offset, hg_size_t size, hg_uint8_t flags,
+                                     hg_uint32_t max_count, void **buf_ptrs, hg_size_t *buf_sizes,
+                                     hg_uint32_t *actual_count);
+
+/**
+ * Get total size of data abstracted by bulk handle.
+ *
+ * \param handle [IN]           abstract bulk handle
+ *
+ * \return Non-negative value
+ */
+static HG_INLINE hg_size_t HG_Bulk_get_size(hg_bulk_t handle);
+
+/**
+ * Get total number of segments abstracted by bulk handle.
+ *
+ * \param handle [IN]           abstract bulk handle
+ *
+ * \return Non-negative value
+ */
+static HG_INLINE hg_uint32_t HG_Bulk_get_segment_count(hg_bulk_t handle);
+
+/**
+ * Get permission flags set on an existing bulk handle.
+ *
+ * \param handle [IN]           abstract bulk handle
+ *
+ * \return Non-negative value
+ */
+static HG_INLINE hg_uint8_t HG_Bulk_get_flags(hg_bulk_t handle);
+
+/**
+ * Get size required to serialize bulk handle.
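+ *
+ * Illustrative round trip (a sketch only; a zero flags value is assumed and
+ * shipping the buffer between peers is up to the caller):
+ * \verbatim
+ *   hg_size_t n = HG_Bulk_get_serialize_size(handle, 0);
+ *   void *buf = malloc(n);
+ *   HG_Bulk_serialize(buf, n, 0, handle);
+ *   (send buf to the peer, then there:)
+ *   hg_bulk_t handle2;
+ *   HG_Bulk_deserialize(hg_class, &handle2, buf, n);
+ * \endverbatim
+ *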
+ * + * \param handle [IN] abstract bulk handle + * \param flags [IN] option flags, valid flags are: + * HG_BULK_SM, HG_BULK_EAGER + * + * \return Non-negative value + */ +HG_PUBLIC hg_size_t HG_Bulk_get_serialize_size(hg_bulk_t handle, unsigned long flags); + +/** + * Serialize bulk handle into a buffer. + * + * \param buf [IN/OUT] pointer to buffer + * \param buf_size [IN] buffer size + * \param flags [IN] option flags, valid flags are: + * HG_BULK_SM, HG_BULK_EAGER + * \param handle [IN] abstract bulk handle + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t HG_Bulk_serialize(void *buf, hg_size_t buf_size, unsigned long flags, hg_bulk_t handle); + +/** + * Deserialize bulk handle from an existing buffer. + * + * \param hg_class [IN] pointer to HG class + * \param handle [OUT] abstract bulk handle + * \param buf [IN] pointer to buffer + * \param buf_size [IN] buffer size + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t HG_Bulk_deserialize(hg_class_t *hg_class, hg_bulk_t *handle, const void *buf, + hg_size_t buf_size); + +/** + * Transfer data to/from origin using abstract bulk handles and explicit origin + * address information. After completion, user callback is placed into a + * completion queue and can be triggered using HG_Trigger(). + * + * \param context [IN] pointer to HG context + * \param callback [IN] pointer to function callback + * \param arg [IN] pointer to data passed to callback + * \param op [IN] transfer operation: + * - HG_BULK_PUSH + * - HG_BULK_PULL + * \param origin_addr [IN] abstract address of origin + * \param origin_handle [IN] abstract bulk handle + * \param origin_offset [IN] offset + * \param local_handle [IN] abstract bulk handle + * \param local_offset [IN] offset + * \param size [IN] size of data to be transferred + * \param op_id [OUT] pointer to returned operation ID + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t HG_Bulk_transfer(hg_context_t *context, hg_cb_t callback, void *arg, hg_bulk_op_t op, + hg_addr_t origin_addr, hg_bulk_t origin_handle, + hg_size_t origin_offset, hg_bulk_t local_handle, + hg_size_t local_offset, hg_size_t size, hg_op_id_t *op_id); + +/** + * Transfer data to/from origin using abstract bulk handles and implicit origin + * information (embedded in the origin handle). After completion, user callback + * is placed into a completion queue and can be triggered using HG_Trigger(). + * + * \param context [IN] pointer to HG context + * \param callback [IN] pointer to function callback + * \param arg [IN] pointer to data passed to callback + * \param op [IN] transfer operation: + * - HG_BULK_PUSH + * - HG_BULK_PULL + * \param origin_handle [IN] abstract bulk handle + * \param origin_offset [IN] offset + * \param local_handle [IN] abstract bulk handle + * \param local_offset [IN] offset + * \param size [IN] size of data to be transferred + * \param op_id [OUT] pointer to returned operation ID + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t HG_Bulk_bind_transfer(hg_context_t *context, hg_cb_t callback, void *arg, + hg_bulk_op_t op, hg_bulk_t origin_handle, hg_size_t origin_offset, + hg_bulk_t local_handle, hg_size_t local_offset, hg_size_t size, + hg_op_id_t *op_id); + +/** + * Transfer data to/from origin using abstract bulk handles, explicit origin + * address information and origin context ID (associating the transfer to a + * remote context ID). 
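+ * (Illustrative sketch of the plain HG_Bulk_transfer() variant documented
+ * above; done_cb, arg, the handles and size are assumed to already exist:)
+ * \verbatim
+ *   hg_op_id_t op_id;
+ *   HG_Bulk_transfer(context, done_cb, arg, HG_BULK_PULL, origin_addr,
+ *                    origin_handle, 0, local_handle, 0, size, &op_id);
+ * \endverbatim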
After completion, user callback is placed into a + * completion queue and can be triggered using HG_Trigger(). + * + * \param context [IN] pointer to HG context + * \param callback [IN] pointer to function callback + * \param arg [IN] pointer to data passed to callback + * \param op [IN] transfer operation: + * - HG_BULK_PUSH + * - HG_BULK_PULL + * \param origin_addr [IN] abstract address of origin + * \param origin_id [IN] context ID of origin + * \param origin_handle [IN] abstract bulk handle + * \param origin_offset [IN] offset + * \param local_handle [IN] abstract bulk handle + * \param local_offset [IN] offset + * \param size [IN] size of data to be transferred + * \param op_id [OUT] pointer to returned operation ID + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t HG_Bulk_transfer_id(hg_context_t *context, hg_cb_t callback, void *arg, hg_bulk_op_t op, + hg_addr_t origin_addr, hg_uint8_t origin_id, + hg_bulk_t origin_handle, hg_size_t origin_offset, + hg_bulk_t local_handle, hg_size_t local_offset, hg_size_t size, + hg_op_id_t *op_id); + +/** + * Cancel an ongoing operation. + * + * \param op_id [IN] operation ID + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t HG_Bulk_cancel(hg_op_id_t op_id); + +/************************************/ +/* Local Type and Struct Definition */ +/************************************/ + +/* HG bulk descriptor info */ +struct hg_bulk_desc_info { + hg_size_t len; /* Size of region */ + hg_uint32_t segment_count; /* Segment count */ + hg_uint8_t flags; /* Flags of operation access */ +}; + +/*---------------------------------------------------------------------------*/ +static HG_INLINE hg_size_t +HG_Bulk_get_size(hg_bulk_t handle) +{ + return ((struct hg_bulk_desc_info *)handle)->len; +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE hg_uint32_t +HG_Bulk_get_segment_count(hg_bulk_t handle) +{ + return ((struct hg_bulk_desc_info *)handle)->segment_count; +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE hg_uint8_t +HG_Bulk_get_flags(hg_bulk_t handle) +{ + return ((struct hg_bulk_desc_info *)handle)->flags; +} + +#ifdef __cplusplus +} +#endif + +#endif /* MERCURY_BULK_H */ diff --git a/src/mercury/include/mercury_config.h b/src/mercury/include/mercury_config.h new file mode 100644 index 00000000000..6fe5064efe4 --- /dev/null +++ b/src/mercury/include/mercury_config.h @@ -0,0 +1,113 @@ +/* + * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy, + * UChicago Argonne, LLC and The HDF Group. + * All rights reserved. + * + * The full copyright notice, including terms governing use, modification, + * and redistribution, is contained in the COPYING file that can be + * found at the root of the source code distribution tree. + */ + +/* Generated file. Only edit mercury_config.h.in. 
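+ *
+ * For illustration only (not part of the generated file): downstream code
+ * can feature-gate on the version macros defined below, e.g.
+ * \verbatim
+ *   #if HG_VERSION_MAJOR > 2 || (HG_VERSION_MAJOR == 2 && HG_VERSION_MINOR >= 1)
+ *       (code that relies on the 2.1+ API)
+ *   #endif
+ * \endverbatim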
 */
+
+#ifndef MERCURY_CONFIG_H
+#define MERCURY_CONFIG_H
+
+/*************************************/
+/* Public Type and Struct Definition */
+/*************************************/
+
+/* Type definitions */
+#ifdef _WIN32
+typedef signed __int64 hg_int64_t;
+typedef signed __int32 hg_int32_t;
+typedef signed __int16 hg_int16_t;
+typedef signed __int8 hg_int8_t;
+typedef unsigned __int64 hg_uint64_t;
+typedef unsigned __int32 hg_uint32_t;
+typedef unsigned __int16 hg_uint16_t;
+typedef unsigned __int8 hg_uint8_t;
+/* Limits on Integer Constants */
+#define UINT64_MAX _UI64_MAX
+#else
+#include <stdint.h>
+#include <stddef.h>
+typedef int64_t hg_int64_t;
+typedef int32_t hg_int32_t;
+typedef int16_t hg_int16_t;
+typedef int8_t hg_int8_t;
+typedef uint64_t hg_uint64_t;
+typedef uint32_t hg_uint32_t;
+typedef uint16_t hg_uint16_t;
+typedef uint8_t hg_uint8_t;
+#endif
+typedef hg_uint64_t hg_ptr_t;
+typedef hg_uint8_t hg_bool_t;
+
+/* True / false */
+#define HG_TRUE 1
+#define HG_FALSE 0
+
+/*****************/
+/* Public Macros */
+/*****************/
+
+/* Reflects major releases of Mercury */
+#define HG_VERSION_MAJOR 2
+/* Reflects any API changes */
+#define HG_VERSION_MINOR 1
+/* Reflects any library code changes */
+#define HG_VERSION_PATCH 0
+
+/* Visibility of symbols */
+#if defined(_WIN32)
+#define HG_ABI_IMPORT __declspec(dllimport)
+#define HG_ABI_EXPORT __declspec(dllexport)
+#define HG_ABI_HIDDEN
+#elif defined(__GNUC__) && (__GNUC__ >= 4)
+#define HG_ABI_IMPORT __attribute__((visibility("default")))
+#define HG_ABI_EXPORT __attribute__((visibility("default")))
+#define HG_ABI_HIDDEN __attribute__((visibility("hidden")))
+#else
+#define HG_ABI_IMPORT
+#define HG_ABI_EXPORT
+#define HG_ABI_HIDDEN
+#endif
+
+/* Inline macro */
+#ifdef _WIN32
+#define HG_INLINE __inline
+#else
+#define HG_INLINE __inline__
+#endif
+
+/* Fallthrough macro */
+#if defined(__GNUC__) && (__GNUC__ >= 7)
+#define HG_FALLTHROUGH() __attribute__((fallthrough))
+#else
+#define HG_FALLTHROUGH()
+#endif
+
+/* Shared libraries */
+/* #undef HG_BUILD_SHARED_LIBS */
+#ifdef HG_BUILD_SHARED_LIBS
+#ifdef mercury_EXPORTS
+#define HG_PUBLIC HG_ABI_EXPORT
+#else
+#define HG_PUBLIC HG_ABI_IMPORT
+#endif
+#define HG_PRIVATE HG_ABI_HIDDEN
+#else
+#define HG_PUBLIC
+#define HG_PRIVATE
+#endif
+
+/* Build Options */
+/* #undef HG_HAS_BOOST */
+/* #undef HG_HAS_CHECKSUMS */
+/* #undef HG_HAS_XDR */
+/* #undef HG_HAS_COLLECT_STATS */
+
+/* #undef HG_HAS_DEBUG */
+
+#endif /* MERCURY_CONFIG_H */
diff --git a/src/mercury/include/mercury_core.h b/src/mercury/include/mercury_core.h
new file mode 100644
index 00000000000..3d5c850190b
--- /dev/null
+++ b/src/mercury/include/mercury_core.h
@@ -0,0 +1,1074 @@
+/*
+ * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy,
+ * UChicago Argonne, LLC and The HDF Group.
+ * All rights reserved.
+ *
+ * The full copyright notice, including terms governing use, modification,
+ * and redistribution, is contained in the COPYING file that can be
+ * found at the root of the source code distribution tree.
+ */ + +#ifndef MERCURY_CORE_H +#define MERCURY_CORE_H + +#include "mercury_core_header.h" +#include "mercury_core_types.h" + +#include "na.h" + +/*************************************/ +/* Public Type and Struct Definition */ +/*************************************/ + +typedef struct hg_core_class hg_core_class_t; /* Opaque HG core class */ +typedef struct hg_core_context hg_core_context_t; /* Opaque HG core context */ +typedef struct hg_core_addr * hg_core_addr_t; /* Abstract HG address */ +typedef struct hg_core_handle *hg_core_handle_t; /* Abstract RPC handle */ +typedef struct hg_core_op_id * hg_core_op_id_t; /* Abstract operation id */ + +/* HG info struct */ +struct hg_core_info { + hg_core_class_t * core_class; /* HG core class */ + hg_core_context_t *context; /* HG core context */ + hg_core_addr_t addr; /* HG address at target/origin */ + hg_id_t id; /* RPC ID */ + hg_uint8_t context_id; /* Context ID at target/origin */ +}; + +/* Callback info structs */ +struct hg_core_cb_info_lookup { + hg_core_addr_t addr; /* HG address */ +}; + +struct hg_core_cb_info_forward { + hg_core_handle_t handle; /* HG handle */ +}; + +struct hg_core_cb_info_respond { + hg_core_handle_t handle; /* HG handle */ +}; + +struct hg_core_cb_info { + union { /* Union of callback info structures */ + struct hg_core_cb_info_lookup lookup; + struct hg_core_cb_info_forward forward; + struct hg_core_cb_info_respond respond; + } info; + void * arg; /* User data */ + hg_cb_type_t type; /* Callback type */ + hg_return_t ret; /* Return value */ +}; + +/* RPC / HG callbacks */ +typedef hg_return_t (*hg_core_rpc_cb_t)(hg_core_handle_t handle); +typedef hg_return_t (*hg_core_cb_t)(const struct hg_core_cb_info *callback_info); + +/*****************/ +/* Public Macros */ +/*****************/ + +/* Constant values */ +#define HG_CORE_ADDR_NULL ((hg_core_addr_t)0) +#define HG_CORE_HANDLE_NULL ((hg_core_handle_t)0) +#define HG_CORE_OP_ID_NULL ((hg_core_op_id_t)0) +#define HG_CORE_OP_ID_IGNORE ((hg_core_op_id_t *)1) + +/* Flags */ +#define HG_CORE_MORE_DATA (1 << 0) /* More data required */ +#define HG_CORE_NO_RESPONSE (1 << 1) /* No response required */ + +/*********************/ +/* Public Prototypes */ +/*********************/ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Initialize the core Mercury layer. + * Must be finalized with HG_Core_finalize(). + * + * \param na_info_string [IN] host address with port number (e.g., + * "tcp://localhost:3344" or + * "bmi+tcp://localhost:3344") + * \param na_listen [IN] listen for incoming connections + * + * \return Pointer to HG core class or NULL in case of failure + */ +HG_PUBLIC hg_core_class_t *HG_Core_init(const char *na_info_string, hg_bool_t na_listen); + +/** + * Initialize the Mercury layer with options provided by init_info. + * Must be finalized with HG_Core_finalize(). + * \remark HG_Core_init_opt() may become HG_Core_init() in the future. + * + * \param na_info_string [IN] host address with port number (e.g., + * "tcp://localhost:3344" or + * "bmi+tcp://localhost:3344") + * \param na_listen [IN] listen for incoming connections + * \param hg_init_info [IN] (Optional) HG init info, NULL if no info + * + * \return Pointer to HG core class or NULL in case of failure + */ +HG_PUBLIC hg_core_class_t *HG_Core_init_opt(const char *na_info_string, hg_bool_t na_listen, + const struct hg_init_info *hg_init_info); + +/** + * Finalize the Mercury layer. 
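+ *
+ * Typical pairing, shown as a sketch (the address string is only an example;
+ * error handling omitted):
+ * \verbatim
+ *   hg_core_class_t *hg_class = HG_Core_init("tcp://localhost:3344", HG_TRUE);
+ *   (create contexts, register RPCs, run the progress loop, ...)
+ *   HG_Core_finalize(hg_class);
+ * \endverbatim
+ *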
+ * + * \param hg_core_class [IN] pointer to HG core class + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t HG_Core_finalize(hg_core_class_t *hg_core_class); + +/** + * Clean up all temporary files that were created in previous HG instances. + * While temporary resources (e.g., tmp files) are cleaned up on a call + * to HG_Finalize(), this routine gives a chance to programs that terminate + * abnormally to easily clean up those resources. + */ +HG_PUBLIC void HG_Core_cleanup(void); + +/** + * Set callback that will be triggered when additional data needs to be + * transferred and HG_Core_set_more_data() has been called, usually when the + * eager message size is exceeded. This allows upper layers to manually transfer + * data using bulk transfers for example. The done_callback argument allows the + * upper layer to notify back once the data has been successfully acquired. + * The release callback allows the upper layer to release resources that were + * allocated when acquiring the data. + * + * \param hg_core_class [IN] pointer to HG core class + * \param more_data_acquire_callback [IN] pointer to acquire function callback + * \param more_data_release_callback [IN] pointer to release function callback + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t HG_Core_set_more_data_callback( + struct hg_core_class *hg_core_class, + hg_return_t (*more_data_acquire_callback)(hg_core_handle_t, hg_op_t, + hg_return_t (*done_callback)(hg_core_handle_t)), + void (*more_data_release_callback)(hg_core_handle_t)); + +/** + * Obtain the name of the given class. + * + * \param hg_core_class [IN] pointer to HG core class + * + * \return the name of the class, or NULL if not a valid class + */ +static HG_INLINE const char *HG_Core_class_get_name(const hg_core_class_t *hg_core_class); + +/** + * Obtain the protocol of the given class. + * + * \param hg_core_class [IN] pointer to HG core class + * + * \return the protocol of the class, or NULL if not a valid class + */ +static HG_INLINE const char *HG_Core_class_get_protocol(const hg_core_class_t *hg_core_class); + +/** + * Test whether class is listening or not. + * + * \param hg_core_class [IN] pointer to HG core class + * + * \return HG_TRUE if listening or HG_FALSE if not, or not a valid class + */ +static HG_INLINE hg_bool_t HG_Core_class_is_listening(const hg_core_class_t *hg_core_class); + +/** + * Obtain the underlying NA class. + * + * \param hg_core_class [IN] pointer to HG core class + * + * \return Pointer to NA class or NULL if not a valid class + */ +static HG_INLINE na_class_t *HG_Core_class_get_na(const hg_core_class_t *hg_core_class); + +#ifdef NA_HAS_SM +/** + * Obtain the underlying NA SM class. + * + * \param hg_core_class [IN] pointer to HG core class + * + * \return Pointer to NA SM class or NULL if not a valid class + */ +static HG_INLINE na_class_t *HG_Core_class_get_na_sm(const hg_core_class_t *hg_core_class); +#endif + +/** + * Obtain the maximum eager size for sending RPC inputs. + * + * \param hg_core_class [IN] pointer to HG core class + * + * \return the maximum size, or 0 if hg_core_class is not a valid class or + * XDR is being used + */ +static HG_INLINE hg_size_t HG_Core_class_get_input_eager_size(const hg_core_class_t *hg_core_class); + +/** + * Obtain the maximum eager size for sending RPC outputs. 
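+ *
+ * Sketch of how an upper layer might use the eager sizes (illustrative only;
+ * payload_size is the caller's encoded input size):
+ * \verbatim
+ *   if (payload_size <= HG_Core_class_get_input_eager_size(hg_class))
+ *       (encode the input inline in the RPC buffer)
+ *   else
+ *       (fall back to an HG_Bulk transfer for the oversized input)
+ * \endverbatim
+ *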
+ * + * \param hg_core_class [IN] pointer to HG core class + * + * \return the maximum size, or 0 if hg_core_class is not a valid class or XDR + * is being used + */ +static HG_INLINE hg_size_t HG_Core_class_get_output_eager_size(const hg_core_class_t *hg_core_class); + +/** + * Associate user data to class. When HG_Core_finalize() is called, + * free_callback (if defined) is called to free the associated data. + * + * \param hg_core_class [IN] pointer to HG core class + * \param data [IN] pointer to user data + * \param free_callback [IN] pointer to function + * + * \return HG_SUCCESS or corresponding HG error code + */ +static HG_INLINE hg_return_t HG_Core_class_set_data(hg_core_class_t *hg_core_class, void *data, + void (*free_callback)(void *)); + +/** + * Retrieve previously associated data from a given class. + * + * \param hg_core_class [IN] pointer to HG core class + * + * \return Pointer to user data or NULL if not set or any error has occurred + */ +static HG_INLINE void *HG_Core_class_get_data(const hg_core_class_t *hg_core_class); + +/** + * Create a new context. Must be destroyed by calling HG_Core_context_destroy(). + * + * \param hg_core_class [IN] pointer to HG core class + * + * \return Pointer to HG core context or NULL in case of failure + */ +HG_PUBLIC hg_core_context_t *HG_Core_context_create(hg_core_class_t *hg_core_class); + +/** + * Create a new context with a user-defined context identifier. The context + * identifier can be used to route RPC requests to specific contexts by using + * HG_Core_set_target_id(). + * Context must be destroyed by calling HG_Core_context_destroy(). + * + * \param hg_core_class [IN] pointer to HG core class + * \param id [IN] context ID + * + * \return Pointer to HG core context or NULL in case of failure + */ +HG_PUBLIC hg_core_context_t *HG_Core_context_create_id(hg_core_class_t *hg_core_class, hg_uint8_t id); + +/** + * Destroy a context created by HG_Core_context_create(). + * + * \param context [IN] pointer to HG core context + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t HG_Core_context_destroy(hg_core_context_t *context); + +/** + * Retrieve the class used to create the given context. + * + * \param context [IN] pointer to HG core context + * + * \return the associated class + */ +static HG_INLINE hg_core_class_t *HG_Core_context_get_class(const hg_core_context_t *context); + +/** + * Retrieve the underlying NA context. + * + * \param context [IN] pointer to HG core context + * + * \return the associated context + */ +static HG_INLINE na_context_t *HG_Core_context_get_na(const hg_core_context_t *context); + +#ifdef NA_HAS_SM +/** + * Retrieve the underlying NA SM context. + * + * \param context [IN] pointer to HG core context + * + * \return the associated context + */ +static HG_INLINE na_context_t *HG_Core_context_get_na_sm(const hg_core_context_t *context); +#endif + +/** + * Retrieve context ID from context. + * + * \param context [IN] pointer to HG core context + * + * \return Non-negative integer (max value of 255) or 0 if no ID has been set + */ +static HG_INLINE hg_uint8_t HG_Core_context_get_id(const hg_core_context_t *context); + +/** + * Associate user data to context. When HG_Core_context_destroy() is called, + * free_callback (if defined) is called to free the associated data. 
+ *
+ * \param context [IN]          pointer to HG core context
+ * \param data [IN]             pointer to user data
+ * \param free_callback [IN]    pointer to function
+ *
+ * \return HG_SUCCESS or corresponding HG error code
+ */
+static HG_INLINE hg_return_t HG_Core_context_set_data(hg_core_context_t *context, void *data,
+                                                      void (*free_callback)(void *));
+
+/**
+ * Retrieve previously associated data from a given context.
+ *
+ * \param context [IN]          pointer to HG core context
+ *
+ * \return Pointer to user data or NULL if not set or any error has occurred
+ */
+static HG_INLINE void *HG_Core_context_get_data(const hg_core_context_t *context);
+
+/**
+ * Set callback to be called on HG core handle creation. Handles are created
+ * both on HG_Core_create() and HG_Core_context_post() calls. This allows
+ * upper layers to create and attach data to a handle (using HG_Core_set_data())
+ * and later retrieve it using HG_Core_get_data().
+ *
+ * \param context [IN]          pointer to HG core context
+ * \param callback [IN]         pointer to function callback
+ * \param arg [IN]              pointer to data passed to callback
+ *
+ * \return HG_SUCCESS or corresponding HG error code
+ */
+HG_PUBLIC hg_return_t HG_Core_context_set_handle_create_callback(
+    hg_core_context_t *context, hg_return_t (*callback)(hg_core_handle_t, void *), void *arg);
+
+/**
+ * Post requests associated with context in order to receive incoming RPCs.
+ * Requests are automatically re-posted after completion until the context is
+ * destroyed. Additionally, a callback can be triggered on HG handle
+ * creation. This allows upper layers to instantiate data that needs to be
+ * attached to a handle. The number of requests that are posted can be
+ * controlled through HG init info.
+ *
+ * \param context [IN]          pointer to HG core context
+ *
+ * \return HG_SUCCESS or corresponding HG error code
+ */
+HG_PUBLIC hg_return_t HG_Core_context_post(hg_core_context_t *context);
+
+/**
+ * Dynamically register an RPC ID as well as the RPC callback executed
+ * when the RPC request ID is received.
+ *
+ * \param hg_core_class [IN]    pointer to HG core class
+ * \param id [IN]               ID to use to register RPC
+ * \param rpc_cb [IN]           RPC callback
+ *
+ * \return HG_SUCCESS or corresponding HG error code
+ */
+HG_PUBLIC hg_return_t HG_Core_register(hg_core_class_t *hg_core_class, hg_id_t id, hg_core_rpc_cb_t rpc_cb);
+
+/**
+ * Deregister RPC ID. Further requests with that RPC ID will return an error;
+ * it is therefore up to the user to make sure that all requests for that RPC
+ * ID have been treated before it is unregistered.
+ *
+ * \param hg_core_class [IN]    pointer to HG core class
+ * \param id [IN]               registered function ID
+ *
+ * \return HG_SUCCESS or corresponding HG error code
+ */
+HG_PUBLIC hg_return_t HG_Core_deregister(hg_core_class_t *hg_core_class, hg_id_t id);
+
+/**
+ * Indicate whether HG_Core_register() has been called.
+ *
+ * \param hg_core_class [IN]    pointer to HG core class
+ * \param id [IN]               function ID
+ * \param flag [OUT]            pointer to boolean
+ *
+ * \return HG_SUCCESS or corresponding HG error code
+ */
+HG_PUBLIC hg_return_t HG_Core_registered(hg_core_class_t *hg_core_class, hg_id_t id, hg_bool_t *flag);
+
+/**
+ * Register and associate user data to registered function. When
+ * HG_Core_finalize() is called, free_callback (if defined) is called to free
+ * the registered data.
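+ *
+ * Illustrative registration sketch (MY_RPC_ID, my_rpc_cb and state are
+ * hypothetical names, not part of this API):
+ * \verbatim
+ *   static hg_return_t my_rpc_cb(hg_core_handle_t h) { ... return HG_SUCCESS; }
+ *
+ *   HG_Core_register(hg_class, MY_RPC_ID, my_rpc_cb);
+ *   HG_Core_register_data(hg_class, MY_RPC_ID, state, free);
+ * \endverbatim
+ *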
+ * + * \param hg_core_class [IN] pointer to HG core class + * \param id [IN] registered function ID + * \param data [IN] pointer to data + * \param free_callback [IN] pointer to function + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t HG_Core_register_data(hg_core_class_t *hg_core_class, hg_id_t id, void *data, + void (*free_callback)(void *)); + +/** + * Indicate whether HG_Core_register_data() has been called and return + * associated data. + * + * \param hg_core_class [IN] pointer to HG core class + * \param id [IN] registered function ID + * + * \return Pointer to data or NULL + */ +HG_PUBLIC void *HG_Core_registered_data(hg_core_class_t *hg_core_class, hg_id_t id); + +/** + * Lookup an addr from a peer address/name. Addresses need to be + * freed by calling HG_Core_addr_free(). After completion, user callback is + * placed into a completion queue and can be triggered using HG_Core_trigger(). + * + * \param context [IN] pointer to context of execution + * \param callback [IN] pointer to function callback + * \param arg [IN] pointer to data passed to callback + * \param name [IN] lookup name + * \param op_id [OUT] pointer to returned operation ID (unused) + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t HG_Core_addr_lookup1(hg_core_context_t *context, hg_core_cb_t callback, void *arg, + const char *name, hg_core_op_id_t *op_id); + +/** + * Lookup an addr from a peer address/name. Addresses need to be + * freed by calling HG_Core_addr_free(). + * + * \param hg_core_class [IN] pointer to HG core class + * \param name [IN] lookup name + * \param addr [OUT] pointer to abstract address + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t HG_Core_addr_lookup2(hg_core_class_t *hg_core_class, const char *name, + hg_core_addr_t *addr); + +/** + * Free the addr from the list of peers. + * + * \param addr [IN] abstract address + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t HG_Core_addr_free(hg_core_addr_t addr); + +/** + * Hint that the address is no longer valid. This may happen if the peer is + * no longer responding. This can be used to force removal of the + * peer address from the list of the peers, before freeing it and reclaim + * resources. + * + * \param addr [IN] abstract address + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t HG_Core_addr_set_remove(hg_core_addr_t addr); + +/** + * Obtain the underlying NA address from an HG address. + * + * \param addr [IN] abstract address + * + * \return abstract NA addr or NA_ADDR_NULL if not a valid HG address + */ +static HG_INLINE na_addr_t HG_Core_addr_get_na(hg_core_addr_t addr); + +#ifdef NA_HAS_SM +/** + * Obtain the underlying NA SM address from an HG address. + * + * \param addr [IN] abstract address + * + * \return abstract NA addr or NA_ADDR_NULL if not a valid HG address + */ +static HG_INLINE na_addr_t HG_Core_addr_get_na_sm(hg_core_addr_t addr); +#endif + +/** + * Access self address. Address must be freed with HG_Core_addr_free(). + * + * \param hg_core_class [IN] pointer to HG core class + * \param addr [OUT] pointer to abstract address + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t HG_Core_addr_self(hg_core_class_t *hg_core_class, hg_core_addr_t *addr); + +/** + * Duplicate an existing HG abstract address. The duplicated address can be + * stored for later use and the origin address be freed safely. 
The duplicated + * address must be freed with HG_Core_addr_free(). + * + * \param addr [IN] abstract address + * \param new_addr [OUT] pointer to abstract address + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t HG_Core_addr_dup(hg_core_addr_t addr, hg_core_addr_t *new_addr); + +/** + * Compare two addresses. + * + * \param addr1 [IN] abstract address + * \param addr2 [IN] abstract address + * + * \return HG_TRUE if addresses are determined to be equal, HG_FALSE otherwise + */ +HG_PUBLIC hg_bool_t HG_Core_addr_cmp(hg_core_addr_t addr1, hg_core_addr_t addr2); + +/** + * Test whether address is self or not. + * + * \param addr [IN] pointer to abstract address + * + * \return HG_TRUE if address is self address, HG_FALSE otherwise + */ +static HG_INLINE hg_bool_t HG_Core_addr_is_self(hg_core_addr_t addr); + +/** + * Convert an addr to a string (returned string includes the terminating + * null byte '\0'). If buf is NULL, the address is not converted and only + * the required size of the buffer is returned. If the input value passed + * through buf_size is too small, HG_SIZE_ERROR is returned and the buf_size + * output is set to the minimum size required. + * + * \param buf [IN/OUT] pointer to destination buffer + * \param buf_size [IN/OUT] pointer to buffer size + * \param addr [IN] abstract address + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t HG_Core_addr_to_string(char *buf, hg_size_t *buf_size, hg_core_addr_t addr); + +/** + * Get size required to serialize address. + * + * \param addr [IN] abstract address + * \param flags [IN] optional flags + * + * \return Non-negative value + */ +HG_PUBLIC hg_size_t HG_Core_addr_get_serialize_size(hg_core_addr_t addr, unsigned long flags); + +/** + * Serialize address into a buffer. + * + * \param buf [IN/OUT] pointer to destination buffer + * \param buf_size [IN] pointer to buffer size + * \param flags [IN] optional flags + * \param addr [IN] abstract address + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t HG_Core_addr_serialize(void *buf, hg_size_t buf_size, unsigned long flags, + hg_core_addr_t addr); + +/** + * Deserialize address from a buffer. The returned address must be freed with + * HG_Core_addr_free(). + * + * \param hg_core_class [IN] pointer to HG core class + * \param addr [OUT] pointer to abstract address + * \param buf [IN] pointer to buffer used for deserialization + * \param buf_size [IN] buffer size + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t HG_Core_addr_deserialize(hg_core_class_t *hg_core_class, hg_core_addr_t *addr, + const void *buf, hg_size_t buf_size); + +/** + * Initiate a new HG RPC using the specified function ID and the local/remote + * target defined by addr. The HG handle created can be used to query input + * and output buffers, as well as issuing the RPC by using HG_Core_forward(). + * After completion the handle must be freed using HG_Core_destroy(). + * + * \param context [IN] pointer to HG core context + * \param addr [IN] target address + * \param id [IN] registered function ID + * \param handle [OUT] pointer to HG handle + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t HG_Core_create(hg_core_context_t *context, hg_core_addr_t addr, hg_id_t id, + hg_core_handle_t *handle); + +/** + * Destroy HG handle. Decrement reference count, resources associated to the + * handle are freed when the reference count is null. 
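+ *
+ * Handle lifecycle sketch (illustrative only; rpc_id and addr are assumed to
+ * come from earlier registration and lookup):
+ * \verbatim
+ *   hg_core_handle_t h;
+ *   if (HG_Core_create(context, addr, rpc_id, &h) == HG_SUCCESS) {
+ *       (forward h with HG_Core_forward(), see below)
+ *       HG_Core_destroy(h);   (resources released once the ref count drops to 0)
+ *   }
+ * \endverbatim
+ *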
+ * + * \param handle [IN] HG handle + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t HG_Core_destroy(hg_core_handle_t handle); + +/** + * Reset an existing HG handle to make it reusable for RPC forwarding. + * Both target address and RPC ID can be modified at this time. + * Operations on that handle must be completed in order to reset that handle + * safely. + * + * \param handle [IN] HG handle + * \param addr [IN] abstract network address of destination + * \param id [IN] registered function ID + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t HG_Core_reset(hg_core_handle_t handle, hg_core_addr_t addr, hg_id_t id); + +/** + * Increment ref count on handle. + * + * \param handle [IN] HG handle + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t HG_Core_ref_incr(hg_core_handle_t handle); + +/** + * Retrieve ref count from handle. + * + * \param handle [IN] HG handle + * + * \return Non-negative value or negative if the handle is not valid + */ +HG_PUBLIC hg_int32_t HG_Core_ref_get(hg_core_handle_t handle); + +/** + * Allows upper layers to attach data to an existing HG handle. + * The free_callback argument allows allocated resources to be released when + * the handle gets freed. + * + * \param handle [IN] HG handle + * \param data [IN] pointer to user data + * \param free_callback pointer to free function callback + * + * \return HG_SUCCESS or corresponding HG error code + */ +static HG_INLINE hg_return_t HG_Core_set_data(hg_core_handle_t handle, void *data, + void (*free_callback)(void *)); + +/** + * Allows upper layers to retrieve data from an existing HG handle. + * Only valid if HG_Core_set_data() has been previously called. + * + * \param handle [IN] HG handle + * + * \return Pointer to user data or NULL if not set or any error has occurred + */ +static HG_INLINE void *HG_Core_get_data(hg_core_handle_t handle); + +/** + * Get info from handle. + * + * \remark Users must call HG_Core_addr_dup() to safely re-use the addr field. + * + * \param handle [IN] HG handle + * + * \return Pointer to info or NULL in case of failure + */ +static HG_INLINE const struct hg_core_info *HG_Core_get_info(hg_core_handle_t handle); + +/** + * Allows upper layers to retrieve cached RPC data from an existing HG handle. + * Only valid if HG_Core_register_data() has been previously called. + * + * \param handle [IN] HG handle + * + * \return Pointer to user data or NULL if not set or any error has occurred + */ +static HG_INLINE const void *HG_Core_get_rpc_data(hg_core_handle_t handle); + +/** + * Set target context ID that will receive and process the RPC request + * (ID is defined on target context creation, see HG_Core_context_create_id()). + * + * \param handle [IN] HG handle + * \param id [IN] user-defined target context ID + * + * \return HG_SUCCESS or corresponding HG error code + */ +static HG_INLINE hg_return_t HG_Core_set_target_id(hg_core_handle_t handle, hg_uint8_t id); + +/** + * Get input buffer from handle that can be used for serializing/deserializing + * parameters. + * + * \param handle [IN] HG handle + * \param in_buf [OUT] pointer to input buffer + * \param in_buf_size [OUT] pointer to input buffer size + * + * \return HG_SUCCESS or corresponding HG error code + */ +static HG_INLINE hg_return_t HG_Core_get_input(hg_core_handle_t handle, void **in_buf, + hg_size_t *in_buf_size); + +/** + * Get output buffer from handle that can be used for serializing/deserializing + * parameters. 
+ *
+ * \param handle [IN]           HG handle
+ * \param out_buf [OUT]         pointer to output buffer
+ * \param out_buf_size [OUT]    pointer to output buffer size
+ *
+ * \return HG_SUCCESS or corresponding HG error code
+ */
+static HG_INLINE hg_return_t HG_Core_get_output(hg_core_handle_t handle, void **out_buf,
+                                                hg_size_t *out_buf_size);
+
+/**
+ * Forward a call using an existing HG handle. Input and output buffers can be
+ * queried from the handle to serialize/deserialize parameters.
+ * Additionally, a bulk handle can be passed if the size of the input is larger
+ * than the queried input buffer size.
+ * After completion, the handle must be freed using HG_Core_destroy(); the user
+ * callback is placed into a completion queue and can be triggered using
+ * HG_Core_trigger().
+ *
+ * \param handle [IN]           HG handle
+ * \param callback [IN]         pointer to function callback
+ * \param arg [IN]              pointer to data passed to callback
+ * \param flags [IN]            optional flags (e.g., HG_CORE_NO_RESPONSE)
+ * \param payload_size [IN]     size of payload to send
+ *
+ * \return HG_SUCCESS or corresponding HG error code
+ */
+HG_PUBLIC hg_return_t HG_Core_forward(hg_core_handle_t handle, hg_core_cb_t callback, void *arg,
+                                      hg_uint8_t flags, hg_size_t payload_size);
+
+/**
+ * Respond back to the origin. The output buffer, which can be used to encode
+ * the response, must first be queried using HG_Core_get_output().
+ * After completion, the user callback is placed into a completion queue and
+ * can be triggered using HG_Core_trigger().
+ *
+ * \param handle [IN]           HG handle
+ * \param callback [IN]         pointer to function callback
+ * \param arg [IN]              pointer to data passed to callback
+ * \param flags [IN]            optional flags
+ * \param payload_size [IN]     size of payload to send
+ *
+ * \return HG_SUCCESS or corresponding HG error code
+ */
+HG_PUBLIC hg_return_t HG_Core_respond(hg_core_handle_t handle, hg_core_cb_t callback, void *arg,
+                                      hg_uint8_t flags, hg_size_t payload_size);
+
+/**
+ * Try to progress RPC execution until the timeout is reached or any
+ * completion has occurred.
+ * Progress should not be considered as wait, in the sense that it cannot be
+ * assumed that completion of a specific operation will occur only when
+ * progress is called.
+ *
+ * \param context [IN]          pointer to HG core context
+ * \param timeout [IN]          timeout (in milliseconds)
+ *
+ * \return HG_SUCCESS if any completion has occurred / HG error code otherwise
+ */
+HG_PUBLIC hg_return_t HG_Core_progress(hg_core_context_t *context, unsigned int timeout);
+
+/**
+ * Execute at most max_count callbacks. If timeout is non-zero, wait up to
+ * timeout before returning. The function may return as soon as one or more
+ * callbacks have been triggered (at most max_count).
+ *
+ * \param context [IN]          pointer to HG core context
+ * \param timeout [IN]          timeout (in milliseconds)
+ * \param max_count [IN]        maximum number of callbacks triggered
+ * \param actual_count [OUT]    actual number of callbacks triggered
+ *
+ * \return HG_SUCCESS or corresponding HG error code
+ */
+HG_PUBLIC hg_return_t HG_Core_trigger(hg_core_context_t *context, unsigned int timeout,
+                                      unsigned int max_count, unsigned int *actual_count);
+
+/**
+ * Cancel an ongoing operation.
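+ *
+ * For reference, a common progress/trigger loop built from the two calls
+ * above (a sketch; timeout values are arbitrary and error handling elided):
+ * \verbatim
+ *   while (!done) {
+ *       unsigned int count = 0;
+ *       hg_return_t ret;
+ *       do {
+ *           ret = HG_Core_trigger(context, 0, 1, &count);
+ *       } while (ret == HG_SUCCESS && count > 0);
+ *       (void) HG_Core_progress(context, 100);
+ *   }
+ * \endverbatim
+ *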
+ * + * \param handle [IN] HG handle + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t HG_Core_cancel(hg_core_handle_t handle); + +/************************************/ +/* Local Type and Struct Definition */ +/************************************/ + +/* HG core class */ +struct hg_core_class { + na_class_t *na_class; /* NA class */ +#ifdef NA_HAS_SM + na_class_t *na_sm_class; /* NA SM class */ +#endif + void *data; /* User data */ + void (*data_free_callback)(void *); /* User data free callback */ +}; + +/* HG core context */ +struct hg_core_context { + struct hg_core_class *core_class; /* HG core class */ + na_context_t * na_context; /* NA context */ +#ifdef NA_HAS_SM + na_context_t *na_sm_context; /* NA SM context */ +#endif + void *data; /* User data */ + void (*data_free_callback)(void *); /* User data free callback */ + hg_uint8_t id; /* Context ID */ +}; + +/* HG core addr */ +struct hg_core_addr { + struct hg_core_class *core_class; /* HG core class */ + na_addr_t na_addr; /* NA address */ +#ifdef NA_HAS_SM + na_addr_t na_sm_addr; /* NA SM address */ +#endif + hg_bool_t is_self; /* Self address */ +}; + +/* HG core RPC registration info */ +struct hg_core_rpc_info { + hg_core_rpc_cb_t rpc_cb; /* RPC callback */ + void * data; /* User data */ + void (*free_callback)(void *); /* User data free callback */ +}; + +/* HG core handle */ +struct hg_core_handle { + struct hg_core_info info; /* HG info */ + struct hg_core_rpc_info *rpc_info; /* Associated RPC registration info */ + void * data; /* User data */ + void (*data_free_callback)(void *); /* User data free callback */ + void * in_buf; /* Input buffer */ + void * out_buf; /* Output buffer */ + na_size_t in_buf_size; /* Input buffer size */ + na_size_t out_buf_size; /* Output buffer size */ + na_size_t na_in_header_offset; /* Input NA header offset */ + na_size_t na_out_header_offset; /* Output NA header offset */ +}; + +/*---------------------------------------------------------------------------*/ +static HG_INLINE const char * +HG_Core_class_get_name(const hg_core_class_t *hg_core_class) +{ + return NA_Get_class_name(hg_core_class->na_class); +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE const char * +HG_Core_class_get_protocol(const hg_core_class_t *hg_core_class) +{ + return NA_Get_class_protocol(hg_core_class->na_class); +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE hg_bool_t +HG_Core_class_is_listening(const hg_core_class_t *hg_core_class) +{ + return NA_Is_listening(hg_core_class->na_class); +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE na_class_t * + HG_Core_class_get_na(const hg_core_class_t *hg_core_class) +{ + return hg_core_class->na_class; +} + +/*---------------------------------------------------------------------------*/ +#ifdef NA_HAS_SM +static HG_INLINE na_class_t * + HG_Core_class_get_na_sm(const hg_core_class_t *hg_core_class) +{ + return hg_core_class->na_sm_class; +} +#endif + +/*---------------------------------------------------------------------------*/ +static HG_INLINE hg_size_t +HG_Core_class_get_input_eager_size(const hg_core_class_t *hg_core_class) +{ + hg_size_t unexp = NA_Msg_get_max_unexpected_size(hg_core_class->na_class), + header = hg_core_header_request_get_size() + + NA_Msg_get_unexpected_header_size(hg_core_class->na_class); + + return (unexp > header) ? 
unexp - header : 0; +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE hg_size_t +HG_Core_class_get_output_eager_size(const hg_core_class_t *hg_core_class) +{ + hg_size_t exp = NA_Msg_get_max_expected_size(hg_core_class->na_class), + header = hg_core_header_response_get_size() + + NA_Msg_get_expected_header_size(hg_core_class->na_class); + + return (exp > header) ? exp - header : 0; +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE hg_return_t +HG_Core_class_set_data(hg_core_class_t *hg_core_class, void *data, void (*free_callback)(void *)) +{ + hg_core_class->data = data; + hg_core_class->data_free_callback = free_callback; + + return HG_SUCCESS; +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE void * +HG_Core_class_get_data(const hg_core_class_t *hg_core_class) +{ + return hg_core_class->data; +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE hg_core_class_t * + HG_Core_context_get_class(const hg_core_context_t *context) +{ + return context->core_class; +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE na_context_t * + HG_Core_context_get_na(const hg_core_context_t *context) +{ + return context->na_context; +} + +/*---------------------------------------------------------------------------*/ +#ifdef NA_HAS_SM +static HG_INLINE na_context_t * + HG_Core_context_get_na_sm(const hg_core_context_t *context) +{ + return context->na_sm_context; +} +#endif + +/*---------------------------------------------------------------------------*/ +static HG_INLINE hg_uint8_t +HG_Core_context_get_id(const hg_core_context_t *context) +{ + return context->id; +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE hg_return_t +HG_Core_context_set_data(hg_core_context_t *context, void *data, void (*free_callback)(void *)) +{ + context->data = data; + context->data_free_callback = free_callback; + + return HG_SUCCESS; +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE void * +HG_Core_context_get_data(const hg_core_context_t *context) +{ + return context->data; +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE na_addr_t +HG_Core_addr_get_na(hg_core_addr_t addr) +{ + return addr->na_addr; +} + +/*---------------------------------------------------------------------------*/ +#ifdef NA_HAS_SM +static HG_INLINE na_addr_t +HG_Core_addr_get_na_sm(hg_core_addr_t addr) +{ + return addr->na_sm_addr; +} +#endif + +/*---------------------------------------------------------------------------*/ +static HG_INLINE hg_bool_t +HG_Core_addr_is_self(hg_core_addr_t addr) +{ + return addr->is_self; +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE hg_return_t +HG_Core_set_data(hg_core_handle_t handle, void *data, void (*free_callback)(void *)) +{ + handle->data = data; + handle->data_free_callback = free_callback; + + return HG_SUCCESS; +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE void * +HG_Core_get_data(hg_core_handle_t handle) +{ + return handle->data; +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE const struct hg_core_info * 
+HG_Core_get_info(hg_core_handle_t handle) +{ + return &handle->info; +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE const void * +HG_Core_get_rpc_data(hg_core_handle_t handle) +{ + return (handle->rpc_info) ? handle->rpc_info->data : NULL; +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE hg_return_t +HG_Core_set_target_id(hg_core_handle_t handle, hg_uint8_t id) +{ + handle->info.context_id = id; + + return HG_SUCCESS; +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE hg_return_t +HG_Core_get_input(hg_core_handle_t handle, void **in_buf, hg_size_t *in_buf_size) +{ + hg_size_t header_offset = hg_core_header_request_get_size() + handle->na_in_header_offset; + + /* Space must be left for request header */ + *in_buf = (char *)handle->in_buf + header_offset; + *in_buf_size = handle->in_buf_size - header_offset; + + return HG_SUCCESS; +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE hg_return_t +HG_Core_get_output(hg_core_handle_t handle, void **out_buf, hg_size_t *out_buf_size) +{ + hg_size_t header_offset = hg_core_header_response_get_size() + handle->na_out_header_offset; + + /* Space must be left for response header */ + *out_buf = (char *)handle->out_buf + header_offset; + *out_buf_size = handle->out_buf_size - header_offset; + + return HG_SUCCESS; +} + +#ifdef __cplusplus +} +#endif + +#endif /* MERCURY_CORE_H */ diff --git a/src/mercury/include/mercury_core_header.h b/src/mercury/include/mercury_core_header.h new file mode 100644 index 00000000000..355adfa55b3 --- /dev/null +++ b/src/mercury/include/mercury_core_header.h @@ -0,0 +1,224 @@ +/* + * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy, + * UChicago Argonne, LLC and The HDF Group. + * All rights reserved. + * + * The full copyright notice, including terms governing use, modification, + * and redistribution, is contained in the COPYING file that can be + * found at the root of the source code distribution tree. + */ + +#ifndef MERCURY_CORE_HEADER_H +#define MERCURY_CORE_HEADER_H + +#include "mercury_core_types.h" + +/*************************************/ +/* Public Type and Struct Definition */ +/*************************************/ + +#if defined(__GNUC__) || defined(_WIN32) +#pragma pack(push, 1) +#else +#warning "Proc header struct padding may not be consistent across platforms." 
+#endif +#ifdef HG_HAS_CHECKSUMS +union hg_core_header_hash { + hg_uint16_t header; /* Header checksum (16-bits checksum) */ + hg_uint32_t pad; +}; +#endif + +struct hg_core_header_request { + hg_uint8_t hg; /* Mercury identifier */ + hg_uint8_t protocol; /* Version number */ + hg_uint64_t id; /* RPC request identifier */ + hg_uint8_t flags; /* Flags */ + hg_uint8_t cookie; /* Cookie */ + /* 96 bits here */ +#ifdef HG_HAS_CHECKSUMS + union hg_core_header_hash hash; /* Hash */ + /* 128 bits here */ +#endif +}; + +struct hg_core_header_response { + hg_int8_t ret_code; /* Return code */ + hg_uint8_t flags; /* Flags */ + hg_uint16_t cookie; /* Cookie */ + hg_uint64_t pad; /* Pad */ + /* 96 bits here */ +#ifdef HG_HAS_CHECKSUMS + union hg_core_header_hash hash; /* Hash */ + /* 128 bits here */ +#endif +}; +#if defined(__GNUC__) || defined(_WIN32) +#pragma pack(pop) +#endif + +/* Common header struct request/response */ +struct hg_core_header { + union { + struct hg_core_header_request request; + struct hg_core_header_response response; + } msg; +#ifdef HG_HAS_CHECKSUMS + void *checksum; /* Checksum of header */ +#endif +}; + +/* + * 0 HG_CORE_HEADER_SIZE size + * |______________|__________________________| + * | Header | Encoded Data | + * |______________|__________________________| + * + * + * Request: + * mercury byte / protocol version number / rpc id / flags / cookie / checksum + * + * Response: + * flags / return code / cookie / checksum + */ + +/*****************/ +/* Public Macros */ +/*****************/ + +/* Mercury identifier for packets sent */ +#define HG_CORE_IDENTIFIER (('H' << 1) | ('G')) /* 0xD7 */ + +/* Mercury protocol version number */ +#define HG_CORE_PROTOCOL_VERSION 0x05 + +/*********************/ +/* Public Prototypes */ +/*********************/ + +#ifdef __cplusplus +extern "C" { +#endif + +static HG_INLINE size_t hg_core_header_request_get_size(void); +static HG_INLINE size_t hg_core_header_response_get_size(void); + +/** + * Get size reserved for request header (separate user data stored in payload). + * + * \return Non-negative size value + */ +static HG_INLINE size_t +hg_core_header_request_get_size(void) +{ + return sizeof(struct hg_core_header_request); +} + +/** + * Get size reserved for response header (separate user data stored in payload). + * + * \return Non-negative size value + */ +static HG_INLINE size_t +hg_core_header_response_get_size(void) +{ + return sizeof(struct hg_core_header_response); +} + +/** + * Initialize RPC request header. + * + * \param hg_core_header [IN/OUT] pointer to request header structure + * + */ +HG_PRIVATE void hg_core_header_request_init(struct hg_core_header *hg_core_header); + +/** + * Initialize RPC response header. + * + * \param hg_core_header [IN/OUT] pointer to response header structure + * + */ +HG_PRIVATE void hg_core_header_response_init(struct hg_core_header *hg_core_header); + +/** + * Finalize RPC request header. + * + * \param hg_core_header [IN/OUT] pointer to request header structure + * + */ +HG_PRIVATE void hg_core_header_request_finalize(struct hg_core_header *hg_core_header); + +/** + * Finalize RPC response header. + * + * \param hg_core_header [IN/OUT] pointer to response header structure + * + */ +HG_PRIVATE void hg_core_header_response_finalize(struct hg_core_header *hg_core_header); + +/** + * Reset RPC request header. 
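+ *
+ * Internal usage sketch (illustrative; these HG_PRIVATE helpers are called by
+ * the core layer, and buf/buf_size name a message buffer):
+ * \verbatim
+ *   struct hg_core_header hdr;
+ *   hg_core_header_request_init(&hdr);
+ *   hg_core_header_request_proc(HG_ENCODE, buf, buf_size, &hdr);
+ *   (on the receiving side: proc with HG_DECODE, then)
+ *   hg_core_header_request_verify(&hdr);
+ *   hg_core_header_request_finalize(&hdr);
+ * \endverbatim
+ *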
+ * + * \param hg_core_header [IN/OUT] pointer to request header structure + * + */ +HG_PRIVATE void hg_core_header_request_reset(struct hg_core_header *hg_core_header); + +/** + * Reset RPC response header. + * + * \param hg_core_header [IN/OUT] pointer to response header structure + * + */ +HG_PRIVATE void hg_core_header_response_reset(struct hg_core_header *hg_core_header); + +/** + * Process private information for sending/receiving RPC request. + * + * \param op [IN] operation type: HG_ENCODE / HG_DECODE + * \param buf [IN/OUT] buffer + * \param buf_size [IN] buffer size + * \param hg_core_header [IN/OUT] pointer to header structure + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PRIVATE hg_return_t hg_core_header_request_proc(hg_proc_op_t op, void *buf, size_t buf_size, + struct hg_core_header *hg_core_header); + +/** + * Process private information for sending/receiving response. + * + * \param op [IN] operation type: HG_ENCODE / HG_DECODE + * \param buf [IN/OUT] buffer + * \param buf_size [IN] buffer size + * \param header [IN/OUT] pointer to header structure + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PRIVATE hg_return_t hg_core_header_response_proc(hg_proc_op_t op, void *buf, size_t buf_size, + struct hg_core_header *hg_core_header); + +/** + * Verify private information from request header. + * + * \param hg_core_header [IN] pointer to request header structure + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PRIVATE hg_return_t hg_core_header_request_verify(const struct hg_core_header *hg_core_header); + +/** + * Verify private information from response header. + * + * \param hg_core_header [IN] pointer to response header structure + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PRIVATE hg_return_t hg_core_header_response_verify(const struct hg_core_header *hg_core_header); + +#ifdef __cplusplus +} +#endif + +#endif /* MERCURY_CORE_HEADER_H */ diff --git a/src/mercury/include/mercury_core_types.h b/src/mercury/include/mercury_core_types.h new file mode 100644 index 00000000000..636ab756c70 --- /dev/null +++ b/src/mercury/include/mercury_core_types.h @@ -0,0 +1,156 @@ +/* + * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy, + * UChicago Argonne, LLC and The HDF Group. + * All rights reserved. + * + * The full copyright notice, including terms governing use, modification, + * and redistribution, is contained in the COPYING file that can be + * found at the root of the source code distribution tree. + */ + +#ifndef MERCURY_CORE_TYPES_H +#define MERCURY_CORE_TYPES_H + +#include "mercury_config.h" +#include "na_types.h" + +/*************************************/ +/* Public Type and Struct Definition */ +/*************************************/ + +typedef hg_uint64_t hg_size_t; /* Size */ +typedef hg_uint64_t hg_id_t; /* RPC ID */ + +/** + * HG init info struct + * NB. should be initialized using HG_INIT_INFO_INITIALIZER + */ +struct hg_init_info { + /* NA init info struct, see na_types.h for documentation */ + struct na_init_info na_init_info; + + /* Optional NA class that can be used for initializing an HG class. Using + * that option makes the init string passed to HG_Init() ignored. + * Default is: NULL */ + na_class_t *na_class; + + /* Controls the initial number of requests that are posted on context + * creation when the HG class is initialized with listen set to true. + * A value of zero is equivalent to using the internal default value. 
+ * Default value is: 256 */ + hg_uint32_t request_post_init; + + /* Controls the number of requests that are incrementally posted when the + * initial number of requests is exhausted, a value of 0 means that only the + * initial number of requests will be re-used after they complete. Note that + * if the number of requests that are posted reaches 0, the underlying + * NA transport is responsible for queueing incoming requests. This value is + * used only if \request_post_init is set to a non-zero value. + * Default value is: 256 */ + hg_uint32_t request_post_incr; + + /* Controls whether the NA shared-memory interface should be automatically + * used if/when the RPC target address shares the same node as its origin. + * Default is: false */ + hg_bool_t auto_sm; + + /* Controls whether mercury should _NOT_ attempt to transfer small bulk data + * along with the RPC request. + * Default is: false */ + hg_bool_t no_bulk_eager; + + /* Disable internal loopback interface that enables forwarding of RPC + * requests to self addresses. Doing so will force traffic to be routed + * through NA. For performance reasons, users should be cautious when using + * that option. + * Default is: false */ + hg_bool_t no_loopback; + + /* (Debug) Print stats at exit. + * Default is: false */ + hg_bool_t stats; +}; + +/* Error return codes: + * Functions return 0 for success or corresponding return code */ +#define HG_RETURN_VALUES \ + X(HG_SUCCESS) /*!< operation succeeded */ \ + X(HG_PERMISSION) /*!< operation not permitted */ \ + X(HG_NOENTRY) /*!< no such file or directory */ \ + X(HG_INTERRUPT) /*!< operation interrupted */ \ + X(HG_AGAIN) /*!< operation must be retried */ \ + X(HG_NOMEM) /*!< out of memory */ \ + X(HG_ACCESS) /*!< permission denied */ \ + X(HG_FAULT) /*!< bad address */ \ + X(HG_BUSY) /*!< device or resource busy */ \ + X(HG_EXIST) /*!< entry already exists */ \ + X(HG_NODEV) /*!< no such device */ \ + X(HG_INVALID_ARG) /*!< invalid argument */ \ + X(HG_PROTOCOL_ERROR) /*!< protocol error */ \ + X(HG_OVERFLOW) /*!< value too large */ \ + X(HG_MSGSIZE) /*!< message size too long */ \ + X(HG_PROTONOSUPPORT) /*!< protocol not supported */ \ + X(HG_OPNOTSUPPORTED) /*!< operation not supported on endpoint */ \ + X(HG_ADDRINUSE) /*!< address already in use */ \ + X(HG_ADDRNOTAVAIL) /*!< cannot assign requested address */ \ + X(HG_HOSTUNREACH) /*!< cannot reach host during operation */ \ + X(HG_TIMEOUT) /*!< operation reached timeout */ \ + X(HG_CANCELED) /*!< operation canceled */ \ + X(HG_CHECKSUM_ERROR) /*!< checksum error */ \ + X(HG_NA_ERROR) /*!< generic NA error */ \ + X(HG_OTHER_ERROR) /*!< generic HG error */ \ + X(HG_RETURN_MAX) + +#define X(a) a, +typedef enum hg_return { HG_RETURN_VALUES } hg_return_t; +#undef X + +/* Compat return codes */ +#define HG_INVALID_PARAM HG_INVALID_ARG +#define HG_SIZE_ERROR HG_MSGSIZE +#define HG_NOMEM_ERROR HG_NOMEM +#define HG_NO_MATCH HG_NOENTRY + +/* Callback operation type */ +typedef enum hg_cb_type { + HG_CB_LOOKUP, /*!< lookup callback */ + HG_CB_FORWARD, /*!< forward callback */ + HG_CB_RESPOND, /*!< respond callback */ + HG_CB_BULK /*!< bulk transfer callback */ +} hg_cb_type_t; + +/* Input / output operation type */ +typedef enum { HG_UNDEF, HG_INPUT, HG_OUTPUT } hg_op_t; + +/** + * Encode/decode operations. 
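The HG_RETURN_VALUES X-macro above expands once into the hg_return_t enum; the same list can expand a second time into a matching string table, which keeps error names in sync with the enum by construction. A small sketch of that pattern (hg_return_name_g and hg_return_name are hypothetical helpers, not part of this header):

```c
/* Expand the same HG_RETURN_VALUES list into a name table. */
#define X(a) #a,
static const char *const hg_return_name_g[] = {HG_RETURN_VALUES};
#undef X

/* Hypothetical helper: map a return code to its symbolic name,
 * e.g. hg_return_name(HG_TIMEOUT) yields "HG_TIMEOUT". */
static const char *
hg_return_name(hg_return_t ret)
{
    return (ret >= HG_SUCCESS && ret < HG_RETURN_MAX) ? hg_return_name_g[ret] : "UNKNOWN";
}
```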
+ */ +typedef enum { + HG_ENCODE, /*!< causes the type to be encoded into the stream */ + HG_DECODE, /*!< causes the type to be extracted from the stream */ + HG_FREE /*!< can be used to release the space allocated by an HG_DECODE + request */ +} hg_proc_op_t; + +/** + * Encode/decode operation flags. + */ +#define HG_CORE_SM (1 << 0) + +/*****************/ +/* Public Macros */ +/*****************/ + +/* Max timeout */ +#define HG_MAX_IDLE_TIME (3600 * 1000) + +/* HG size max */ +#define HG_SIZE_MAX (UINT64_MAX) + +/* HG init info initializer */ +#define HG_INIT_INFO_INITIALIZER \ + { \ + NA_INIT_INFO_INITIALIZER, NULL, 0, 0, HG_FALSE, HG_FALSE, HG_FALSE, HG_FALSE \ + } + +#endif /* MERCURY_CORE_TYPES_H */ diff --git a/src/mercury/include/mercury_dlog.h b/src/mercury/include/mercury_dlog.h new file mode 100644 index 00000000000..557b7451797 --- /dev/null +++ b/src/mercury/include/mercury_dlog.h @@ -0,0 +1,273 @@ +/* + * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy, + * UChicago Argonne, LLC and The HDF Group. + * All rights reserved. + * + * The full copyright notice, including terms governing use, modification, + * and redistribution, is contained in the COPYING file that can be + * found at the root of the source code distribution tree. + */ + +#ifndef MERCURY_DLOG_H +#define MERCURY_DLOG_H + +#include "mercury_util_config.h" + +#include "mercury_atomic.h" +#include "mercury_list.h" +#include "mercury_thread_mutex.h" +#include "mercury_time.h" + +#include + +/*****************/ +/* Public Macros */ +/*****************/ + +/* + * putting a magic number at the front of the dlog allows us to search + * for a dlog in a coredump file after a crash and examine its contents. + */ +#define HG_DLOG_MAGICLEN 16 /* bytes to reserve for magic# */ +#define HG_DLOG_STDMAGIC ">D.LO.G<" /* standard for first 8 bytes */ + +/* + * HG_DLOG_INITIALIZER: initializer for a dlog in a global variable. + * LESIZE is the number of entries in the LE array. 
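HG_INIT_INFO_INITIALIZER above fills hg_init_info with its documented defaults, so callers only override the fields they care about. A hedged sketch; HG_Init_opt() from mercury.h as the consuming entry point and the "ofi+tcp" NA info string are assumptions about the surrounding Mercury API, not part of this header:

```c
#include "mercury.h"

/* Sketch: start a listening class with a larger request pool and
 * same-node shared-memory routing, leaving everything else default. */
static hg_class_t *
init_listener(void)
{
    struct hg_init_info info = HG_INIT_INFO_INITIALIZER;

    info.request_post_init = 512;     /* post 512 requests at context creation */
    info.request_post_incr = 256;     /* repost in batches of 256 when exhausted */
    info.auto_sm           = HG_TRUE; /* same-node peers go through SM */

    /* HG_Init_opt() is assumed here as the entry point that consumes
     * struct hg_init_info. */
    return HG_Init_opt("ofi+tcp://127.0.0.1:4444", HG_TRUE, &info);
}
```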
use it like this: + * + * #define FOO_NENTS 128 + * struct hg_dlog_entry foo_le[FOO_NENTS]; + * struct hg_dlog foo_dlog = HG_DLOG_INITIALIZER("foo", foo_le, FOO_NENTS, 0); + */ +#define HG_DLOG_INITIALIZER(NAME, LE, LESIZE, LELOOP) \ + { \ + HG_DLOG_STDMAGIC NAME, HG_THREAD_MUTEX_INITIALIZER, HG_LIST_HEAD_INITIALIZER(cnts32), \ + HG_LIST_HEAD_INITIALIZER(cnts64), LE, LESIZE, LELOOP, 0, 0, 0, 0 \ + } + +/*************************************/ +/* Public Type and Struct Definition */ +/*************************************/ + +/* + * hg_dlog_entry: an entry in the dlog + */ +struct hg_dlog_entry { + const char * file; /* file name */ + unsigned int line; /* line number */ + const char * func; /* function name */ + const char * msg; /* entry message (optional) */ + const void * data; /* user data (optional) */ + hg_time_t time; /* time added to log */ +}; + +/* + * hg_dlog_dcount32: 32-bit debug counter in the dlog + */ +struct hg_dlog_dcount32 { + const char * name; /* counter name (short) */ + const char * descr; /* description of counter */ + hg_atomic_int32_t c; /* the counter itself */ + HG_LIST_ENTRY(hg_dlog_dcount32) l; /* linkage */ +}; + +/* + * hg_dlog_dcount64: 64-bit debug counter in the dlog + */ +struct hg_dlog_dcount64 { + const char * name; /* counter name (short) */ + const char * descr; /* description of counter */ + hg_atomic_int64_t c; /* the counter itself */ + HG_LIST_ENTRY(hg_dlog_dcount64) l; /* linkage */ +}; + +/* + * hg_dlog: main structure + */ +struct hg_dlog { + char dlog_magic[HG_DLOG_MAGICLEN]; /* magic number + name */ + hg_thread_mutex_t dlock; /* lock for this data struct */ + + /* counter lists */ + HG_LIST_HEAD(hg_dlog_dcount32) cnts32; /* counter list */ + HG_LIST_HEAD(hg_dlog_dcount64) cnts64; /* counter list */ + + /* log */ + struct hg_dlog_entry *le; /* array of log entries */ + unsigned int lesize; /* size of le[] array */ + int leloop; /* circular buffer? */ + unsigned int lefree; /* next free entry in le[] */ + unsigned int leadds; /* #adds done if < lesize */ + int lestop; /* stop taking new logs */ + + int mallocd; /* allocated with malloc? */ +}; + +/*********************/ +/* Public Prototypes */ +/*********************/ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * malloc and return a new dlog + * + * \param name [IN] name of dlog (truncated past 8 bytes) + * \param lesize [IN] number of entries to allocate for log buffer + * \param leloop [IN] set to make log circular (can overwrite old + * entries) + * + * \return the new dlog or NULL on malloc error + */ +HG_UTIL_PUBLIC struct hg_dlog *hg_dlog_alloc(char *name, unsigned int lesize, int leloop); + +/** + * free anything we malloc'd on a dlog. assumes we have the final + * active reference to dlog and it won't be used anymore after this + * call (so no need to lock it). + * + * \param d [IN] the dlog to finalize + */ +HG_UTIL_PUBLIC void hg_dlog_free(struct hg_dlog *d); + +/** + * make a named atomic32 counter in a dlog and return a pointer to + * it. we use the dlock to ensure a counter under a given name only + * gets created once (makes it easy to share a counter across files). + * aborts if unable to alloc counter. 
use it like this:
+ *
+ *   hg_atomic_int32_t *foo_count;
+ *   static int init = 0;
+ *   if (init == 0) {
+ *       hg_dlog_mkcount32(dlog, &foo_count, "foocount", "counts of foo");
+ *       init = 1;
+ *   }
+ *
+ * \param d     [IN]            dlog to create the counter in
+ * \param cptr  [IN/OUT]        pointer to use for counter (set to NULL to
+ *                              start)
+ * \param name  [IN]            short one word name for counter
+ * \param descr [IN]            short description of counter
+ */
+HG_UTIL_PUBLIC void hg_dlog_mkcount32(struct hg_dlog *d, hg_atomic_int32_t **cptr, const char *name,
+                                      const char *descr);
+
+/**
+ * make a named atomic64 counter in a dlog and return a pointer to
+ * it. we use the dlock to ensure a counter under a given name only
+ * gets created once (makes it easy to share a counter across files).
+ * aborts if unable to alloc counter. use it like this:
+ *
+ *   hg_atomic_int64_t *foo_count;
+ *   static int init = 0;
+ *   if (init == 0) {
+ *       hg_dlog_mkcount64(dlog, &foo_count, "foocount", "counts of foo");
+ *       init = 1;
+ *   }
+ *
+ * \param d     [IN]            dlog to create the counter in
+ * \param cptr  [IN/OUT]        pointer to use for counter (set to NULL to
+ *                              start)
+ * \param name  [IN]            short one word name for counter
+ * \param descr [IN]            short description of counter
+ */
+HG_UTIL_PUBLIC void hg_dlog_mkcount64(struct hg_dlog *d, hg_atomic_int64_t **cptr, const char *name,
+                                      const char *descr);
+
+/**
+ * attempt to add a log record to a dlog. the file, func, and msg arguments
+ * should point to static strings that are valid throughout the life of
+ * the program (not something that is on the stack).
+ *
+ * \param d     [IN]            the dlog to add the log record to
+ * \param file  [IN]            file entry
+ * \param line  [IN]            line entry
+ * \param func  [IN]            func entry
+ * \param msg   [IN]            log entry message (optional, NULL ok)
+ * \param data  [IN]            user data pointer for record (optional, NULL ok)
+ *
+ * \return 1 if added, 0 otherwise
+ */
+static HG_UTIL_INLINE unsigned int hg_dlog_addlog(struct hg_dlog *d, const char *file, unsigned int line,
+                                                  const char *func, const char *msg, const void *data);
+
+/**
+ * set the value of stop for a dlog (to enable/disable logging)
+ *
+ * \param d     [IN]            dlog to set stop in
+ * \param stop  [IN]            value of stop to use (1=stop, 0=go)
+ */
+HG_UTIL_PUBLIC void hg_dlog_setlogstop(struct hg_dlog *d, int stop);
+
+/**
+ * reset the log. this does not change the counters (since users
+ * have direct access to the hg_atomic_int64_t's, we don't need
+ * an API to change them here).
+ *
+ * \param d     [IN]            dlog to reset
+ */
+HG_UTIL_PUBLIC void hg_dlog_resetlog(struct hg_dlog *d);
+
+/**
+ * dump dlog info to a stream. set trylock if you want to dump even
+ * if it is locked (e.g. you are crashing and you don't care about
+ * locking).
+ *
+ * \param d        [IN]         dlog to dump
+ * \param log_func [IN]         log function to use (default printf)
+ * \param stream   [IN]         stream to use
+ * \param trylock  [IN]         just try to lock (warn if it fails)
+ */
+HG_UTIL_PUBLIC void hg_dlog_dump(struct hg_dlog *d, int (*log_func)(FILE *, const char *, ...), FILE *stream,
+                                 int trylock);
+
+/**
+ * dump dlog info to a file. set trylock if you want to dump even
+ * if it is locked (e.g. you are crashing and you don't care about
+ * locking). the output file is "base.log" or "base-pid.log" depending
+ * on the value of addpid.
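Taken together, the dlog calls above support a statically allocated circular trace log. A minimal sketch using only the APIs declared in this header (my_dlog_g, trace_point, and trace_flush are illustrative names):

```c
#include "mercury_dlog.h"

/* A 64-entry circular debug log, dumped to "mylog.log" on demand. */
#define MY_NENTS 64
static struct hg_dlog_entry my_le_g[MY_NENTS];
static struct hg_dlog my_dlog_g = HG_DLOG_INITIALIZER("mylog", my_le_g, MY_NENTS, 1);

static void
trace_point(const char *static_msg)
{
    /* file/func/msg must be static strings, per the note above */
    hg_dlog_addlog(&my_dlog_g, __FILE__, __LINE__, __func__, static_msg, NULL);
}

static void
trace_flush(void)
{
    hg_dlog_dump_file(&my_dlog_g, "mylog", 0 /* no pid suffix */, 0);
}
```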
+ * + * \param d [IN] dlog to dump + * \param base [IN] output file basename + * \param addpid [IN] add pid to output filename + * \param trylock [IN] just try to lock (warn if it fails) + */ +HG_UTIL_PUBLIC void hg_dlog_dump_file(struct hg_dlog *d, const char *base, int addpid, int trylock); + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE unsigned int +hg_dlog_addlog(struct hg_dlog *d, const char *file, unsigned int line, const char *func, const char *msg, + const void *data) +{ + unsigned int rv = 0; + unsigned int idx; + + hg_thread_mutex_lock(&d->dlock); + if (d->lestop) + goto done; + if (d->leloop == 0 && d->leadds >= d->lesize) + goto done; + idx = d->lefree; + d->lefree = (d->lefree + 1) % d->lesize; + if (d->leadds < d->lesize) + d->leadds++; + d->le[idx].file = file; + d->le[idx].line = line; + d->le[idx].func = func; + d->le[idx].msg = msg; + d->le[idx].data = data; + hg_time_get_current(&d->le[idx].time); + rv = 1; + +done: + hg_thread_mutex_unlock(&d->dlock); + return rv; +} + +#ifdef __cplusplus +} +#endif + +#endif /* MERCURY_DLOG_H */ diff --git a/src/mercury/include/mercury_event.h b/src/mercury/include/mercury_event.h new file mode 100644 index 00000000000..8be18a5c992 --- /dev/null +++ b/src/mercury/include/mercury_event.h @@ -0,0 +1,179 @@ +/* + * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy, + * UChicago Argonne, LLC and The HDF Group. + * All rights reserved. + * + * The full copyright notice, including terms governing use, modification, + * and redistribution, is contained in the COPYING file that can be + * found at the root of the source code distribution tree. + */ + +#ifndef MERCURY_EVENT_H +#define MERCURY_EVENT_H + +#include "mercury_util_config.h" + +#ifdef _WIN32 + +#else +#include +#include +#include +#if defined(HG_UTIL_HAS_SYSEVENTFD_H) +#include +#ifndef HG_UTIL_HAS_EVENTFD_T +typedef uint64_t eventfd_t; +#endif +#elif defined(HG_UTIL_HAS_SYSEVENT_H) +#include +#define HG_EVENT_IDENT 42 /* User-defined ident */ +#endif +#endif + +/** + * Purpose: define an event object that can be used as an event + * wait/notify mechanism. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Create a new event object. + * + * \return file descriptor on success or negative on failure + */ +HG_UTIL_PUBLIC int hg_event_create(void); + +/** + * Destroy an event object. + * + * \param fd [IN] event file descriptor + * + * \return Non-negative on success or negative on failure + */ +HG_UTIL_PUBLIC int hg_event_destroy(int fd); + +/** + * Notify for event. + * + * \param fd [IN] event file descriptor + * + * \return Non-negative on success or negative on failure + */ +static HG_UTIL_INLINE int hg_event_set(int fd); + +/** + * Get event notification. + * + * \param fd [IN] event file descriptor + * \param notified [IN] boolean set to HG_UTIL_TRUE if event received + * + * \return Non-negative on success or negative on failure + */ +static HG_UTIL_INLINE int hg_event_get(int fd, hg_util_bool_t *notified); + +/*---------------------------------------------------------------------------*/ +#if defined(_WIN32) +/* TODO */ +#elif defined(HG_UTIL_HAS_SYSEVENTFD_H) +#ifdef HG_UTIL_HAS_EVENTFD_T +static HG_UTIL_INLINE int +hg_event_set(int fd) +{ + return (eventfd_write(fd, 1) == 0) ? HG_UTIL_SUCCESS : HG_UTIL_FAIL; +} +#else +static HG_UTIL_INLINE int +hg_event_set(int fd) +{ + eventfd_t count = 1; + ssize_t s = write(fd, &count, sizeof(eventfd_t)); + + return (s == sizeof(eventfd_t)) ? 
HG_UTIL_SUCCESS : HG_UTIL_FAIL; +} +#endif +#elif defined(HG_UTIL_HAS_SYSEVENT_H) +static HG_UTIL_INLINE int +hg_event_set(int fd) +{ + struct kevent kev; + struct timespec timeout = {0, 0}; + int rc; + + EV_SET(&kev, HG_EVENT_IDENT, EVFILT_USER, 0, NOTE_TRIGGER, 0, NULL); + + /* Trigger user-defined event */ + rc = kevent(fd, &kev, 1, NULL, 0, &timeout); + + return (rc == -1) ? HG_UTIL_FAIL : HG_UTIL_SUCCESS; +} +#else +#error "Not supported on this platform." +#endif + +/*---------------------------------------------------------------------------*/ +#if defined(_WIN32) +#elif defined(HG_UTIL_HAS_SYSEVENTFD_H) +#ifdef HG_UTIL_HAS_EVENTFD_T +static HG_UTIL_INLINE int +hg_event_get(int fd, hg_util_bool_t *signaled) +{ + eventfd_t count = 0; + + if ((eventfd_read(fd, &count) == 0) && count) + *signaled = HG_UTIL_TRUE; + else { + if (errno == EAGAIN) + *signaled = HG_UTIL_FALSE; + else + return HG_UTIL_FAIL; + } + + return HG_UTIL_SUCCESS; +} +#else +static HG_UTIL_INLINE int +hg_event_get(int fd, hg_util_bool_t *signaled) +{ + eventfd_t count = 0; + ssize_t s = read(fd, &count, sizeof(eventfd_t)); + if ((s == sizeof(eventfd_t)) && count) + *signaled = HG_UTIL_TRUE; + else { + if (errno == EAGAIN) + *signaled = HG_UTIL_FALSE; + else + return HG_UTIL_FAIL; + } + + return HG_UTIL_SUCCESS; +} +#endif +#elif defined(HG_UTIL_HAS_SYSEVENT_H) +static HG_UTIL_INLINE int +hg_event_get(int fd, hg_util_bool_t *signaled) +{ + struct kevent kev; + int nfds; + struct timespec timeout = {0, 0}; + + /* Check user-defined event */ + nfds = kevent(fd, NULL, 0, &kev, 1, &timeout); + if (nfds == -1) + return HG_UTIL_FAIL; + + *signaled = ((nfds > 0) && (kev.ident == HG_EVENT_IDENT)) ? HG_UTIL_TRUE : HG_UTIL_FALSE; + + return HG_UTIL_SUCCESS; +} +#else +#error "Not supported on this platform." +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* MERCURY_EVENT_H */ diff --git a/src/mercury/include/mercury_hash_string.h b/src/mercury/include/mercury_hash_string.h new file mode 100644 index 00000000000..0b136ca8554 --- /dev/null +++ b/src/mercury/include/mercury_hash_string.h @@ -0,0 +1,48 @@ +/* + * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy, + * UChicago Argonne, LLC and The HDF Group. + * All rights reserved. + * + * The full copyright notice, including terms governing use, modification, + * and redistribution, is contained in the COPYING file that can be + * found at the root of the source code distribution tree. + */ + +#ifndef MERCURY_HASH_STRING_H +#define MERCURY_HASH_STRING_H + +#include "mercury_util_config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Hash function name for unique ID to register. 
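The create/set/get calls above form a simple wait/notify primitive. A short sketch of one round trip on the POSIX code paths, with error handling trimmed for brevity:

```c
#include "mercury_event.h"

/* Illustration only: one side sets the event, the other polls it. */
static void
event_roundtrip(void)
{
    hg_util_bool_t notified = HG_UTIL_FALSE;
    int fd = hg_event_create();

    if (fd < 0)
        return;
    hg_event_set(fd);            /* notify side */
    hg_event_get(fd, &notified); /* notified becomes HG_UTIL_TRUE */
    hg_event_destroy(fd);
}
```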
+ * + * \param string [IN] string name + * + * \return Non-negative ID that corresponds to string name + */ +static HG_UTIL_INLINE unsigned int +hg_hash_string(const char *string) +{ + /* This is the djb2 string hash function */ + + unsigned int result = 5381; + const unsigned char *p; + + p = (const unsigned char *)string; + + while (*p != '\0') { + result = (result << 5) + result + *p; + ++p; + } + return result; +} + +#ifdef __cplusplus +} +#endif + +#endif /* MERCURY_HASH_STRING_H */ diff --git a/src/mercury/include/mercury_hash_table.h b/src/mercury/include/mercury_hash_table.h new file mode 100644 index 00000000000..0063f020cdd --- /dev/null +++ b/src/mercury/include/mercury_hash_table.h @@ -0,0 +1,242 @@ +/* + +Copyright (c) 2005-2008, Simon Howard + +Permission to use, copy, modify, and/or distribute this software +for any purpose with or without fee is hereby granted, provided +that the above copyright notice and this permission notice appear +in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL +WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE +AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR +CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM +LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, +NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + */ + +/** + * \file mercury_hash_table.h + * + * \brief Hash table. + * + * A hash table stores a set of values which can be addressed by a + * key. Given the key, the corresponding value can be looked up + * quickly. + * + * To create a hash table, use \ref hg_hash_table_new. To destroy a + * hash table, use \ref hg_hash_table_free. + * + * To insert a value into a hash table, use \ref hg_hash_table_insert. + * + * To remove a value from a hash table, use \ref hg_hash_table_remove. + * + * To look up a value by its key, use \ref hg_hash_table_lookup. + * + * To iterate over all values in a hash table, use + * \ref hg_hash_table_iterate to initialize a \ref hg_hash_table_iter + * structure. Each value can then be read in turn using + * \ref hg_hash_table_iter_next and \ref hg_hash_table_iter_has_more. + */ + +#ifndef HG_HASH_TABLE_H +#define HG_HASH_TABLE_H + +#include "mercury_util_config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * A hash table structure. + */ + +typedef struct hg_hash_table hg_hash_table_t; + +/** + * Structure used to iterate over a hash table. + */ + +typedef struct hg_hash_table_iter hg_hash_table_iter_t; + +/** + * Internal structure representing an entry in a hash table. + */ + +typedef struct hg_hash_table_entry hg_hash_table_entry_t; + +/** + * A key to look up a value in a \ref hg_hash_table_t. + */ + +typedef void *hg_hash_table_key_t; + +/** + * A value stored in a \ref hg_hash_table_t. + */ + +typedef void *hg_hash_table_value_t; + +/** + * Definition of a \ref hg_hash_table_iter. + */ + +struct hg_hash_table_iter { + hg_hash_table_t * hash_table; + hg_hash_table_entry_t *next_entry; + unsigned int next_chain; +}; + +/** + * A null \ref HashTableValue. + */ + +#define HG_HASH_TABLE_NULL ((void *)0) + +/** + * Hash function used to generate hash values for keys used in a hash + * table. + * + * \param value The value to generate a hash value for. + * \return The hash value. 
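Since hg_hash_string() above returns exactly what the hash table interface that follows expects from a hash function over string keys, the two pair naturally. Hypothetical adapter helpers (not part of either header):

```c
#include <string.h>
#include "mercury_hash_string.h"
#include "mercury_hash_table.h"

/* Adapters that let C strings serve as table keys. */
static unsigned int
my_string_hash(hg_hash_table_key_t key)
{
    return hg_hash_string((const char *)key);
}

static int
my_string_equal(hg_hash_table_key_t k1, hg_hash_table_key_t k2)
{
    return strcmp((const char *)k1, (const char *)k2) == 0;
}
```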
+ */ + +typedef unsigned int (*hg_hash_table_hash_func_t)(hg_hash_table_key_t value); + +/** + * Function used to compare two keys for equality. + * + * \return Non-zero if the two keys are equal, zero if the keys are + * not equal. + */ + +typedef int (*hg_hash_table_equal_func_t)(hg_hash_table_key_t value1, hg_hash_table_key_t value2); + +/** + * Type of function used to free keys when entries are removed from a + * hash table. + */ + +typedef void (*hg_hash_table_key_free_func_t)(hg_hash_table_key_t value); + +/** + * Type of function used to free values when entries are removed from a + * hash table. + */ + +typedef void (*hg_hash_table_value_free_func_t)(hg_hash_table_value_t value); + +/** + * Create a new hash table. + * + * \param hash_func Function used to generate hash keys for the + * keys used in the table. + * \param equal_func Function used to test keys used in the table + * for equality. + * \return A new hash table structure, or NULL if it + * was not possible to allocate the new hash + * table. + */ +HG_UTIL_PUBLIC hg_hash_table_t *hg_hash_table_new(hg_hash_table_hash_func_t hash_func, + hg_hash_table_equal_func_t equal_func); + +/** + * Destroy a hash table. + * + * \param hash_table The hash table to destroy. + */ +HG_UTIL_PUBLIC void hg_hash_table_free(hg_hash_table_t *hash_table); + +/** + * Register functions used to free the key and value when an entry is + * removed from a hash table. + * + * \param hash_table The hash table. + * \param key_free_func Function used to free keys. + * \param value_free_func Function used to free values. + */ +HG_UTIL_PUBLIC void hg_hash_table_register_free_functions(hg_hash_table_t * hash_table, + hg_hash_table_key_free_func_t key_free_func, + hg_hash_table_value_free_func_t value_free_func); + +/** + * Insert a value into a hash table, overwriting any existing entry + * using the same key. + * + * \param hash_table The hash table. + * \param key The key for the new value. + * \param value The value to insert. + * \return Non-zero if the value was added successfully, + * or zero if it was not possible to allocate + * memory for the new entry. + */ +HG_UTIL_PUBLIC int hg_hash_table_insert(hg_hash_table_t *hash_table, hg_hash_table_key_t key, + hg_hash_table_value_t value); + +/** + * Look up a value in a hash table by key. + * + * \param hash_table The hash table. + * \param key The key of the value to look up. + * \return The value, or \ref HASH_TABLE_NULL if there + * is no value with that key in the hash table. + */ +HG_UTIL_PUBLIC hg_hash_table_value_t hg_hash_table_lookup(hg_hash_table_t * hash_table, + hg_hash_table_key_t key); + +/** + * Remove a value from a hash table. + * + * \param hash_table The hash table. + * \param key The key of the value to remove. + * \return Non-zero if a key was removed, or zero if the + * specified key was not found in the hash table. + */ +HG_UTIL_PUBLIC int hg_hash_table_remove(hg_hash_table_t *hash_table, hg_hash_table_key_t key); + +/** + * Retrieve the number of entries in a hash table. + * + * \param hash_table The hash table. + * \return The number of entries in the hash table. + */ +HG_UTIL_PUBLIC unsigned int hg_hash_table_num_entries(hg_hash_table_t *hash_table); + +/** + * Initialise a \ref HashTableIterator to iterate over a hash table. + * + * \param hash_table The hash table. + * \param iter Pointer to an iterator structure to + * initialise. 
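Tying the table calls above together with the string adapters sketched earlier (all names illustrative; keys and values are cast because the API stores void pointers):

```c
static void
table_demo(void)
{
    hg_hash_table_t *table = hg_hash_table_new(my_string_hash, my_string_equal);
    hg_hash_table_iter_t iter;

    if (table == NULL)
        return;
    hg_hash_table_insert(table, (hg_hash_table_key_t) "alpha", (hg_hash_table_value_t) "one");
    if (hg_hash_table_lookup(table, (hg_hash_table_key_t) "alpha") != HG_HASH_TABLE_NULL)
        hg_hash_table_remove(table, (hg_hash_table_key_t) "alpha");

    /* Walk whatever remains, using the iterator calls declared below. */
    hg_hash_table_iterate(table, &iter);
    while (hg_hash_table_iter_has_more(&iter))
        (void)hg_hash_table_iter_next(&iter);
    hg_hash_table_free(table);
}
```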
+ */ +HG_UTIL_PUBLIC void hg_hash_table_iterate(hg_hash_table_t *hash_table, hg_hash_table_iter_t *iter); + +/** + * Determine if there are more keys in the hash table to iterate over. + * + * \param iterator The hash table iterator. + * \return Zero if there are no more values to iterate + * over, non-zero if there are more values to + * iterate over. + */ +HG_UTIL_PUBLIC int hg_hash_table_iter_has_more(hg_hash_table_iter_t *iterator); + +/** + * Using a hash table iterator, retrieve the next key. + * + * \param iterator The hash table iterator. + * \return The next key from the hash table, or + * \ref HG_HASH_TABLE_NULL if there are no more + * keys to iterate over. + */ +HG_UTIL_PUBLIC hg_hash_table_value_t hg_hash_table_iter_next(hg_hash_table_iter_t *iterator); + +#ifdef __cplusplus +} +#endif + +#endif /* HG_HASH_TABLE_H */ diff --git a/src/mercury/include/mercury_header.h b/src/mercury/include/mercury_header.h new file mode 100644 index 00000000000..801ec69d806 --- /dev/null +++ b/src/mercury/include/mercury_header.h @@ -0,0 +1,138 @@ +/* + * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy, + * UChicago Argonne, LLC and The HDF Group. + * All rights reserved. + * + * The full copyright notice, including terms governing use, modification, + * and redistribution, is contained in the COPYING file that can be + * found at the root of the source code distribution tree. + */ + +#ifndef MERCURY_HEADER_H +#define MERCURY_HEADER_H + +#include "mercury_core_types.h" + +/*************************************/ +/* Public Type and Struct Definition */ +/*************************************/ + +#if defined(__GNUC__) || defined(_WIN32) +#pragma pack(push, 1) +#else +#warning "Proc header struct padding may not be consistent across platforms." +#endif +#ifdef HG_HAS_CHECKSUMS +struct hg_header_hash { + hg_uint32_t payload; /* Payload checksum (32-bits checksum) */ +}; +#endif + +struct hg_header_input { +#ifdef HG_HAS_CHECKSUMS + struct hg_header_hash hash; /* Hash */ +#else + hg_uint32_t pad; +#endif + /* 160 bits here */ +}; + +struct hg_header_output { +#ifdef HG_HAS_CHECKSUMS + struct hg_header_hash hash; /* Hash */ +#endif + hg_uint32_t pad; + /* 128/64 bits here */ +}; +#if defined(__GNUC__) || defined(_WIN32) +#pragma pack(pop) +#endif + +/* Common header struct input/output */ +struct hg_header { + union { + struct hg_header_input input; + struct hg_header_output output; + } msg; /* Header message */ + hg_op_t op; /* Header operation type */ +}; + +/*****************/ +/* Public Macros */ +/*****************/ + +/*********************/ +/* Public Prototypes */ +/*********************/ + +#ifdef __cplusplus +extern "C" { +#endif + +static HG_INLINE size_t hg_header_get_size(hg_op_t op); + +/** + * Get size reserved for header (separate user data stored in payload). + * + * \return Non-negative size value + */ +static HG_INLINE size_t +hg_header_get_size(hg_op_t op) +{ + hg_size_t ret = 0; + + switch (op) { + case HG_INPUT: + ret = sizeof(struct hg_header_input); + break; + case HG_OUTPUT: + ret = sizeof(struct hg_header_output); + break; + default: + break; + } + + return ret; +} + +/** + * Initialize RPC header. + * + * \param hg_header [IN/OUT] pointer to header structure + * \param op [IN] HG operation type: HG_INPUT / HG_OUTPUT + */ +HG_PRIVATE void hg_header_init(struct hg_header *hg_header, hg_op_t op); + +/** + * Finalize RPC header. 
+ * + * \param hg_header [IN/OUT] pointer to header structure + */ +HG_PRIVATE void hg_header_finalize(struct hg_header *hg_header); + +/** + * Reset RPC header. + * + * \param hg_header [IN/OUT] pointer to header structure + * \param op [IN] HG operation type: HG_INPUT / HG_OUTPUT + */ +HG_PRIVATE void hg_header_reset(struct hg_header *hg_header, hg_op_t op); + +/** + * Process private information for sending/receiving RPC. + * + * \param op [IN] operation type: HG_ENCODE / HG_DECODE + * \param buf [IN/OUT] buffer + * \param buf_size [IN] buffer size + * \param hg_header [IN/OUT] pointer to header structure + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PRIVATE hg_return_t hg_header_proc(hg_proc_op_t op, void *buf, size_t buf_size, + struct hg_header *hg_header); + +#ifdef __cplusplus +} +#endif + +#endif /* MERCURY_HEADER_H */ diff --git a/src/mercury/include/mercury_hl.h b/src/mercury/include/mercury_hl.h new file mode 100644 index 00000000000..c6d5b100f72 --- /dev/null +++ b/src/mercury/include/mercury_hl.h @@ -0,0 +1,135 @@ +/* + * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy, + * UChicago Argonne, LLC and The HDF Group. + * All rights reserved. + * + * The full copyright notice, including terms governing use, modification, + * and redistribution, is contained in the COPYING file that can be + * found at the root of the source code distribution tree. + */ + +#ifndef MERCURY_HL_H +#define MERCURY_HL_H + +#include "mercury.h" +#include "mercury_bulk.h" +#include "mercury_request.h" + +/*****************/ +/* Public Macros */ +/*****************/ + +/** + * Define macros so that default classes/contexts can be easily renamed + * if we ever need to. Users should use macros and not global variables + * directly. + */ +#define HG_CLASS_DEFAULT hg_class_default_g +#define HG_CONTEXT_DEFAULT hg_context_default_g +#define HG_REQUEST_CLASS_DEFAULT hg_request_class_default_g + +#ifdef __cplusplus +extern "C" { +#endif + +/********************/ +/* Public Variables */ +/********************/ + +/* HG default */ +extern HG_PUBLIC hg_class_t *HG_CLASS_DEFAULT; +extern HG_PUBLIC hg_context_t *HG_CONTEXT_DEFAULT; +extern HG_PUBLIC hg_request_class_t *HG_REQUEST_CLASS_DEFAULT; + +/*********************/ +/* Public Prototypes */ +/*********************/ + +/** + * Initialize Mercury high-level layer and create default classes/contexts. + * If no info_string is passed, the HG HL layer will attempt to initialize + * NA by using the value contained in the environment variable called + * MERCURY_PORT_NAME. + * \remark HG_Hl_finalize() is registered with atexit() so that default + * classes/contexts are freed at process termination. + * + * \param na_info_string [IN] host address with port number (e.g., + * "tcp://localhost:3344" or + * "bmi+tcp://localhost:3344") + * \param na_listen [IN] listen for incoming connections + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t HG_Hl_init(const char *na_info_string, hg_bool_t na_listen); + +/** + * Initialize Mercury high-level layer with options provided by init_info. + * Must be finalized with HG_Hl_finalize(). + * \remark HG_Hl_finalize() is registered with atexit() so that default + * classes/contexts are freed at process termination. + * \remark HG_Hl_init_opt() may become HG_Hl_init() in the future. 
+ * + * \param na_info_string [IN] host address with port number (e.g., + * "tcp://localhost:3344" or + * "bmi+tcp://localhost:3344") + * \param na_listen [IN] listen for incoming connections + * \param hg_init_info [IN] (Optional) HG init info, NULL if no info + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t HG_Hl_init_opt(const char *na_info_string, hg_bool_t na_listen, + const struct hg_init_info *hg_init_info); + +/** + * Finalize Mercury high-level layer. + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t HG_Hl_finalize(void); + +/** + * Lookup an address and wait for its completion. Address must be freed + * using HG_Addr_free(). + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t HG_Hl_addr_lookup_wait(hg_context_t *context, hg_request_class_t *request_class, + const char *name, hg_addr_t *addr, unsigned int timeout); + +/** + * Forward a call and wait for its completion. A HG handle must have been + * previously created. Output can be queried using HG_Get_output() and freed + * using HG_Free_output(). + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t HG_Hl_forward_wait(hg_request_class_t *request_class, hg_handle_t handle, + void *in_struct, unsigned int timeout); + +/** + * Initiate a bulk data transfer and wait for its completion. + * + * \param context [IN] pointer to HG context + * \param op [IN] transfer operation: + * - HG_BULK_PUSH + * - HG_BULK_PULL + * \param origin_addr [IN] abstract address of origin + * \param origin_handle [IN] abstract bulk handle + * \param origin_offset [IN] offset + * \param local_handle [IN] abstract bulk handle + * \param local_offset [IN] offset + * \param size [IN] size of data to be transferred + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t HG_Hl_bulk_transfer_wait(hg_context_t *context, hg_request_class_t *request_class, + hg_bulk_op_t op, hg_addr_t origin_addr, + hg_bulk_t origin_handle, hg_size_t origin_offset, + hg_bulk_t local_handle, hg_size_t local_offset, hg_size_t size, + unsigned int timeout); + +#ifdef __cplusplus +} +#endif + +#endif /* MERCURY_HL_H */ diff --git a/src/mercury/include/mercury_hl_macros.h b/src/mercury/include/mercury_hl_macros.h new file mode 100644 index 00000000000..6c9135b3d5b --- /dev/null +++ b/src/mercury/include/mercury_hl_macros.h @@ -0,0 +1,384 @@ +/* + * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy, + * UChicago Argonne, LLC and The HDF Group. + * All rights reserved. + * + * The full copyright notice, including terms governing use, modification, + * and redistribution, is contained in the COPYING file that can be + * found at the root of the source code distribution tree. + */ + +#ifndef MERCURY_HL_MACROS_H +#define MERCURY_HL_MACROS_H + +#include "mercury_hl.h" +#include "mercury_macros.h" + +/** + * The purpose of these macros is to generate boilerplate code in order + * to send and execute HG RPC calls. + * Since these macros make use of the mercury high-level interface, applications + * using these macros must link to the mercury_hl library. + * HG_XXX macros are private macros / MERCURY_XXX are public macros. 
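A hedged sketch of the high-level calls declared above, before the macro machinery that follows; the NA info string and the millisecond interpretation of the timeout argument are assumptions:

```c
#include "mercury_hl.h"

/* Initialize the default class/context, then resolve a target address.
 * The caller must release *addr_p with HG_Addr_free() when done. */
static hg_return_t
hl_lookup_target(const char *target_name, hg_addr_t *addr_p)
{
    hg_return_t ret;

    ret = HG_Hl_init("tcp://localhost:3344", HG_FALSE);
    if (ret != HG_SUCCESS)
        return ret;

    return HG_Hl_addr_lookup_wait(HG_CONTEXT_DEFAULT, HG_REQUEST_CLASS_DEFAULT,
                                  target_name, addr_p, 1000 /* assumed ms */);
}
```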
+ * Macros defined in this file are: + * - MERCURY_GEN_LOG_MESSAGE + * - MERCURY_GEN_RPC_STUB + * - MERCURY_GEN_CALLBACK_STUB + */ + +/****************/ +/* Local Macros */ +/****************/ + +/* Return parameter with fixed name */ +#define HG_GEN_RET_PARAM(ret_type) ((ret_type)(ret)) + +/* Generate ((param) (datai)) element */ +#define HG_GEN_PARAM_NAME(r, prefix, i, param) ((param)(BOOST_PP_CAT(prefix, i))) + +/* Generate parameter names and ((type) (name)) sequence */ +#define HG_GEN_PARAM_NAME_SEQ(prefix, type_seq) BOOST_PP_SEQ_FOR_EACH_I(HG_GEN_PARAM_NAME, prefix, type_seq) + +/* Extract parameter (type name) element */ +#define HG_GEN_DECL_FUNC_PARAM(r, is_ref, param) \ + (HG_GEN_GET_TYPE(param) BOOST_PP_IF(is_ref, *, BOOST_PP_EMPTY()) HG_GEN_GET_NAME(param)) + +/* Extract (type name) sequence */ +#define HG_GEN_DECL_FUNC_PARAM_SEQ(is_ref, param_seq) \ + BOOST_PP_SEQ_FOR_EACH(HG_GEN_DECL_FUNC_PARAM, is_ref, param_seq) + +/* Extract function parameter declarations */ +#define HG_GEN_DECL_FUNC_PARAMS(with_input, in_params, extra_in_params, with_output, out_params, \ + extra_out_params) \ + BOOST_PP_SEQ_TO_TUPLE(BOOST_PP_IF( \ + BOOST_PP_OR(with_input, with_output), \ + HG_GEN_DECL_FUNC_PARAM_SEQ(0, in_params) HG_GEN_DECL_FUNC_PARAM_SEQ(0, extra_in_params) \ + HG_GEN_DECL_FUNC_PARAM_SEQ(1, out_params) HG_GEN_DECL_FUNC_PARAM_SEQ(1, extra_out_params), \ + (void))) + +/* Extract parameter (get_name(param)) element */ +#define HG_GEN_FUNC_PARAM(r, is_ref, param) (BOOST_PP_IF(is_ref, &, BOOST_PP_EMPTY()) HG_GEN_GET_NAME(param)) + +/* Extract (name) sequence */ +#define HG_GEN_FUNC_PARAM_SEQ(is_ref, param_seq) BOOST_PP_SEQ_FOR_EACH(HG_GEN_FUNC_PARAM, is_ref, param_seq) + +/* Extract function parameters */ +#define HG_GEN_FUNC_PARAMS(with_input, in_params, extra_in_params, with_output, out_params, \ + extra_out_params) \ + BOOST_PP_SEQ_TO_TUPLE( \ + BOOST_PP_IF(BOOST_PP_OR(with_input, with_output), \ + HG_GEN_FUNC_PARAM_SEQ(0, in_params) HG_GEN_FUNC_PARAM_SEQ(0, extra_in_params) \ + HG_GEN_FUNC_PARAM_SEQ(1, out_params) HG_GEN_FUNC_PARAM_SEQ(1, extra_out_params), \ + ())) + +/* Generate declaration of parameters --> type name; */ +#define HG_GEN_DECL_PARAMS(param_seq) BOOST_PP_SEQ_FOR_EACH(HG_GEN_STRUCT_FIELD, , param_seq) + +/* Assign param to struct field ( e.g., struct_name.param_1 = param_1; ) */ +#define HG_SET_STRUCT_PARAM(r, struct_name, param) \ + struct_name.HG_GEN_GET_NAME(param) = HG_GEN_GET_NAME(param); + +/* Assign param ((type) (name)) sequence to struct_name */ +#define HG_SET_STRUCT_PARAMS(struct_name, params) \ + BOOST_PP_SEQ_FOR_EACH(HG_SET_STRUCT_PARAM, struct_name, params) + +/* Assign struct_name field to param ( e.g., param_1 = struct_name.param_1; ) */ +#define HG_GET_STRUCT_PARAM(r, struct_name, param) \ + HG_GEN_GET_NAME(param) = struct_name.HG_GEN_GET_NAME(param); + +/* Assign struct_name fields to param ((type) (name)) sequence */ +#define HG_GET_STRUCT_PARAMS(struct_name, params) \ + BOOST_PP_SEQ_FOR_EACH(HG_GET_STRUCT_PARAM, struct_name, params) + +/* Assign struct_name field to out param ( e.g., *param_1 = struct_name.param_1; + * ) */ +#define HG_GET_OUT_STRUCT_PARAM(r, struct_name, param) \ + *HG_GEN_GET_NAME(param) = struct_name.HG_GEN_GET_NAME(param); + +/* Assign struct_name fields to out parame ((type) (name)) sequence */ +#define HG_GET_OUT_STRUCT_PARAMS(struct_name, params) \ + BOOST_PP_SEQ_FOR_EACH(HG_GET_OUT_STRUCT_PARAM, struct_name, params) + +/** + * Get/free output boilerplate code + */ + +/* Get output */ +#define HG_GET_OUTPUT(with_ret, ret_fail) 
\ + hg_ret = HG_Get_output(handle, &out_struct); \ + if (hg_ret != HG_SUCCESS) { \ + BOOST_PP_IF(with_ret, ret = ret_fail;, BOOST_PP_EMPTY()) \ + goto done; \ + } + +/* Free output */ +#define HG_FREE_OUTPUT(with_ret, ret_fail) \ + hg_ret = HG_Free_output(handle, &out_struct); \ + if (hg_ret != HG_SUCCESS) { \ + BOOST_PP_IF(with_ret, ret = ret_fail;, BOOST_PP_EMPTY()) \ + goto done; \ + } + +/** + * Bulk data support boilerplate code + */ + +/* Extra input parameters for bulk data */ +#define HG_BULK_CONST_BUF ((const void *)(bulk_buf)) +#define HG_BULK_BUF ((void *)(bulk_buf)) +#define HG_BULK_COUNT ((hg_uint64_t)(bulk_count)) +#define HG_BULK_EXTRA_IN_PARAM HG_BULK_BUF HG_BULK_COUNT + +/* Bulk handle parameter */ +#define HG_BULK_PARAM ((hg_bulk_t)(bulk_handle)) + +/* Local bulk handle parameter */ +#define HG_BULK_LOCAL_PARAM ((hg_bulk_t)(local_bulk_handle)) + +/* Create bulk handle */ +#define HG_BULK_REGISTER(handle, bulk_handle, with_ret, fail_ret, bulk_read) \ + hg_ret = HG_Bulk_create(HG_Get_info(handle)->hg_bulk_class, 1, \ + &HG_GEN_GET_NAME(BOOST_PP_SEQ_HEAD(HG_BULK_BUF)), \ + &HG_GEN_GET_NAME(BOOST_PP_SEQ_HEAD(HG_BULK_COUNT)), \ + BOOST_PP_IF(bulk_read, HG_BULK_READ_ONLY, HG_BULK_READWRITE), &bulk_handle); \ + if (hg_ret != HG_SUCCESS) { \ + BOOST_PP_IF(with_ret, ret = fail_ret;, BOOST_PP_EMPTY()) \ + goto done; \ + } + +/* Free bulk handle */ +#define HG_BULK_FREE(bulk_handle, with_ret, fail_ret) \ + hg_ret = HG_Bulk_free(bulk_handle); \ + if (hg_ret != HG_SUCCESS) { \ + BOOST_PP_IF(with_ret, ret = fail_ret;, BOOST_PP_EMPTY()) \ + goto done; \ + } + +/* Declare variables required for bulk transfers */ +#define HG_GEN_DECL_BULK_PARAMS HG_GEN_DECL_PARAMS(HG_BULK_PARAM HG_BULK_LOCAL_PARAM HG_BULK_EXTRA_IN_PARAM) + +/* Allocate memory and create local bulk handle */ +#define HG_BULK_LOCAL_ALLOCATE(origin_bulk_handle, local_bulk_handle) \ + HG_GEN_GET_NAME(BOOST_PP_SEQ_HEAD(HG_BULK_COUNT)) = HG_Bulk_get_size(origin_bulk_handle); \ + HG_GEN_GET_NAME(BOOST_PP_SEQ_HEAD(HG_BULK_BUF)) = \ + malloc(HG_GEN_GET_NAME(BOOST_PP_SEQ_HEAD(HG_BULK_COUNT))); \ + HG_Bulk_create(HG_Get_info(handle)->hg_bulk_class, 1, &HG_GEN_GET_NAME(BOOST_PP_SEQ_HEAD(HG_BULK_BUF)), \ + &HG_GEN_GET_NAME(BOOST_PP_SEQ_HEAD(HG_BULK_COUNT)), HG_BULK_READWRITE, \ + &local_bulk_handle); + +/* Free memory and local handle */ +#define HG_BULK_LOCAL_FREE(local_bulk_handle) \ + hg_ret = HG_Bulk_free(local_bulk_handle); \ + if (hg_ret != HG_SUCCESS) { \ + goto done; \ + } \ + free(HG_GEN_GET_NAME(BOOST_PP_SEQ_HEAD(HG_BULK_BUF))); + +/* Transfer bulk data using origin/local bulk handles (pull or push) */ +#define HG_BULK_TRANSFER(handle, origin_bulk_handle, local_bulk_handle, bulk_read) \ + hg_ret = HG_Hl_bulk_transfer_wait( \ + HG_Get_info(handle)->bulk_context, BOOST_PP_IF(bulk_read, HG_BULK_PULL, HG_BULK_PUSH), \ + HG_Get_info(handle)->addr, HG_Get_info(handle)->target_id, origin_bulk_handle, 0, local_bulk_handle, \ + 0, HG_GEN_GET_NAME(BOOST_PP_SEQ_HEAD(HG_BULK_COUNT))); \ + if (hg_ret != HG_SUCCESS) { \ + goto done; \ + } + +/*****************/ +/* Public Macros */ +/*****************/ + +/** + * Advanced BOOST macros: + * - MERCURY_GEN_RPC_STUB + * - MERCURY_GEN_CALLBACK_STUB + */ + +/* Custom function that applications can define for log purposes (none by + * default) */ +#ifndef MERCURY_GEN_LOG_MESSAGE +#define MERCURY_GEN_LOG_MESSAGE(x) +#endif + +/* Booleans for MERCURY_GEN_MACROS */ +#define MERCURY_GEN_FALSE 0 +#define MERCURY_GEN_TRUE 1 + +/* Generate RPC stub */ +#define MERCURY_GEN_RPC_STUB(gen_func_name, 
func_name, with_ret, ret_type_name, ret_fail, with_input, \ + in_struct_type_name, in_params, with_output, out_struct_type_name, out_params, \ + with_bulk, bulk_read) \ + BOOST_PP_IF(with_ret, ret_type_name, void) \ + gen_func_name HG_GEN_DECL_FUNC_PARAMS(with_input, in_params, \ + BOOST_PP_IF(with_bulk, HG_BULK_EXTRA_IN_PARAM, BOOST_PP_EMPTY()), \ + with_output, out_params, ) \ + { \ + BOOST_PP_IF(with_input, in_struct_type_name in_struct;, BOOST_PP_EMPTY()) \ + BOOST_PP_IF(BOOST_PP_OR(with_output, with_ret), out_struct_type_name out_struct;, BOOST_PP_EMPTY()) \ + BOOST_PP_IF(with_ret, ret_type_name ret;, BOOST_PP_EMPTY()) \ + hg_id_t id; \ + hg_handle_t handle; \ + BOOST_PP_IF(with_bulk, HG_GEN_DECL_PARAMS(HG_BULK_PARAM), BOOST_PP_EMPTY()) \ + hg_bool_t func_registered; \ + hg_return_t hg_ret; \ + \ + /* Init stack if not initialized */ \ + HG_Hl_init(NULL, 0); \ + \ + /* Check whether call has already been registered or not */ \ + HG_Registered_rpc(HG_CLASS_DEFAULT, BOOST_PP_STRINGIZE(func_name), &func_registered, &id); \ + if (!func_registered) { \ + id = MERCURY_REGISTER( \ + HG_CLASS_DEFAULT, BOOST_PP_STRINGIZE(func_name), \ + BOOST_PP_IF(with_input, in_struct_type_name, void), \ + BOOST_PP_IF(BOOST_PP_OR(with_output, with_ret), out_struct_type_name, void), NULL); \ + } \ + \ + /* Create HG handle */ \ + hg_ret = HG_Create(HG_CLASS_DEFAULT, HG_CONTEXT_DEFAULT, NA_ADDR_DEFAULT, id, &handle); \ + if (hg_ret != HG_SUCCESS) { \ + BOOST_PP_IF(with_ret, ret = ret_fail;, BOOST_PP_EMPTY()) \ + goto done; \ + } \ + \ + /* Create bulk handle */ \ + BOOST_PP_IF(with_bulk, \ + HG_BULK_REGISTER(handle, HG_GEN_GET_NAME(BOOST_PP_SEQ_HEAD(HG_BULK_PARAM)), with_ret, \ + ret_fail, bulk_read), \ + BOOST_PP_EMPTY()) \ + \ + /* Fill input structure */ \ + BOOST_PP_IF(with_input, \ + HG_SET_STRUCT_PARAMS(in_struct, \ + in_params BOOST_PP_IF(with_bulk, HG_BULK_PARAM, BOOST_PP_EMPTY())), \ + BOOST_PP_EMPTY()) \ + \ + /* Forward call to default target */ \ + hg_ret = HG_Hl_forward_wait(handle, BOOST_PP_IF(with_input, &in_struct, NULL)); \ + if (hg_ret != HG_SUCCESS) { \ + BOOST_PP_IF(with_ret, ret = ret_fail;, BOOST_PP_EMPTY()) \ + goto done; \ + } \ + \ + /* Free bulk handle */ \ + BOOST_PP_IF(with_bulk, \ + HG_BULK_FREE(HG_GEN_GET_NAME(BOOST_PP_SEQ_HEAD(HG_BULK_PARAM)), with_ret, ret_fail), \ + BOOST_PP_EMPTY()) \ + \ + /* Get output */ \ + BOOST_PP_IF(BOOST_PP_OR(with_output, with_ret), HG_GET_OUTPUT(with_ret, ret_fail), BOOST_PP_EMPTY()) \ + \ + /* Get output parameters */ \ + BOOST_PP_IF(with_ret, HG_GET_STRUCT_PARAMS(out_struct, ((ret_type)(ret))), BOOST_PP_EMPTY()) \ + BOOST_PP_IF(with_output, HG_GET_OUT_STRUCT_PARAMS(out_struct, out_params), BOOST_PP_EMPTY()) \ + \ + /* Free output */ \ + BOOST_PP_IF(BOOST_PP_OR(with_output, with_ret), HG_FREE_OUTPUT(with_ret, ret_fail), \ + BOOST_PP_EMPTY()) \ + \ + /* Destroy handle */ \ + hg_ret = HG_Destroy(handle); \ + if (hg_ret != HG_SUCCESS) { \ + BOOST_PP_IF(with_ret, ret = ret_fail;, BOOST_PP_EMPTY()) \ + goto done; \ + } \ + \ +done: \ + \ + return BOOST_PP_IF(with_ret, ret, BOOST_PP_EMPTY()); \ + } + +/* Generate callback stub */ +#define MERCURY_GEN_CALLBACK_STUB(gen_func_name, func_name, with_ret, ret_type, with_input, \ + in_struct_type_name, in_params, with_output, out_struct_type_name, \ + out_params, with_bulk, bulk_read, with_thread, thread_pool) \ + static BOOST_PP_IF(with_thread, HG_THREAD_RETURN_TYPE BOOST_PP_CAT(gen_func_name, _thread), \ + hg_return_t gen_func_name)(BOOST_PP_IF(with_thread, void *arg, hg_handle_t handle)) \ + { \ + 
BOOST_PP_IF(with_thread, hg_handle_t handle = (hg_handle_t)arg; \ + hg_thread_ret_t thread_ret = (hg_thread_ret_t)0;, BOOST_PP_EMPTY()) \ + hg_return_t hg_ret = HG_SUCCESS; \ + BOOST_PP_IF(with_input, in_struct_type_name in_struct;, BOOST_PP_EMPTY()) \ + BOOST_PP_IF(BOOST_PP_OR(with_output, with_ret), out_struct_type_name out_struct;, BOOST_PP_EMPTY()) \ + BOOST_PP_IF(with_input, HG_GEN_DECL_PARAMS(in_params), BOOST_PP_EMPTY()) \ + BOOST_PP_IF(with_output, HG_GEN_DECL_PARAMS(out_params), BOOST_PP_EMPTY()) \ + BOOST_PP_IF(with_ret, ret_type ret;, BOOST_PP_EMPTY()) \ + BOOST_PP_IF(with_bulk, HG_GEN_DECL_BULK_PARAMS, BOOST_PP_EMPTY()) \ + \ + /* Get input */ \ + BOOST_PP_IF( \ + with_input, hg_ret = HG_Get_input(handle, &in_struct); \ + if (hg_ret != HG_SUCCESS) { goto done; } \ + \ + /* Get parameters */ \ + HG_GET_STRUCT_PARAMS(in_struct, \ + in_params BOOST_PP_IF(with_bulk, HG_BULK_PARAM, BOOST_PP_EMPTY())), \ + BOOST_PP_EMPTY()) \ + \ + /* Allocate bulk handle */ \ + BOOST_PP_IF(with_bulk, \ + HG_BULK_LOCAL_ALLOCATE(HG_GEN_GET_NAME(BOOST_PP_SEQ_HEAD(HG_BULK_PARAM)), \ + HG_GEN_GET_NAME(BOOST_PP_SEQ_HEAD(HG_BULK_LOCAL_PARAM))), \ + BOOST_PP_EMPTY()) \ + BOOST_PP_IF(with_bulk, \ + BOOST_PP_IF(bulk_read, \ + HG_BULK_TRANSFER(handle, HG_GEN_GET_NAME(BOOST_PP_SEQ_HEAD(HG_BULK_PARAM)), \ + HG_GEN_GET_NAME(BOOST_PP_SEQ_HEAD(HG_BULK_LOCAL_PARAM)), \ + bulk_read), \ + BOOST_PP_EMPTY()), \ + BOOST_PP_EMPTY()) \ + \ + /* Call function */ \ + MERCURY_GEN_LOG_MESSAGE(BOOST_PP_STRINGIZE(func_name)); \ + BOOST_PP_IF(with_ret, ret =, BOOST_PP_EMPTY()) \ + func_name HG_GEN_FUNC_PARAMS(with_input, in_params, \ + BOOST_PP_IF(with_bulk, HG_BULK_EXTRA_IN_PARAM, BOOST_PP_EMPTY()), \ + with_output, out_params, ); \ + \ + BOOST_PP_IF(with_bulk, \ + BOOST_PP_IF(bulk_read, BOOST_PP_EMPTY(), \ + HG_BULK_TRANSFER(handle, HG_GEN_GET_NAME(BOOST_PP_SEQ_HEAD(HG_BULK_PARAM)), \ + HG_GEN_GET_NAME(BOOST_PP_SEQ_HEAD(HG_BULK_LOCAL_PARAM)), \ + bulk_read)), \ + BOOST_PP_EMPTY()) \ + \ + /* Free bulk handle */ \ + BOOST_PP_IF(with_bulk, HG_BULK_LOCAL_FREE(HG_GEN_GET_NAME(BOOST_PP_SEQ_HEAD(HG_BULK_LOCAL_PARAM))), \ + BOOST_PP_EMPTY()) \ + \ + /* Fill output structure */ \ + BOOST_PP_IF(with_ret, HG_SET_STRUCT_PARAMS(out_struct, ((ret_type)(ret))), BOOST_PP_EMPTY()) \ + BOOST_PP_IF(with_output, HG_SET_STRUCT_PARAMS(out_struct, out_params), BOOST_PP_EMPTY()) \ + \ + /* Respond back */ \ + hg_ret = HG_Respond(handle, NULL, NULL, \ + BOOST_PP_IF(BOOST_PP_OR(with_output, with_ret), &out_struct, NULL)); \ + if (hg_ret != HG_SUCCESS) { \ + goto done; \ + } \ + \ + /* Free input */ \ + BOOST_PP_IF( \ + with_input, hg_ret = HG_Free_input(handle, &in_struct); \ + if (hg_ret != HG_SUCCESS) { goto done; }, BOOST_PP_EMPTY()) \ + \ + /* Destroy handle */ \ + hg_ret = HG_Destroy(handle); \ + if (hg_ret != HG_SUCCESS) { \ + goto done; \ + } \ + \ +done: \ + \ + BOOST_PP_IF(with_thread, return thread_ret;, return hg_ret;) \ + } \ + BOOST_PP_IF( \ + with_thread, \ + static hg_return_t gen_func_name(hg_handle_t handle) { \ + hg_return_t ret = HG_SUCCESS; \ + hg_thread_pool_post(thread_pool, &BOOST_PP_CAT(gen_func_name, _thread), handle); \ + return ret; \ + }, \ + BOOST_PP_EMPTY()) + +#endif /* MERCURY_HL_MACROS_H */ diff --git a/src/mercury/include/mercury_list.h b/src/mercury/include/mercury_list.h new file mode 100644 index 00000000000..18ce93af8d3 --- /dev/null +++ b/src/mercury/include/mercury_list.h @@ -0,0 +1,117 @@ +/* + * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy, + * UChicago Argonne, LLC and The HDF 
Group. + * All rights reserved. + * + * The full copyright notice, including terms governing use, modification, + * and redistribution, is contained in the COPYING file that can be + * found at the root of the source code distribution tree. + */ + +/* Code below is derived from sys/queue.h which follows the below notice: + * + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)queue.h 8.5 (Berkeley) 8/20/94 + */ + +#ifndef MERCURY_LIST_H +#define MERCURY_LIST_H + +#define HG_LIST_HEAD_INITIALIZER(name) \ + { \ + NULL \ + } + +#define HG_LIST_HEAD_INIT(struct_head_name, var_name) \ + struct struct_head_name var_name = HG_LIST_HEAD_INITIALIZER(var_name) + +#define HG_LIST_HEAD_DECL(struct_head_name, struct_entry_name) \ + struct struct_head_name { \ + struct struct_entry_name *head; \ + } + +#define HG_LIST_HEAD(struct_entry_name) \ + struct { \ + struct struct_entry_name *head; \ + } + +#define HG_LIST_ENTRY(struct_entry_name) \ + struct { \ + struct struct_entry_name * next; \ + struct struct_entry_name **prev; \ + } + +#define HG_LIST_INIT(head_ptr) \ + do { \ + (head_ptr)->head = NULL; \ + } while (/*CONSTCOND*/ 0) + +#define HG_LIST_IS_EMPTY(head_ptr) ((head_ptr)->head == NULL) + +#define HG_LIST_FIRST(head_ptr) ((head_ptr)->head) + +#define HG_LIST_NEXT(entry_ptr, entry_field_name) ((entry_ptr)->entry_field_name.next) + +#define HG_LIST_INSERT_AFTER(list_entry_ptr, entry_ptr, entry_field_name) \ + do { \ + if (((entry_ptr)->entry_field_name.next = (list_entry_ptr)->entry_field_name.next) != NULL) \ + (list_entry_ptr)->entry_field_name.next->entry_field_name.prev = \ + &(entry_ptr)->entry_field_name.next; \ + (list_entry_ptr)->entry_field_name.next = (entry_ptr); \ + (entry_ptr)->entry_field_name.prev = &(list_entry_ptr)->entry_field_name.next; \ + } while (/*CONSTCOND*/ 0) + +#define HG_LIST_INSERT_BEFORE(list_entry_ptr, entry_ptr, entry_field_name) \ + do { \ + (entry_ptr)->entry_field_name.prev = (list_entry_ptr)->entry_field_name.prev; \ + (entry_ptr)->entry_field_name.next = (list_entry_ptr); \ + *(list_entry_ptr)->entry_field_name.prev = (entry_ptr); \ + (list_entry_ptr)->entry_field_name.prev = &(entry_ptr)->entry_field_name.next; \ + } while (/*CONSTCOND*/ 0) + +#define HG_LIST_INSERT_HEAD(head_ptr, entry_ptr, entry_field_name) \ + do { \ + if (((entry_ptr)->entry_field_name.next = (head_ptr)->head) != NULL) \ + (head_ptr)->head->entry_field_name.prev = &(entry_ptr)->entry_field_name.next; \ + (head_ptr)->head = (entry_ptr); \ + (entry_ptr)->entry_field_name.prev = &(head_ptr)->head; \ + } while (/*CONSTCOND*/ 0) + +/* TODO would be nice to not have any condition */ +#define HG_LIST_REMOVE(entry_ptr, entry_field_name) \ + do { \ + if ((entry_ptr)->entry_field_name.next != NULL) \ + (entry_ptr)->entry_field_name.next->entry_field_name.prev = (entry_ptr)->entry_field_name.prev; \ + *(entry_ptr)->entry_field_name.prev = (entry_ptr)->entry_field_name.next; \ + } while (/*CONSTCOND*/ 0) + +#define HG_LIST_FOREACH(var, head_ptr, entry_field_name) \ + for ((var) = ((head_ptr)->head); (var); (var) = ((var)->entry_field_name.next)) + +#endif /* MERCURY_LIST_H */ diff --git a/src/mercury/include/mercury_log.h b/src/mercury/include/mercury_log.h new file mode 100644 index 00000000000..bb1b52fc209 --- /dev/null +++ b/src/mercury/include/mercury_log.h @@ -0,0 +1,399 @@ +/* + * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy, + * UChicago Argonne, LLC and The HDF Group. + * All rights reserved. + * + * The full copyright notice, including terms governing use, modification, + * and redistribution, is contained in the COPYING file that can be + * found at the root of the source code distribution tree. + */ + +/* + * Copyright (c) 2004, 2005, 2006, 2007 David Young. All rights reserved. + * + * Copyright (c) 2004 Urbana-Champaign Independent Media Center. + * All rights reserved. 
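The list macros above implement an intrusive doubly linked list in the sys/queue.h style: linkage is embedded in the element, and removal never needs the list head. A short sketch with a hypothetical element type:

```c
#include "mercury_list.h"

/* Hypothetical element type with embedded linkage. */
struct item {
    int value;
    HG_LIST_ENTRY(item) link;
};

HG_LIST_HEAD_DECL(item_list, item);

static void
list_demo(struct item *a, struct item *b)
{
    struct item_list list = HG_LIST_HEAD_INITIALIZER(list);
    struct item *cur;

    HG_LIST_INSERT_HEAD(&list, a, link);
    HG_LIST_INSERT_AFTER(a, b, link); /* list is now a -> b */
    HG_LIST_FOREACH(cur, &list, link)
        cur->value++;
    HG_LIST_REMOVE(b, link); /* no head pointer needed */
}
```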
+ * + * + * Portions of hlog are Copyright (c) David Young. The applicable copyright + * notice and licensing terms are reproduced here: + * + * Copyright (c) 2004, 2005, 2006, 2007 David Young. All rights reserved. + * + * This file contains code contributed by David Young. + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY DAVID YOUNG ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL DAVID + * YOUNG BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY + * OF SUCH DAMAGE. + * + * ----------------------------------------------------------------------------- + * ----------------------------------------------------------------------------- + * + * Portions of hlog are Copyright (c) Urbana-Champaign Independent Media Center. + * The applicable copyright notice and licensing terms are reproduced here: + * + * Copyright (c) 2004 Urbana-Champaign Independent Media Center. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE URBANA-CHAMPAIGN INDEPENDENT + * MEDIA CENTER ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE URBANA-CHAMPAIGN INDEPENDENT + * MEDIA CENTER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#ifndef MERCURY_LOG_H
+#define MERCURY_LOG_H
+
+#include "mercury_dlog.h"
+#include "mercury_queue.h"
+#include "mercury_util_config.h"
+
+#include <stdio.h>
+
+/*****************/
+/* Public Macros */
+/*****************/
+
+/* For compatibility */
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ < 199901L)
+#if defined(__GNUC__) && (__GNUC__ >= 2)
+#define __func__ __FUNCTION__
+#else
+#define __func__ ""
+#endif
+#elif defined(_WIN32)
+#define __func__ __FUNCTION__
+#endif
+
+/* Cat macro */
+#define HG_UTIL_CAT(x, y) x##y
+
+/* Stringify macro */
+#define HG_UTIL_STRINGIFY(x) #x
+
+/* Constructor (used to initialize log outlets) */
+#define HG_UTIL_CONSTRUCTOR __attribute__((constructor))
+
+/* Available log levels, additional log levels should be added to that list by
+ * order of verbosity. Format is:
+ * - enum type
+ * - level name
+ * - default output
+ *
+ * error:     print error level logs
+ * warning:   print warning level logs
+ * min_debug: store minimal debug information and defer printing until error
+ * debug:     print debug level logs
+ */
+#define HG_LOG_LEVELS \
+    X(HG_LOG_LEVEL_NONE, "", NULL)                  /*!< no log */          \
+    X(HG_LOG_LEVEL_ERROR, "error", &stderr)         /*!< error log type */   \
+    X(HG_LOG_LEVEL_WARNING, "warning", &stdout)     /*!< warning log type */ \
+    X(HG_LOG_LEVEL_MIN_DEBUG, "min_debug", &stdout) /*!< debug log type */   \
+    X(HG_LOG_LEVEL_DEBUG, "debug", &stdout)         /*!< debug log type */   \
+    X(HG_LOG_LEVEL_MAX, "", NULL)
+
+/* HG_LOG_OUTLET: global variable name of log outlet. */
+#define HG_LOG_OUTLET(name) HG_UTIL_CAT(name, _log_outlet_g)
+
+/* HG_LOG_OUTLET_DECL: declare an outlet. */
+#define HG_LOG_OUTLET_DECL(name) struct hg_log_outlet HG_LOG_OUTLET(name)
+
+/*
+ * HG_LOG_OUTLET_INITIALIZER: initializer for a log in a global variable.
+ * (parent and debug_log are optional and can be set to NULL)
+ */
+#define HG_LOG_OUTLET_INITIALIZER(name, state, parent, debug_log) \
+    { \
+        HG_UTIL_STRINGIFY(name), state, HG_LOG_LEVEL_NONE, parent, debug_log, \
+        { \
+            NULL \
+        } \
+    }
+
+/* HG_LOG_OUTLET_SUBSYS_INITIALIZER: initializer for a sub-system log. */
+#define HG_LOG_OUTLET_SUBSYS_INITIALIZER(name, parent_name) \
+    HG_LOG_OUTLET_INITIALIZER(name, HG_LOG_PASS, &HG_LOG_OUTLET(parent_name), NULL)
+
+/* HG_LOG_OUTLET_SUBSYS_STATE_INITIALIZER: initializer for a sub-system log with
+ * a defined state. */
+#define HG_LOG_OUTLET_SUBSYS_STATE_INITIALIZER(name, parent_name, state) \
+    HG_LOG_OUTLET_INITIALIZER(name, state, &HG_LOG_OUTLET(parent_name), NULL)
+
+/* HG_LOG_SUBSYS_REGISTER: register a name */
+#define HG_LOG_SUBSYS_REGISTER(name) \
+    static void HG_UTIL_CAT(hg_log_outlet_, name)(void) HG_UTIL_CONSTRUCTOR; \
+    static void HG_UTIL_CAT(hg_log_outlet_, name)(void) \
+    { \
+        hg_log_outlet_register(&HG_LOG_OUTLET(name)); \
+    } \
+    /* Keep unused prototype to use semicolon at end of macro */ \
+    void hg_log_outlet_##name##_unused(void)
+
+/* HG_LOG_SUBSYS_DECL_REGISTER: declare and register a log outlet. */
+#define HG_LOG_SUBSYS_DECL_REGISTER(name, parent_name) \
+    struct hg_log_outlet HG_LOG_OUTLET(name) = HG_LOG_OUTLET_SUBSYS_INITIALIZER(name, parent_name); \
+    HG_LOG_SUBSYS_REGISTER(name)
+
+/* HG_LOG_SUBSYS_DECL_STATE_REGISTER: declare and register a log outlet and
+ * enforce an init state. */
+#define HG_LOG_SUBSYS_DECL_STATE_REGISTER(name, parent_name, state) \
+    struct hg_log_outlet HG_LOG_OUTLET(name) = \
+        HG_LOG_OUTLET_SUBSYS_STATE_INITIALIZER(name, parent_name, state); \
+    HG_LOG_SUBSYS_REGISTER(name)
+
+/* Log macro */
+#define HG_LOG_WRITE(name, log_level, ...) \
+    do { \
+        if (HG_LOG_OUTLET(name).level < log_level) \
+            break; \
+        hg_log_write(&HG_LOG_OUTLET(name), log_level, __FILE__, __LINE__, __func__, __VA_ARGS__); \
+    } while (0)
+
+/* Log macro */
+#define HG_LOG_WRITE_DEBUG(name, debug_func, ...) \
+    do { \
+        if (HG_LOG_OUTLET(name).level < HG_LOG_LEVEL_MIN_DEBUG) \
+            break; \
+        if (HG_LOG_OUTLET(name).level >= HG_LOG_LEVEL_MIN_DEBUG && HG_LOG_OUTLET(name).debug_log) \
+            hg_dlog_addlog(HG_LOG_OUTLET(name).debug_log, __FILE__, __LINE__, __func__, NULL, NULL); \
+        if (HG_LOG_OUTLET(name).level == HG_LOG_LEVEL_DEBUG) { \
+            hg_log_write(&HG_LOG_OUTLET(name), HG_LOG_LEVEL_DEBUG, __FILE__, __LINE__, __func__, \
+                         __VA_ARGS__); \
+            debug_func; \
+        } \
+    } while (0)
+
+/**
+ * Additional macros for debug log support.
+ */
+
+/* HG_LOG_DEBUG_DLOG: global variable name of debug log. */
+#define HG_LOG_DEBUG_DLOG(name) HG_UTIL_CAT(name, _dlog_g)
+
+/* HG_LOG_DEBUG_LE: global variable name of debug log entries. */
+#define HG_LOG_DEBUG_LE(name) HG_UTIL_CAT(name, _dlog_entries_g)
+
+/* HG_LOG_DEBUG_DECL_DLOG: declare new debug log. */
+#define HG_LOG_DEBUG_DECL_DLOG(name) struct hg_dlog HG_LOG_DEBUG_DLOG(name)
+
+/* HG_LOG_DEBUG_DECL_LE: declare array of debug log entries. */
+#define HG_LOG_DEBUG_DECL_LE(name, size) struct hg_dlog_entry HG_LOG_DEBUG_LE(name)[size]
+
+/* HG_LOG_DLOG_INITIALIZER: initializer for a debug log */
+#define HG_LOG_DLOG_INITIALIZER(name, size) \
+    HG_DLOG_INITIALIZER(HG_UTIL_STRINGIFY(name), HG_LOG_DEBUG_LE(name), size, 1)
+
+/* HG_LOG_OUTLET_SUBSYS_DLOG_INITIALIZER: initializer for a sub-system with
+ * debug log. */
+#define HG_LOG_OUTLET_SUBSYS_DLOG_INITIALIZER(name, parent_name) \
+    HG_LOG_OUTLET_INITIALIZER(name, HG_LOG_PASS, &HG_LOG_OUTLET(parent_name), &HG_LOG_DEBUG_DLOG(name))
+
+/* HG_LOG_SUBSYS_DLOG_DECL_REGISTER: declare and register a log outlet with
+ * debug log. */
+#define HG_LOG_SUBSYS_DLOG_DECL_REGISTER(name, parent_name) \
+    struct hg_log_outlet HG_LOG_OUTLET(name) = HG_LOG_OUTLET_SUBSYS_DLOG_INITIALIZER(name, parent_name); \
+    HG_LOG_SUBSYS_REGISTER(name)
+
+/* HG_LOG_ADD_COUNTER32: add 32-bit debug log counter */
+#define HG_LOG_ADD_COUNTER32(name, counter_ptr, counter_name, counter_desc) \
+    hg_dlog_mkcount32(HG_LOG_OUTLET(name).debug_log, counter_ptr, counter_name, counter_desc)
+
+/* HG_LOG_ADD_COUNTER64: add 64-bit debug log counter */
+#define HG_LOG_ADD_COUNTER64(name, counter_ptr, counter_name, counter_desc) \
+    hg_dlog_mkcount64(HG_LOG_OUTLET(name).debug_log, counter_ptr, counter_name, counter_desc)
+
+/*************************************/
+/* Public Type and Struct Definition */
+/*************************************/
+
+#define X(a, b, c) a,
+/* Log levels */
+enum hg_log_level { HG_LOG_LEVELS };
+#undef X
+
+/* Log states */
+enum hg_log_state { HG_LOG_PASS, HG_LOG_OFF, HG_LOG_ON };
+
+/* Log outlet */
+struct hg_log_outlet {
+    const char *          name;      /* Name of outlet */
+    enum hg_log_state     state;     /* Init state of outlet */
+    enum hg_log_level     level;     /* Level of outlet */
+    struct hg_log_outlet *parent;    /* Parent of outlet */
+    struct hg_dlog *      debug_log; /* Debug log to use */
+    HG_QUEUE_ENTRY(hg_log_outlet) entry; /* List entry */
+};
+
+/*********************/
+/* Public Prototypes */
+/*********************/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Set the global log level.
+ *
+ * \param log_level [IN] enum log level type
+ */
+HG_UTIL_PUBLIC void hg_log_set_level(enum hg_log_level log_level);
+
+/**
+ * Get the global log level.
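+ *
+ * (Editorial note, hedged: a minimal sketch of the level API; the helper
+ * name is illustrative and not part of the original Mercury header.)
+ *
+ *     hg_log_set_level(HG_LOG_LEVEL_WARNING);   /* errors and warnings only */
+ *     if (hg_log_get_level() >= HG_LOG_LEVEL_DEBUG)
+ *         expensive_debug_dump();               /* hypothetical helper */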
+ * + * \return global log_level + */ +HG_UTIL_PUBLIC enum hg_log_level hg_log_get_level(void); + +/** + * Set the log subsystems from a string. Format is: subsys1,subsys2,... + * Subsys can also be forced to be disabled with "~", e.g., ~subsys1 + * + * \param log_level [IN] null terminated string + */ +HG_UTIL_PUBLIC void hg_log_set_subsys(const char *log_subsys); + +/** + * Get the log subsystems as a string. Format is similar to hg_log_set_subsys(). + * Buffer returned is static. + * + * \return string of enabled log subsystems + */ +HG_UTIL_PUBLIC const char *hg_log_get_subsys(void); + +/** + * Set a specific subsystem's log level. + */ +HG_UTIL_PUBLIC void hg_log_set_subsys_level(const char *subsys, enum hg_log_level log_level); + +/** + * Get the log level from a string. + * + * \param log_level [IN] null terminated string + * + * \return log type enum value + */ +HG_UTIL_PUBLIC enum hg_log_level hg_log_name_to_level(const char *log_level); + +/** + * Set the logging function. + * + * \param log_func [IN] pointer to function + */ +HG_UTIL_PUBLIC void hg_log_set_func(int (*log_func)(FILE *stream, const char *format, ...)); + +/** + * Set the stream for error output. + * + * \param stream [IN/OUT] pointer to stream + */ +HG_UTIL_PUBLIC void hg_log_set_stream_error(FILE *stream); + +/** + * Get the stream for error output. + * + * \return pointer to stream + */ +HG_UTIL_PUBLIC FILE *hg_log_get_stream_error(void); + +/** + * Set the stream for warning output. + * + * \param stream [IN/OUT] pointer to stream + */ +HG_UTIL_PUBLIC void hg_log_set_stream_warning(FILE *stream); + +/** + * Get the stream for warning output. + * + * \return pointer to stream + */ +HG_UTIL_PUBLIC FILE *hg_log_get_stream_warning(void); + +/** + * Set the stream for debug output. + * + * \param stream [IN/OUT] pointer to stream + */ +HG_UTIL_PUBLIC void hg_log_set_stream_debug(FILE *stream); + +/** + * Get the stream for debug output. + * + * \return pointer to stream + */ +HG_UTIL_PUBLIC FILE *hg_log_get_stream_debug(void); + +/** + * Register log outlet. + * + * \param outlet [IN] log outlet + */ +HG_UTIL_PUBLIC void hg_log_outlet_register(struct hg_log_outlet *outlet); + +/** + * Write log. + * + * \param outlet [IN] log outlet + * \param log_level [IN] log level + * \param file [IN] file name + * \param line [IN] line number + * \param func [IN] function name + * \param format [IN] string format + */ +HG_UTIL_PUBLIC void hg_log_write(struct hg_log_outlet *outlet, enum hg_log_level log_level, const char *file, + unsigned int line, const char *func, const char *format, ...) + HG_UTIL_PRINTF_LIKE(6, 7); + +/*********************/ +/* Public Variables */ +/*********************/ + +/* Top error outlet */ +extern HG_UTIL_PUBLIC HG_LOG_OUTLET_DECL(hg); + +#ifdef __cplusplus +} +#endif + +#endif /* MERCURY_LOG_H */ diff --git a/src/mercury/include/mercury_macros.h b/src/mercury/include/mercury_macros.h new file mode 100644 index 00000000000..5950679edaf --- /dev/null +++ b/src/mercury/include/mercury_macros.h @@ -0,0 +1,108 @@ +/* + * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy, + * UChicago Argonne, LLC and The HDF Group. + * All rights reserved. + * + * The full copyright notice, including terms governing use, modification, + * and redistribution, is contained in the COPYING file that can be + * found at the root of the source code distribution tree. 
+ */
+
+#ifndef MERCURY_MACROS_H
+#define MERCURY_MACROS_H
+
+#include "mercury.h"
+#include "mercury_bulk.h"
+#include "mercury_proc.h"
+#include "mercury_proc_bulk.h"
+
+#ifdef HG_HAS_BOOST
+#include <boost/preprocessor.hpp>
+
+/**
+ * The purpose of these macros is to facilitate generation of encoding/decoding
+ * procs as well as the registration of new routines to an existing HG class.
+ * HG_XXX macros are private macros / MERCURY_XXX are public macros.
+ * Macros defined in this file are:
+ * - MERCURY_REGISTER
+ * - MERCURY_GEN_PROC
+ * - MERCURY_GEN_STRUCT_PROC
+ */
+
+/****************/
+/* Local Macros */
+/****************/
+
+/* Get type / name */
+#define HG_GEN_GET_TYPE(field) BOOST_PP_SEQ_HEAD(field)
+#define HG_GEN_GET_NAME(field) BOOST_PP_SEQ_CAT(BOOST_PP_SEQ_TAIL(field))
+
+/* Get struct field */
+#define HG_GEN_STRUCT_FIELD(r, data, param) HG_GEN_GET_TYPE(param) HG_GEN_GET_NAME(param);
+
+/* Generate structure */
+#define HG_GEN_STRUCT(struct_type_name, fields) \
+    typedef struct { \
+        BOOST_PP_SEQ_FOR_EACH(HG_GEN_STRUCT_FIELD, , fields) \
+ \
+    } struct_type_name;
+
+/* Generate proc for struct field */
+#define HG_GEN_PROC(r, struct_name, field) \
+    ret = BOOST_PP_CAT(hg_proc_, HG_GEN_GET_TYPE(field)(proc, &struct_name->HG_GEN_GET_NAME(field))); \
+    if (unlikely(ret != HG_SUCCESS)) { \
+        return ret; \
+    }
+
+/* Generate proc for struct */
+#define HG_GEN_STRUCT_PROC(struct_type_name, fields) \
+    static HG_INLINE hg_return_t BOOST_PP_CAT(hg_proc_, struct_type_name)(hg_proc_t proc, void *data) \
+    { \
+        hg_return_t ret = HG_SUCCESS; \
+        struct_type_name *struct_data = (struct_type_name *)data; \
+ \
+        BOOST_PP_SEQ_FOR_EACH(HG_GEN_PROC, struct_data, fields) \
+ \
+        return ret; \
+    }
+
+/*****************/
+/* Public Macros */
+/*****************/
+
+/* Register func_name */
+#define MERCURY_REGISTER(hg_class, func_name, in_struct_type_name, out_struct_type_name, rpc_cb) \
+    HG_Register_name(hg_class, func_name, BOOST_PP_CAT(hg_proc_, in_struct_type_name), \
+                     BOOST_PP_CAT(hg_proc_, out_struct_type_name), rpc_cb)
+
+/* Generate struct and corresponding struct proc */
+#define MERCURY_GEN_PROC(struct_type_name, fields) \
+    HG_GEN_STRUCT(struct_type_name, fields) \
+    HG_GEN_STRUCT_PROC(struct_type_name, fields)
+
+/* In the case of user defined structures / MERCURY_GEN_STRUCT_PROC can be
+ * used to generate the corresponding proc routine.
+ * E.g., if user defined struct:
+ *   typedef struct {
+ *     uint64_t cookie;
+ *   } bla_handle_t;
+ * MERCURY_GEN_STRUCT_PROC( struct_type_name, field sequence ):
+ *   MERCURY_GEN_STRUCT_PROC( bla_handle_t, ((uint64_t)(cookie)) )
+ */
+#define MERCURY_GEN_STRUCT_PROC(struct_type_name, fields) HG_GEN_STRUCT_PROC(struct_type_name, fields)
+
+#else /* HG_HAS_BOOST */
+
+/* Register func_name */
+#define MERCURY_REGISTER(hg_class, func_name, in_struct_type_name, out_struct_type_name, rpc_cb) \
+    HG_Register_name(hg_class, func_name, hg_proc_##in_struct_type_name, hg_proc_##out_struct_type_name, \
+                     rpc_cb)
+
+#endif /* HG_HAS_BOOST */
+
+/* If no input args or output args, a void type can be
+ * passed to MERCURY_REGISTER
+ */
+#define hg_proc_void NULL
+
+#endif /* MERCURY_MACROS_H */
diff --git a/src/mercury/include/mercury_mem.h b/src/mercury/include/mercury_mem.h
new file mode 100644
index 00000000000..3c15c01f90d
--- /dev/null
+++ b/src/mercury/include/mercury_mem.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy,
+ * UChicago Argonne, LLC and The HDF Group.
+ * All rights reserved.
+ * + * The full copyright notice, including terms governing use, modification, + * and redistribution, is contained in the COPYING file that can be + * found at the root of the source code distribution tree. + */ + +#ifndef MERCURY_MEM_H +#define MERCURY_MEM_H + +#include "mercury_util_config.h" + +/*************************************/ +/* Public Type and Struct Definition */ +/*************************************/ + +/*****************/ +/* Public Macros */ +/*****************/ + +#define HG_MEM_CACHE_LINE_SIZE 64 +#define HG_MEM_PAGE_SIZE 4096 + +/*********************/ +/* Public Prototypes */ +/*********************/ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Get system default page size. + * + * \return page size on success or negative on failure + */ +HG_UTIL_PUBLIC long hg_mem_get_page_size(void); + +/** + * Allocate size bytes and return a pointer to the allocated memory. + * The memory address will be a multiple of alignment, which must be a power of + * two, and size should be a multiple of alignment. + * + * \param alignment [IN] alignment size + * \param size [IN] total requested size + * + * \return a pointer to the allocated memory, or NULL in case of failure + */ +HG_UTIL_PUBLIC void *hg_mem_aligned_alloc(size_t alignment, size_t size); + +/** + * Free memory allocated from hg_aligned_alloc(). + * + * \param mem_ptr [IN] pointer to allocated memory + */ +HG_UTIL_PUBLIC void hg_mem_aligned_free(void *mem_ptr); + +/** + * Allocate a buffer with a `size`-bytes, `alignment`-aligned payload + * preceded by a `header_size` header, padding the allocation with up + * to `alignment - 1` bytes to ensure that the payload is properly aligned. + * + * If `alignment` is 0, do not try to align the payload. It's ok if + * `size` is 0, however, behavior is undefined if both `header_size` + * and `size` are 0. + * + * \param header_size [IN] size of header + * \param alignment [IN] alignment size + * \param size [IN] requested payload size + * + * \return a pointer to the payload or NULL on failure + */ +HG_UTIL_PUBLIC void *hg_mem_header_alloc(size_t header_size, size_t alignment, size_t size); + +/** + * Free the memory that was returned previously by a call to + * `hg_mem_header_alloc()`. + * + * \param header_size [IN] size of header + * \param alignment [IN] alignment size + * \param mem_ptr [IN] memory pointer + */ +HG_UTIL_PUBLIC void hg_mem_header_free(size_t header_size, size_t alignment, void *mem_ptr); + +/** + * Create/open a shared-memory mapped file of size \size with name \name. + * + * \param name [IN] name of mapped file + * \param size [IN] total requested size + * \param create [IN] create file if not existing + * + * \return a pointer to the mapped memory region, or NULL in case of failure + */ +HG_UTIL_PUBLIC void *hg_mem_shm_map(const char *name, size_t size, hg_util_bool_t create); + +/** + * Unmap a previously mapped region and close the file. 
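+ *
+ * (Editorial sketch, hedged: pairing with hg_mem_shm_map(); the region name
+ * and size below are illustrative, not from the original header.)
+ *
+ *     void *buf = hg_mem_shm_map("my_region", 4096, HG_UTIL_TRUE);
+ *     if (buf != NULL) {
+ *         memset(buf, 0, 4096);                      /* use the mapping */
+ *         (void) hg_mem_shm_unmap("my_region", buf, 4096);
+ *     }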
+ * + * \param name [IN] name of mapped file + * \param mem_ptr [IN] pointer to mapped memory region + * \param size [IN] size range of the mapped region + * + * \return non-negative on success, or negative in case of failure + */ +HG_UTIL_PUBLIC int hg_mem_shm_unmap(const char *name, void *mem_ptr, size_t size); + +#ifdef __cplusplus +} +#endif + +#endif /* MERCURY_MEM_H */ diff --git a/src/mercury/include/mercury_mem_pool.h b/src/mercury/include/mercury_mem_pool.h new file mode 100644 index 00000000000..d2acfdd6e7f --- /dev/null +++ b/src/mercury/include/mercury_mem_pool.h @@ -0,0 +1,121 @@ +/* + * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy, + * UChicago Argonne, LLC and The HDF Group. + * All rights reserved. + * + * The full copyright notice, including terms governing use, modification, + * and redistribution, is contained in the COPYING file that can be + * found at the root of the source code distribution tree. + */ + +#ifndef MERCURY_MEM_POOL_H +#define MERCURY_MEM_POOL_H + +#include "mercury_util_config.h" + +/*************************************/ +/* Public Type and Struct Definition */ +/*************************************/ + +/** + * Register memory block. + * + * \param buf [IN] pointer to buffer + * \param size [IN] buffer size + * \param handle [OUT] handle + * \param arg [IN/OUT] optional arguments + * + * \return HG_UTIL_SUCCESS if successful / error code otherwise + */ +typedef int (*hg_mem_pool_register_func_t)(const void *buf, size_t size, void **handle, void *arg); + +/** + * Deregister memory block. + * + * \param handle [IN/OUT] handle + * \param arg [IN/OUT] optional arguments + * + * \return HG_UTIL_SUCCESS if successful / error code otherwise + */ +typedef int (*hg_mem_pool_deregister_func_t)(void *handle, void *arg); + +/*****************/ +/* Public Macros */ +/*****************/ + +/*********************/ +/* Public Prototypes */ +/*********************/ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Create a memory pool with \block_count of size \chunk_count x \chunk_size + * bytes. Optionally register and deregister memory for each block using + * \register_func and \deregister_func respectively. + * + * \param chunk_size [IN] size of chunks + * \param chunk_count [IN] number of chunks + * \param block_count [IN] number of blocks + * \param register_func [IN] pointer to register function + * \param deregister_func [IN] pointer to deregister function + * \param arg [IN/OUT] optional arguments passed to register functions + * + * \return HG_UTIL_SUCCESS if successful / error code otherwise + */ +HG_UTIL_PUBLIC struct hg_mem_pool *hg_mem_pool_create(size_t chunk_size, size_t chunk_count, + size_t block_count, + hg_mem_pool_register_func_t register_func, + hg_mem_pool_deregister_func_t deregister_func, + void * arg); + +/** + * Destroy a memory pool. + * + * \param hg_mem_pool [IN/OUT] pointer to memory pool + * + */ +HG_UTIL_PUBLIC void hg_mem_pool_destroy(struct hg_mem_pool *hg_mem_pool); + +/** + * Allocate \size bytes and optionally return a memory handle + * \mr_handle if registration functions were provided. + * + * \param hg_mem_pool [IN/OUT] pointer to memory pool + * \param size [IN] requested size + * \param mr_handle [OUT] pointer to memory handle + * + * \return pointer to memory block + */ +HG_UTIL_PUBLIC void *hg_mem_pool_alloc(struct hg_mem_pool *hg_mem_pool, size_t size, void **mr_handle); + +/** + * Release memory at address \mem_ptr. 
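+ *
+ * (Editorial sketch, hedged: a typical pool lifecycle assuming no
+ * registration callbacks are needed; sizes and names are illustrative.)
+ *
+ *     struct hg_mem_pool *pool = hg_mem_pool_create(4096, 16, 2, NULL, NULL, NULL);
+ *     void *mr    = NULL;
+ *     void *chunk = hg_mem_pool_alloc(pool, 4096, &mr);   /* one 4 KiB chunk */
+ *     hg_mem_pool_free(pool, chunk, mr);
+ *     hg_mem_pool_destroy(pool);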
+ *
+ * \param hg_mem_pool [IN/OUT] pointer to memory pool
+ * \param mem_ptr [IN] pointer to memory
+ * \param mr_handle [IN] pointer to memory handle
+ *
+ */
+HG_UTIL_PUBLIC void hg_mem_pool_free(struct hg_mem_pool *hg_mem_pool, void *mem_ptr, void *mr_handle);
+
+/**
+ * Retrieve chunk offset relative to the address used for registering
+ * the memory block it belongs to.
+ *
+ * \param hg_mem_pool [IN/OUT] pointer to memory pool
+ * \param mem_ptr [IN] pointer to memory
+ * \param mr_handle [IN] pointer to memory handle
+ *
+ * \return offset within registered block.
+ */
+HG_UTIL_PUBLIC size_t hg_mem_pool_chunk_offset(struct hg_mem_pool *hg_mem_pool, void *mem_ptr,
+                                               void *mr_handle);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* MERCURY_MEM_POOL_H */
diff --git a/src/mercury/include/mercury_poll.h b/src/mercury/include/mercury_poll.h
new file mode 100644
index 00000000000..f4072a59041
--- /dev/null
+++ b/src/mercury/include/mercury_poll.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy,
+ * UChicago Argonne, LLC and The HDF Group.
+ * All rights reserved.
+ *
+ * The full copyright notice, including terms governing use, modification,
+ * and redistribution, is contained in the COPYING file that can be
+ * found at the root of the source code distribution tree.
+ */
+
+#ifndef MERCURY_POLL_H
+#define MERCURY_POLL_H
+
+#include "mercury_util_config.h"
+
+/*************************************/
+/* Public Type and Struct Definition */
+/*************************************/
+
+typedef struct hg_poll_set hg_poll_set_t;
+
+typedef union hg_poll_data {
+    void *           ptr;
+    int              fd;
+    hg_util_uint32_t u32;
+    hg_util_uint64_t u64;
+} hg_poll_data_t;
+
+struct hg_poll_event {
+    hg_util_uint32_t events; /* Poll events */
+    hg_poll_data_t   data;   /* User data variable */
+};
+
+/*****************/
+/* Public Macros */
+/*****************/
+
+/**
+ * Polling events.
+ */
+#define HG_POLLIN   (1 << 0) /* There is data to read. */
+#define HG_POLLOUT  (1 << 1) /* Writing now will not block. */
+#define HG_POLLERR  (1 << 2) /* Error condition. */
+#define HG_POLLHUP  (1 << 3) /* Hung up. */
+#define HG_POLLINTR (1 << 4) /* Interrupted. */
+
+/*********************/
+/* Public Prototypes */
+/*********************/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Create a new poll set.
+ *
+ * \return Pointer to poll set or NULL in case of failure
+ */
+HG_UTIL_PUBLIC hg_poll_set_t *hg_poll_create(void);
+
+/**
+ * Destroy a poll set.
+ *
+ * \param poll_set [IN/OUT] pointer to poll set
+ *
+ * \return Non-negative on success or negative on failure
+ */
+HG_UTIL_PUBLIC int hg_poll_destroy(hg_poll_set_t *poll_set);
+
+/**
+ * Get a file descriptor from an existing poll set.
+ *
+ * \param poll_set [IN] pointer to poll set
+ *
+ * \return Non-negative on success or negative on failure
+ */
+HG_UTIL_PUBLIC int hg_poll_get_fd(hg_poll_set_t *poll_set);
+
+/**
+ * Add file descriptor to poll set.
+ *
+ * \param poll_set [IN] pointer to poll set
+ * \param fd [IN] file descriptor
+ * \param event [IN] pointer to event struct
+ *
+ * \return Non-negative on success or negative on failure
+ */
+HG_UTIL_PUBLIC int hg_poll_add(hg_poll_set_t *poll_set, int fd, struct hg_poll_event *event);
+
+/**
+ * Remove file descriptor from poll set.
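+ *
+ * (Editorial sketch, hedged: an end-to-end poll-set sequence; `fd` and the
+ * timeout are illustrative.)
+ *
+ *     hg_poll_set_t *ps = hg_poll_create();
+ *     struct hg_poll_event ev = {HG_POLLIN, {.fd = fd}};
+ *     hg_poll_add(ps, fd, &ev);
+ *     struct hg_poll_event got[8];
+ *     unsigned int n = 0;
+ *     hg_poll_wait(ps, 100, 8, got, &n);             /* wait up to 100 ms */
+ *     hg_poll_remove(ps, fd);
+ *     hg_poll_destroy(ps);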
+ *
+ * \param poll_set [IN] pointer to poll set
+ * \param fd [IN] file descriptor
+ *
+ * \return Non-negative on success or negative on failure
+ */
+HG_UTIL_PUBLIC int hg_poll_remove(hg_poll_set_t *poll_set, int fd);
+
+/**
+ * Wait on a poll set for timeout ms, and return at most max_events.
+ *
+ * \param poll_set [IN] pointer to poll set
+ * \param timeout [IN] timeout (in milliseconds)
+ * \param max_events [IN] max number of events
+ * \param events [IN/OUT] array of events to be returned
+ * \param actual_events [OUT] actual number of events returned
+ *
+ * \return Non-negative on success or negative on failure
+ */
+HG_UTIL_PUBLIC int hg_poll_wait(hg_poll_set_t *poll_set, unsigned int timeout, unsigned int max_events,
+                                struct hg_poll_event events[], unsigned int *actual_events);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* MERCURY_POLL_H */
diff --git a/src/mercury/include/mercury_proc.h b/src/mercury/include/mercury_proc.h
new file mode 100644
index 00000000000..f1426341117
--- /dev/null
+++ b/src/mercury/include/mercury_proc.h
@@ -0,0 +1,769 @@
+/*
+ * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy,
+ * UChicago Argonne, LLC and The HDF Group.
+ * All rights reserved.
+ *
+ * The full copyright notice, including terms governing use, modification,
+ * and redistribution, is contained in the COPYING file that can be
+ * found at the root of the source code distribution tree.
+ */
+
+#ifndef MERCURY_PROC_H
+#define MERCURY_PROC_H
+
+#include "mercury_types.h"
+
+#include <string.h>
+#ifdef HG_HAS_XDR
+#include <limits.h>
+#include <rpc/types.h>
+#include <rpc/xdr.h>
+#ifdef __APPLE__
+#define xdr_int8_t xdr_char
+#define xdr_uint8_t xdr_u_char
+#define xdr_uint16_t xdr_u_int16_t
+#define xdr_uint32_t xdr_u_int32_t
+#define xdr_uint64_t xdr_u_int64_t
+#endif
+#define xdr_hg_int8_t xdr_int8_t
+#define xdr_hg_uint8_t xdr_uint8_t
+#define xdr_hg_int16_t xdr_int16_t
+#define xdr_hg_uint16_t xdr_uint16_t
+#define xdr_hg_int32_t xdr_int32_t
+#define xdr_hg_uint32_t xdr_uint32_t
+#define xdr_hg_int64_t xdr_int64_t
+#define xdr_hg_uint64_t xdr_uint64_t
+#endif
+
+/*************************************/
+/* Public Type and Struct Definition */
+/*************************************/
+
+/**
+ * Hash methods available for proc.
+ */
+typedef enum { HG_CRC16, HG_CRC32, HG_CRC64, HG_NOHASH } hg_proc_hash_t;
+
+/*****************/
+/* Public Macros */
+/*****************/
+
+/* Encode/decode version number into uint32 */
+#define HG_GET_MAJOR(value) ((value >> 24) & 0xFF)
+#define HG_GET_MINOR(value) ((value >> 16) & 0xFF)
+#define HG_GET_PATCH(value) (value & 0xFFFF)
+#define HG_VERSION ((HG_VERSION_MAJOR << 24) | (HG_VERSION_MINOR << 16) | HG_VERSION_PATCH)
+
+/**
+ * Operation flags.
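+ *
+ * (Editorial aside, hedged: the version macros above pack major/minor/patch
+ * into a uint32. For example, major 2, minor 0, patch 1 gives
+ * (2 << 24) | (0 << 16) | 1 == 0x02000001, so HG_GET_MAJOR() == 2,
+ * HG_GET_MINOR() == 0 and HG_GET_PATCH() == 1.)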
+ */ +#define HG_PROC_SM (1 << 0) +#define HG_PROC_BULK_EAGER (1 << 1) + +/* Branch predictor hints */ +#ifndef _WIN32 +#ifndef likely +#define likely(x) __builtin_expect(!!(x), 1) +#endif +#ifndef unlikely +#define unlikely(x) __builtin_expect(!!(x), 0) +#endif +#else +#ifndef likely +#define likely(x) (x) +#endif +#ifndef unlikely +#define unlikely(x) (x) +#endif +#endif + +/* Check whether size exceeds current proc size left */ +#ifdef HG_HAS_XDR +#define HG_PROC_CHECK_SIZE(proc, size, label, ret) \ + do { \ + if (unlikely(((struct hg_proc *)proc)->current_buf->size_left < size)) { \ + ret = HG_OVERFLOW; \ + goto label; \ + } \ + } while (0) +#else +#define HG_PROC_CHECK_SIZE(proc, size, label, ret) \ + do { \ + if (unlikely(((struct hg_proc *)proc)->current_buf->size_left < size)) { \ + ret = hg_proc_set_size(proc, hg_proc_get_size(proc) + size); \ + if (ret != HG_SUCCESS) \ + goto label; \ + } \ + } while (0) +#endif + +/* Encode type */ +#define HG_PROC_TYPE_ENCODE(proc, data, size) \ + memcpy(((struct hg_proc *)proc)->current_buf->buf_ptr, data, size) + +/* Decode type */ +#define HG_PROC_TYPE_DECODE(proc, data, size) \ + memcpy(data, ((struct hg_proc *)proc)->current_buf->buf_ptr, size) + +/* Update proc pointers */ +#define HG_PROC_UPDATE(proc, size) \ + do { \ + ((struct hg_proc *)proc)->current_buf->buf_ptr = \ + (char *)((struct hg_proc *)proc)->current_buf->buf_ptr + size; \ + ((struct hg_proc *)proc)->current_buf->size_left -= size; \ + } while (0) + +/* Update checksum */ +#ifdef HG_HAS_CHECKSUMS +#define HG_PROC_CHECKSUM_UPDATE(proc, data, size) hg_proc_checksum_update(proc, data, size) +#else +#define HG_PROC_CHECKSUM_UPDATE(proc, data, size) +#endif + +/* Base proc function */ +#ifdef HG_HAS_XDR +#define HG_PROC_TYPE(proc, type, data, label, ret) \ + do { \ + HG_PROC_CHECK_SIZE(proc, sizeof(type), label, ret); \ + \ + if (xdr_##type(hg_proc_get_xdr_ptr(proc), data) == 0) { \ + ret = HG_PROTOCOL_ERROR; \ + goto label; \ + } \ + \ + HG_PROC_UPDATE(proc, sizeof(type)); \ + HG_PROC_CHECKSUM_UPDATE(proc, data, sizeof(type)); \ + } while (0) +#else +#define HG_PROC_TYPE(proc, type, data, label, ret) \ + do { \ + /* Do nothing in HG_FREE for basic types */ \ + if (hg_proc_get_op(proc) == HG_FREE) \ + goto label; \ + \ + /* If not enough space allocate extra space if encoding or just */ \ + /* get extra buffer if decoding */ \ + HG_PROC_CHECK_SIZE(proc, sizeof(type), label, ret); \ + \ + /* Encode, decode type */ \ + if (hg_proc_get_op(proc) == HG_ENCODE) \ + HG_PROC_TYPE_ENCODE(proc, data, sizeof(type)); \ + else \ + HG_PROC_TYPE_DECODE(proc, data, sizeof(type)); \ + \ + /* Update proc pointers etc */ \ + HG_PROC_UPDATE(proc, sizeof(type)); \ + HG_PROC_CHECKSUM_UPDATE(proc, data, sizeof(type)); \ + } while (0) +#endif + +/* Base proc function */ +#ifdef HG_HAS_XDR +#define HG_PROC_BYTES(proc, data, size, label, ret) \ + do { \ + HG_PROC_CHECK_SIZE(proc, size, label, ret); \ + \ + if (xdr_bytes(hg_proc_get_xdr_ptr(proc), (char **)&data, (u_int *)&size, UINT_MAX) == 0) { \ + ret = HG_PROTOCOL_ERROR; \ + goto label; \ + } \ + \ + HG_PROC_UPDATE(proc, size); \ + HG_PROC_CHECKSUM_UPDATE(proc, data, size); \ + } while (0) +#else +#define HG_PROC_BYTES(proc, data, size, label, ret) \ + do { \ + /* Do nothing in HG_FREE for basic types */ \ + if (hg_proc_get_op(proc) == HG_FREE) \ + goto label; \ + \ + /* If not enough space allocate extra space if encoding or just */ \ + /* get extra buffer if decoding */ \ + HG_PROC_CHECK_SIZE(proc, size, label, ret); \ + \ + /* Encode, decode type */ 
\
+        if (hg_proc_get_op(proc) == HG_ENCODE) \
+            HG_PROC_TYPE_ENCODE(proc, data, size); \
+        else \
+            HG_PROC_TYPE_DECODE(proc, data, size); \
+ \
+        /* Update proc pointers etc */ \
+        HG_PROC_UPDATE(proc, size); \
+        HG_PROC_CHECKSUM_UPDATE(proc, data, size); \
+    } while (0)
+#endif
+
+/*********************/
+/* Public Prototypes */
+/*********************/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Create a new encoding/decoding processor.
+ *
+ * \param hg_class [IN] HG class
+ * \param hash [IN] hash method used for computing checksum
+ * (if NULL, checksum is not computed)
+ * hash method: HG_CRC16, HG_CRC64, HG_NOHASH
+ * \param proc [OUT] pointer to abstract processor object
+ *
+ * \return HG_SUCCESS or corresponding HG error code
+ */
+HG_PUBLIC hg_return_t hg_proc_create(hg_class_t *hg_class, hg_proc_hash_t hash, hg_proc_t *proc);
+
+/**
+ * Create a new encoding/decoding processor.
+ *
+ * \param hg_class [IN] HG class
+ * \param buf [IN] pointer to buffer that will be used for
+ * serialization/deserialization
+ * \param buf_size [IN] buffer size
+ * \param op [IN] operation type: HG_ENCODE / HG_DECODE / HG_FREE
+ * \param hash [IN] hash method used for computing checksum
+ * (if NULL, checksum is not computed)
+ * hash method: HG_CRC16, HG_CRC64, HG_NOHASH
+ * \param proc [OUT] pointer to abstract processor object
+ *
+ * \return HG_SUCCESS or corresponding HG error code
+ */
+HG_PUBLIC hg_return_t hg_proc_create_set(hg_class_t *hg_class, void *buf, hg_size_t buf_size, hg_proc_op_t op,
+                                         hg_proc_hash_t hash, hg_proc_t *proc);
+
+/**
+ * Free the processor.
+ *
+ * \param proc [IN/OUT] abstract processor object
+ *
+ * \return HG_SUCCESS or corresponding HG error code
+ */
+HG_PUBLIC hg_return_t hg_proc_free(hg_proc_t proc);
+
+/**
+ * Reset the processor.
+ *
+ * \param proc [IN/OUT] abstract processor object
+ * \param buf [IN] pointer to buffer that will be used for
+ * serialization/deserialization
+ * \param buf_size [IN] buffer size
+ * \param op [IN] operation type: HG_ENCODE / HG_DECODE / HG_FREE
+ *
+ * \return HG_SUCCESS or corresponding HG error code
+ */
+HG_PUBLIC hg_return_t hg_proc_reset(hg_proc_t proc, void *buf, hg_size_t buf_size, hg_proc_op_t op);
+
+/**
+ * Get the HG class associated to the processor.
+ *
+ * \param proc [IN] abstract processor object
+ *
+ * \return HG class
+ */
+static HG_INLINE hg_class_t *hg_proc_get_class(hg_proc_t proc);
+
+/**
+ * Get the operation type associated to the processor.
+ *
+ * \param proc [IN] abstract processor object
+ *
+ * \return Operation type
+ */
+static HG_INLINE hg_proc_op_t hg_proc_get_op(hg_proc_t proc);
+
+/**
+ * Set flags to be associated with the processor.
+ * Flags are reset after a call to hg_proc_reset().
+ *
+ * \param proc [IN/OUT] abstract processor object
+ * \param flags [IN] flags to set
+ */
+static HG_INLINE void hg_proc_set_flags(hg_proc_t proc, hg_uint8_t flags);
+
+/**
+ * Get the flags associated to the processor.
+ *
+ * \param proc [IN] abstract processor object
+ *
+ * \return Non-negative flag value
+ */
+static HG_INLINE hg_uint8_t hg_proc_get_flags(hg_proc_t proc);
+
+/**
+ * Get buffer size available for processing.
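+ *
+ * (Editorial sketch, hedged: a typical encode pass over a caller-supplied
+ * buffer; `buf`, `buf_size` and `my_value` are illustrative.)
+ *
+ *     hg_proc_t proc;
+ *     hg_proc_create_set(hg_class, buf, buf_size, HG_ENCODE, HG_NOHASH, &proc);
+ *     hg_proc_hg_uint64_t(proc, &my_value);          /* encode one integer */
+ *     hg_proc_flush(proc);
+ *     hg_proc_free(proc);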
+ * + * \param proc [IN] abstract processor object + * + * \return Non-negative size value + */ +static HG_INLINE hg_size_t hg_proc_get_size(hg_proc_t proc); + +/** + * Get amount of buffer space that has actually been consumed + * + * \param proc [IN] abstract processor object + * + * \return Non-negative size value + */ +static HG_INLINE hg_size_t hg_proc_get_size_used(hg_proc_t proc); + +/** + * Request a new buffer size. This will modify the size of the buffer + * attached to the processor or create an extra processing buffer. + * + * \param proc [IN/OUT] abstract processor object + * \param buf_size [IN] buffer size + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t hg_proc_set_size(hg_proc_t proc, hg_size_t buf_size); + +/** + * Get size left for processing. + * + * \param proc [IN] abstract processor object + * + * \return Non-negative size value + */ +static HG_INLINE hg_size_t hg_proc_get_size_left(hg_proc_t proc); + +/** + * Get pointer to current buffer. Will reserve data_size for manual + * encoding. + * + * \param proc [IN] abstract processor object + * \param data_size [IN] data size + * + * \return Buffer pointer + */ +HG_PUBLIC void *hg_proc_save_ptr(hg_proc_t proc, hg_size_t data_size); + +/** + * Restore pointer from current buffer. + * + * \param proc [IN] abstract processor object + * \param data [IN] pointer to data + * \param data_size [IN] data size + * + * \return Buffer pointer + */ +HG_PUBLIC hg_return_t hg_proc_restore_ptr(hg_proc_t proc, void *data, hg_size_t data_size); + +#ifdef HG_HAS_XDR +/** + * Get pointer to current XDR stream (for manual encoding). + * + * \param proc [IN] abstract processor object + * + * \return XDR stream pointer + */ +static HG_INLINE XDR *hg_proc_get_xdr_ptr(hg_proc_t proc); +#endif + +/** + * Get eventual extra buffer used by processor. + * + * \param proc [IN] abstract processor object + * + * \return Pointer to buffer or NULL if no extra buffer has been used + */ +static HG_INLINE void *hg_proc_get_extra_buf(hg_proc_t proc); + +/** + * Get eventual size of the extra buffer used by processor. + * + * \param proc [IN] abstract processor object + * + * \return Size of buffer or 0 if no extra buffer has been used + */ +static HG_INLINE hg_size_t hg_proc_get_extra_size(hg_proc_t proc); + +/** + * Set extra buffer to mine (if other calls mine, buffer is no longer freed + * after hg_proc_free()) + * + * \param proc [IN] abstract processor object + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t hg_proc_set_extra_buf_is_mine(hg_proc_t proc, hg_bool_t mine); + +/** + * Flush the proc after data has been encoded or decoded and finalize + * internal checksum if checksum of data processed was initially requested. + * + * \param proc [IN] abstract processor object + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t hg_proc_flush(hg_proc_t proc); + +#ifdef HG_HAS_CHECKSUMS +/** + * Retrieve internal proc checksum hash. + * \remark Must be used after hg_proc_flush() has been called so that the + * internally computed checksum is in a finalized state. + * + * \param proc [IN/OUT] abstract processor object + * \param hash [IN/OUT] pointer to hash + * \param hash_size [IN] hash size + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t hg_proc_checksum_get(hg_proc_t proc, void *hash, hg_size_t hash_size); + +/** + * Verify that the hash passed matches the internal proc checksum. 
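+ *
+ * (Editorial sketch, hedged; see the \remark below for the required
+ * ordering. A 16-bit hash is shown assuming the proc was created with
+ * HG_CRC16.)
+ *
+ *     hg_proc_flush(proc);
+ *     hg_uint16_t hash;
+ *     hg_proc_checksum_get(proc, &hash, sizeof(hash));
+ *     ret = hg_proc_checksum_verify(proc, &hash, sizeof(hash));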
+ * \remark Must be used after hg_proc_flush() has been called so that the + * internally computed checksum is in a finalized state. + * + * \param proc [IN/OUT] abstract processor object + * \param hash [IN] pointer to hash + * \param hash_size [IN] hash size + * + * \return HG_SUCCESS if matches or corresponding HG error code + */ +HG_PUBLIC hg_return_t hg_proc_checksum_verify(hg_proc_t proc, const void *hash, hg_size_t hash_size); +#endif + +/** + * Generic processing routine. + * + * \param proc [IN/OUT] abstract processor object + * \param data [IN/OUT] pointer to data + * + * \return HG_SUCCESS or corresponding HG error code + */ +static HG_INLINE hg_return_t hg_proc_hg_int8_t(hg_proc_t proc, void *data); + +/** + * Generic processing routine. + * + * \param proc [IN/OUT] abstract processor object + * \param data [IN/OUT] pointer to data + * + * \return HG_SUCCESS or corresponding HG error code + */ +static HG_INLINE hg_return_t hg_proc_hg_uint8_t(hg_proc_t proc, void *data); + +/** + * Generic processing routine. + * + * \param proc [IN/OUT] abstract processor object + * \param data [IN/OUT] pointer to data + * + * \return HG_SUCCESS or corresponding HG error code + */ +static HG_INLINE hg_return_t hg_proc_hg_int16_t(hg_proc_t proc, void *data); + +/** + * Generic processing routine. + * + * \param proc [IN/OUT] abstract processor object + * \param data [IN/OUT] pointer to data + * + * \return HG_SUCCESS or corresponding HG error code + */ +static HG_INLINE hg_return_t hg_proc_hg_uint16_t(hg_proc_t proc, void *data); + +/** + * Generic processing routine. + * + * \param proc [IN/OUT] abstract processor object + * \param data [IN/OUT] pointer to data + * + * \return HG_SUCCESS or corresponding HG error code + */ +static HG_INLINE hg_return_t hg_proc_hg_int32_t(hg_proc_t proc, void *data); + +/** + * Generic processing routine. + * + * \param proc [IN/OUT] abstract processor object + * \param data [IN/OUT] pointer to data + * + * \return HG_SUCCESS or corresponding HG error code + */ +static HG_INLINE hg_return_t hg_proc_hg_uint32_t(hg_proc_t proc, void *data); + +/** + * Generic processing routine. + * + * \param proc [IN/OUT] abstract processor object + * \param data [IN/OUT] pointer to data + * + * \return HG_SUCCESS or corresponding HG error code + */ +static HG_INLINE hg_return_t hg_proc_hg_int64_t(hg_proc_t proc, void *data); + +/** + * Generic processing routine. + * + * \param proc [IN/OUT] abstract processor object + * \param data [IN/OUT] pointer to data + * + * \return HG_SUCCESS or corresponding HG error code + */ +static HG_INLINE hg_return_t hg_proc_hg_uint64_t(hg_proc_t proc, void *data); + +/* Note: float types are not supported but can be built on top of the existing + * proc routines; encoding floats using XDR could modify checksum */ + +/** + * Generic processing routine for encoding stream of bytes. 
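+ *
+ * (Editorial sketch, hedged: a hand-written proc routine usually pairs the
+ * typed routines with hg_proc_bytes(); `struct my_blob` is illustrative.)
+ *
+ *     static hg_return_t hg_proc_my_blob(hg_proc_t proc, void *arg)
+ *     {
+ *         struct my_blob *b   = (struct my_blob *)arg;
+ *         hg_return_t     ret = hg_proc_hg_uint64_t(proc, &b->len);
+ *         if (ret != HG_SUCCESS)
+ *             return ret;
+ *         return hg_proc_bytes(proc, b->data, b->len);
+ *     }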
+ * + * \param proc [IN/OUT] abstract processor object + * \param data [IN/OUT] pointer to data + * \param data_size [IN] data size + * + * \return HG_SUCCESS or corresponding HG error code + */ +static HG_INLINE hg_return_t hg_proc_bytes(hg_proc_t proc, void *data, hg_size_t data_size); + +/** + * For convenience map stdint types to hg types + */ +#define hg_proc_int8_t hg_proc_hg_int8_t +#define hg_proc_uint8_t hg_proc_hg_uint8_t +#define hg_proc_int16_t hg_proc_hg_int16_t +#define hg_proc_uint16_t hg_proc_hg_uint16_t +#define hg_proc_int32_t hg_proc_hg_int32_t +#define hg_proc_uint32_t hg_proc_hg_uint32_t +#define hg_proc_int64_t hg_proc_hg_int64_t +#define hg_proc_uint64_t hg_proc_hg_uint64_t + +/* Map mercury common types */ +#define hg_proc_hg_bool_t hg_proc_hg_uint8_t +#define hg_proc_hg_ptr_t hg_proc_hg_uint64_t +#define hg_proc_hg_size_t hg_proc_hg_uint64_t +#define hg_proc_hg_id_t hg_proc_hg_uint32_t + +/* Map hg_proc_raw/hg_proc_memcpy to hg_proc_bytes */ +#define hg_proc_memcpy hg_proc_raw +#define hg_proc_raw hg_proc_bytes + +/* Update checksum */ +#ifdef HG_HAS_CHECKSUMS +HG_PUBLIC void hg_proc_checksum_update(hg_proc_t proc, void *data, hg_size_t data_size); +#endif + +/************************************/ +/* Local Type and Struct Definition */ +/************************************/ + +/* HG proc buf */ +struct hg_proc_buf { + void * buf; /* Pointer to allocated buffer */ + void * buf_ptr; /* Pointer to current position */ + hg_size_t size; /* Total buffer size */ + hg_size_t size_left; /* Available size for user */ + hg_bool_t is_mine; +#ifdef HG_HAS_XDR + XDR xdr; +#endif +}; + +/* HG proc */ +struct hg_proc { + struct hg_proc_buf proc_buf; + struct hg_proc_buf extra_buf; + hg_class_t * hg_class; /* HG class */ + struct hg_proc_buf *current_buf; +#ifdef HG_HAS_CHECKSUMS + void * checksum; /* Checksum */ + void * checksum_hash; /* Base checksum buf */ + size_t checksum_size; /* Checksum size */ +#endif + hg_proc_op_t op; + hg_uint8_t flags; +}; + +/*---------------------------------------------------------------------------*/ +static HG_INLINE hg_class_t * + hg_proc_get_class(hg_proc_t proc) +{ + return ((struct hg_proc *)proc)->hg_class; +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE hg_proc_op_t +hg_proc_get_op(hg_proc_t proc) +{ + return ((struct hg_proc *)proc)->op; +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE void +hg_proc_set_flags(hg_proc_t proc, hg_uint8_t flags) +{ + ((struct hg_proc *)proc)->flags = flags; +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE hg_uint8_t +hg_proc_get_flags(hg_proc_t proc) +{ + return ((struct hg_proc *)proc)->flags; +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE hg_size_t +hg_proc_get_size(hg_proc_t proc) +{ + return ((struct hg_proc *)proc)->proc_buf.size + ((struct hg_proc *)proc)->extra_buf.size; +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE hg_size_t +hg_proc_get_size_used(hg_proc_t proc) +{ + return ((struct hg_proc *)proc)->current_buf->size - ((struct hg_proc *)proc)->current_buf->size_left; +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE hg_size_t +hg_proc_get_size_left(hg_proc_t proc) +{ + return ((struct hg_proc *)proc)->current_buf->size_left; +} + 
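+/* (Editorial aside, hedged: decoding mirrors encoding. Reset the proc over
+ * the received buffer with HG_DECODE, run the same proc routines, then run
+ * them once more under HG_FREE to release anything the decode allocated.
+ * `recv_buf` and `recv_size` are illustrative.)
+ *
+ *     hg_proc_reset(proc, recv_buf, recv_size, HG_DECODE);
+ *     hg_proc_hg_uint64_t(proc, &my_value);
+ *     hg_proc_flush(proc);
+ */
+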
+/*---------------------------------------------------------------------------*/ +#ifdef HG_HAS_XDR +static HG_INLINE XDR * + hg_proc_get_xdr_ptr(hg_proc_t proc) +{ + return &((struct hg_proc *)proc)->current_buf->xdr; +} +#endif + +/*---------------------------------------------------------------------------*/ +static HG_INLINE void * +hg_proc_get_extra_buf(hg_proc_t proc) +{ + return ((struct hg_proc *)proc)->extra_buf.buf; +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE hg_size_t +hg_proc_get_extra_size(hg_proc_t proc) +{ + return ((struct hg_proc *)proc)->extra_buf.size; +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE hg_return_t +hg_proc_hg_int8_t(hg_proc_t proc, void *data) +{ + hg_return_t ret = HG_SUCCESS; + + HG_PROC_TYPE(proc, hg_int8_t, data, done, ret); + +done: + return ret; +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE hg_return_t +hg_proc_hg_uint8_t(hg_proc_t proc, void *data) +{ + hg_return_t ret = HG_SUCCESS; + + HG_PROC_TYPE(proc, hg_uint8_t, data, done, ret); + +done: + return ret; +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE hg_return_t +hg_proc_hg_int16_t(hg_proc_t proc, void *data) +{ + hg_return_t ret = HG_SUCCESS; + + HG_PROC_TYPE(proc, hg_int16_t, data, done, ret); + +done: + return ret; +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE hg_return_t +hg_proc_hg_uint16_t(hg_proc_t proc, void *data) +{ + hg_return_t ret = HG_SUCCESS; + + HG_PROC_TYPE(proc, hg_uint16_t, data, done, ret); + +done: + return ret; +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE hg_return_t +hg_proc_hg_int32_t(hg_proc_t proc, void *data) +{ + hg_return_t ret = HG_SUCCESS; + + HG_PROC_TYPE(proc, hg_int32_t, data, done, ret); + +done: + return ret; +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE hg_return_t +hg_proc_hg_uint32_t(hg_proc_t proc, void *data) +{ + hg_return_t ret = HG_SUCCESS; + + HG_PROC_TYPE(proc, hg_uint32_t, data, done, ret); + +done: + return ret; +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE hg_return_t +hg_proc_hg_int64_t(hg_proc_t proc, void *data) +{ + hg_return_t ret = HG_SUCCESS; + + HG_PROC_TYPE(proc, hg_int64_t, data, done, ret); + +done: + return ret; +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE hg_return_t +hg_proc_hg_uint64_t(hg_proc_t proc, void *data) +{ + hg_return_t ret = HG_SUCCESS; + + HG_PROC_TYPE(proc, hg_uint64_t, data, done, ret); + +done: + return ret; +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE hg_return_t +hg_proc_bytes(hg_proc_t proc, void *data, hg_size_t data_size) +{ + hg_return_t ret = HG_SUCCESS; + + HG_PROC_BYTES(proc, data, data_size, done, ret); + +done: + return ret; +} + +#ifdef __cplusplus +} +#endif + +#endif /* MERCURY_PROC_H */ diff --git a/src/mercury/include/mercury_proc_bulk.h b/src/mercury/include/mercury_proc_bulk.h new file mode 100644 index 00000000000..f89face171b --- /dev/null +++ b/src/mercury/include/mercury_proc_bulk.h @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy, + * UChicago Argonne, LLC and The HDF Group. 
+ * All rights reserved. + * + * The full copyright notice, including terms governing use, modification, + * and redistribution, is contained in the COPYING file that can be + * found at the root of the source code distribution tree. + */ + +#ifndef MERCURY_PROC_BULK_H +#define MERCURY_PROC_BULK_H + +#include "mercury_proc.h" + +/*************************************/ +/* Public Type and Struct Definition */ +/*************************************/ + +/*****************/ +/* Public Macros */ +/*****************/ + +/*********************/ +/* Public Prototypes */ +/*********************/ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Generic processing routine. + * + * \param proc [IN/OUT] abstract processor object + * \param handle [IN/OUT] pointer to bulk handle + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t hg_proc_hg_bulk_t(hg_proc_t proc, void *data); + +#ifdef __cplusplus +} +#endif + +#endif /* MERCURY_PROC_BULK_H */ diff --git a/src/mercury/include/mercury_proc_string.h b/src/mercury/include/mercury_proc_string.h new file mode 100644 index 00000000000..764eb20167e --- /dev/null +++ b/src/mercury/include/mercury_proc_string.h @@ -0,0 +1,148 @@ +/* + * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy, + * UChicago Argonne, LLC and The HDF Group. + * All rights reserved. + * + * The full copyright notice, including terms governing use, modification, + * and redistribution, is contained in the COPYING file that can be + * found at the root of the source code distribution tree. + */ + +#ifndef MERCURY_PROC_STRING_H +#define MERCURY_PROC_STRING_H + +#include "mercury_proc.h" +#include "mercury_string_object.h" + +/*************************************/ +/* Public Type and Struct Definition */ +/*************************************/ + +typedef const char *hg_const_string_t; +typedef char * hg_string_t; + +/*****************/ +/* Public Macros */ +/*****************/ + +/*********************/ +/* Public Prototypes */ +/*********************/ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Generic processing routine. + * + * \param proc [IN/OUT] abstract processor object + * \param data [IN/OUT] pointer to data + * + * \return HG_SUCCESS or corresponding HG error code + */ +static HG_INLINE hg_return_t hg_proc_hg_const_string_t(hg_proc_t proc, void *data); + +/** + * Generic processing routine. + * + * \param proc [IN/OUT] abstract processor object + * \param data [IN/OUT] pointer to data + * + * \return HG_SUCCESS or corresponding HG error code + */ +static HG_INLINE hg_return_t hg_proc_hg_string_t(hg_proc_t proc, void *data); + +/** + * Generic processing routine. 
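+ *
+ * (Editorial note, hedged: string fields are processed like any other typed
+ * field inside a proc routine; the path below is illustrative.)
+ *
+ *     hg_const_string_t path = "/tmp/input";
+ *     ret = hg_proc_hg_const_string_t(proc, &path);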
+ * + * \param proc [IN/OUT] abstract processor object + * \param string [IN/OUT] pointer to string + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t hg_proc_hg_string_object_t(hg_proc_t proc, void *string); + +/************************************/ +/* Local Type and Struct Definition */ +/************************************/ + +/*---------------------------------------------------------------------------*/ +static HG_INLINE hg_return_t +hg_proc_hg_const_string_t(hg_proc_t proc, void *data) +{ + hg_string_object_t string; + hg_const_string_t *strdata = (hg_const_string_t *)data; + hg_return_t ret = HG_SUCCESS; + + switch (hg_proc_get_op(proc)) { + case HG_ENCODE: + hg_string_object_init_const_char(&string, *strdata, 0); + ret = hg_proc_hg_string_object_t(proc, &string); + if (ret != HG_SUCCESS) + goto done; + hg_string_object_free(&string); + break; + case HG_DECODE: + ret = hg_proc_hg_string_object_t(proc, &string); + if (ret != HG_SUCCESS) + goto done; + *strdata = hg_string_object_swap(&string, 0); + hg_string_object_free(&string); + break; + case HG_FREE: + hg_string_object_init_const_char(&string, *strdata, 1); + ret = hg_proc_hg_string_object_t(proc, &string); + if (ret != HG_SUCCESS) + goto done; + break; + default: + break; + } + +done: + return ret; +} + +/*---------------------------------------------------------------------------*/ +static HG_INLINE hg_return_t +hg_proc_hg_string_t(hg_proc_t proc, void *data) +{ + hg_string_object_t string; + hg_string_t * strdata = (hg_string_t *)data; + hg_return_t ret = HG_SUCCESS; + + switch (hg_proc_get_op(proc)) { + case HG_ENCODE: + hg_string_object_init_char(&string, *strdata, 0); + ret = hg_proc_hg_string_object_t(proc, &string); + if (ret != HG_SUCCESS) + goto done; + hg_string_object_free(&string); + break; + case HG_DECODE: + ret = hg_proc_hg_string_object_t(proc, &string); + if (ret != HG_SUCCESS) + goto done; + *strdata = hg_string_object_swap(&string, 0); + hg_string_object_free(&string); + break; + case HG_FREE: + hg_string_object_init_char(&string, *strdata, 1); + ret = hg_proc_hg_string_object_t(proc, &string); + if (ret != HG_SUCCESS) + goto done; + break; + default: + break; + } + +done: + return ret; +} + +#ifdef __cplusplus +} +#endif + +#endif /* MERCURY_PROC_STRING_H */ diff --git a/src/mercury/include/mercury_queue.h b/src/mercury/include/mercury_queue.h new file mode 100644 index 00000000000..116a209beaa --- /dev/null +++ b/src/mercury/include/mercury_queue.h @@ -0,0 +1,119 @@ +/* + * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy, + * UChicago Argonne, LLC and The HDF Group. + * All rights reserved. + * + * The full copyright notice, including terms governing use, modification, + * and redistribution, is contained in the COPYING file that can be + * found at the root of the source code distribution tree. + */ + +/* Code below is derived from sys/queue.h which follows the below notice: + * + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)queue.h 8.5 (Berkeley) 8/20/94 + */ + +#ifndef MERCURY_QUEUE_H +#define MERCURY_QUEUE_H + +#define HG_QUEUE_HEAD_INITIALIZER(name) \ + { \ + NULL, &(name).head \ + } + +#define HG_QUEUE_HEAD_INIT(struct_head_name, var_name) \ + struct struct_head_name var_name = HG_QUEUE_HEAD_INITIALIZER(var_name) + +#define HG_QUEUE_HEAD_DECL(struct_head_name, struct_entry_name) \ + struct struct_head_name { \ + struct struct_entry_name * head; \ + struct struct_entry_name **tail; \ + } + +#define HG_QUEUE_HEAD(struct_entry_name) \ + struct { \ + struct struct_entry_name * head; \ + struct struct_entry_name **tail; \ + } + +#define HG_QUEUE_ENTRY(struct_entry_name) \ + struct { \ + struct struct_entry_name *next; \ + } + +#define HG_QUEUE_INIT(head_ptr) \ + do { \ + (head_ptr)->head = NULL; \ + (head_ptr)->tail = &(head_ptr)->head; \ + } while (/*CONSTCOND*/ 0) + +#define HG_QUEUE_IS_EMPTY(head_ptr) ((head_ptr)->head == NULL) + +#define HG_QUEUE_FIRST(head_ptr) ((head_ptr)->head) + +#define HG_QUEUE_NEXT(entry_ptr, entry_field_name) ((entry_ptr)->entry_field_name.next) + +#define HG_QUEUE_PUSH_TAIL(head_ptr, entry_ptr, entry_field_name) \ + do { \ + (entry_ptr)->entry_field_name.next = NULL; \ + *(head_ptr)->tail = (entry_ptr); \ + (head_ptr)->tail = &(entry_ptr)->entry_field_name.next; \ + } while (/*CONSTCOND*/ 0) + +/* TODO would be nice to not have any condition */ +#define HG_QUEUE_POP_HEAD(head_ptr, entry_field_name) \ + do { \ + if ((head_ptr)->head && ((head_ptr)->head = (head_ptr)->head->entry_field_name.next) == NULL) \ + (head_ptr)->tail = &(head_ptr)->head; \ + } while (/*CONSTCOND*/ 0) + +#define HG_QUEUE_FOREACH(var, head_ptr, entry_field_name) \ + for ((var) = ((head_ptr)->head); (var); (var) = ((var)->entry_field_name.next)) + +/** + * Avoid using those for performance reasons or use mercury_list.h instead + */ + +#define HG_QUEUE_REMOVE(head_ptr, entry_ptr, type, entry_field_name) \ + do { \ + if ((head_ptr)->head == (entry_ptr)) { \ + HG_QUEUE_POP_HEAD((head_ptr), entry_field_name); \ + } \ + else { \ + struct type *curelm = (head_ptr)->head; \ + while (curelm->entry_field_name.next != (entry_ptr)) \ + curelm = curelm->entry_field_name.next; \ + if ((curelm->entry_field_name.next = curelm->entry_field_name.next->entry_field_name.next) == \ + NULL) \ + (head_ptr)->tail = &(curelm)->entry_field_name.next; 
\ + } \ + } while (/*CONSTCOND*/ 0) + +#endif /* MERCURY_QUEUE_H */ diff --git a/src/mercury/include/mercury_request.h b/src/mercury/include/mercury_request.h new file mode 100644 index 00000000000..4d7fdf8c551 --- /dev/null +++ b/src/mercury/include/mercury_request.h @@ -0,0 +1,204 @@ +/* + * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy, + * UChicago Argonne, LLC and The HDF Group. + * All rights reserved. + * + * The full copyright notice, including terms governing use, modification, + * and redistribution, is contained in the COPYING file that can be + * found at the root of the source code distribution tree. + */ + +#ifndef MERCURY_REQUEST_H +#define MERCURY_REQUEST_H + +#include "mercury_util_config.h" + +#include "mercury_atomic.h" + +/** + * Purpose: define a request emulation library on top of the callback model + * that uses progress/trigger functions. Note that this library can not be + * safely used within RPCs in most cases - calling hg_request_wait causes + * deadlock when the caller function was triggered by HG_Trigger + * (or HG_Bulk_trigger). + */ + +typedef struct hg_request_class hg_request_class_t; /* Opaque request class */ +typedef struct hg_request hg_request_t; /* Opaque request object */ + +struct hg_request { + hg_request_class_t *request_class; + void * data; + hg_atomic_int32_t completed; +}; + +/** + * Progress callback, arg can be used to pass extra parameters required by + * underlying API. + * + * \param timeout [IN] timeout (in milliseconds) + * \param arg [IN] pointer to data passed to callback + * + * \return HG_UTIL_SUCCESS if any completion has occurred / error code otherwise + */ +typedef int (*hg_request_progress_func_t)(unsigned int timeout, void *arg); + +/** + * Trigger callback, arg can be used to pass extra parameters required by + * underlying API. + * + * \param timeout [IN] timeout (in milliseconds) + * \param flag [OUT] 1 if callback has been triggered, 0 otherwise + * \param arg [IN] pointer to data passed to callback + * + * \return HG_UTIL_SUCCESS or corresponding error code + */ +typedef int (*hg_request_trigger_func_t)(unsigned int timeout, unsigned int *flag, void *arg); + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Initialize the request class with the specific progress/trigger functions + * that will be called on hg_request_wait(). + * arg can be used to pass extra parameters required by underlying API. + * + * \param progress [IN] progress function + * \param trigger [IN] trigger function + * \param arg [IN] pointer to data passed to callback + * + * \return Pointer to request class or NULL in case of failure + */ +HG_UTIL_PUBLIC hg_request_class_t *hg_request_init(hg_request_progress_func_t progress, + hg_request_trigger_func_t trigger, void *arg); + +/** + * Finalize the request class. User args that were passed through + * hg_request_init() can be retrieved through the \a arg parameter. + * + * \param request_class [IN] pointer to request class + * \param arg [IN/OUT] pointer to init args + */ +HG_UTIL_PUBLIC void hg_request_finalize(hg_request_class_t *request_class, void **arg); + +/** + * Create a new request from a specified request class. The progress function + * explicitly makes progress and may insert the completed operation into a + * completion queue. The operation gets triggered after a call to the trigger + * function. 
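+ *
+ * (Editorial sketch, hedged: the usual lifecycle; my_progress, my_trigger
+ * and ctx are user-supplied and illustrative.)
+ *
+ *     hg_request_class_t *rc  = hg_request_init(my_progress, my_trigger, ctx);
+ *     hg_request_t       *req = hg_request_create(rc);
+ *     unsigned int       done = 0;
+ *     hg_request_wait(req, 1000, &done);             /* wait up to 1 s */
+ *     hg_request_destroy(req);
+ *     hg_request_finalize(rc, NULL);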
+ * + * \param request_class [IN] pointer to request class + * + * \return Pointer to request or NULL in case of failure + */ +HG_UTIL_PUBLIC hg_request_t *hg_request_create(hg_request_class_t *request_class); + +/** + * Destroy the request, freeing the resources. + * + * \param request [IN/OUT] pointer to request + */ +HG_UTIL_PUBLIC void hg_request_destroy(hg_request_t *request); + +/** + * Reset an existing request so that it can be safely re-used. + * + * \param request [IN/OUT] pointer to request + */ +static HG_UTIL_INLINE void hg_request_reset(hg_request_t *request); + +/** + * Mark the request as completed. (most likely called by a callback triggered + * after a call to trigger) + * + * \param request [IN/OUT] pointer to request + */ +static HG_UTIL_INLINE void hg_request_complete(hg_request_t *request); + +/** + * Wait timeout ms for the specified request to complete. + * + * \param request [IN/OUT] pointer to request + * \param timeout [IN] timeout (in milliseconds) + * \param flag [OUT] 1 if request has completed, 0 otherwise + * + * \return Non-negative on success or negative on failure + */ +HG_UTIL_PUBLIC int hg_request_wait(hg_request_t *request, unsigned int timeout, unsigned int *flag); + +/** + * Wait timeout ms for all the specified request to complete. + * + * \param count [IN] number of requests + * \param request [IN/OUT] arrays of requests + * \param timeout [IN] timeout (in milliseconds) + * \param flag [OUT] 1 if all requests have completed, 0 otherwise + * + * \return Non-negative on success or negative on failure + */ +static HG_UTIL_INLINE int hg_request_waitall(int count, hg_request_t *request[], unsigned int timeout, + unsigned int *flag); + +/** + * Attach user data to a specified request. + * + * \param request [IN/OUT] pointer to request + * \param data [IN] pointer to data + */ +static HG_UTIL_INLINE void hg_request_set_data(hg_request_t *request, void *data); + +/** + * Get user data from a specified request. 
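/*
 * Usage sketch (illustrative, not part of the patch): waiting on a request
 * with caller-supplied callbacks. `my_progress` and `my_trigger` are
 * hypothetical stand-ins for functions that would drive the underlying
 * callback API (e.g. wrappers around its progress/trigger entry points).
 */
static int
my_progress(unsigned int timeout, void *arg)
{
    (void)timeout;
    (void)arg;
    /* ... advance the underlying API for up to `timeout` ms ... */
    return HG_UTIL_SUCCESS;
}

static int
my_trigger(unsigned int timeout, unsigned int *flag, void *arg)
{
    (void)timeout;
    (void)arg;
    /* ... run one completed callback; report whether one fired ... */
    *flag = 1;
    return HG_UTIL_SUCCESS;
}

static void
request_demo(void)
{
    hg_request_class_t *rc   = hg_request_init(my_progress, my_trigger, NULL);
    hg_request_t       *req  = hg_request_create(rc);
    unsigned int        done = 0;

    /* ... post an operation whose completion callback calls
     *     hg_request_complete(req) ... */

    hg_request_wait(req, 1000 /* ms */, &done);

    hg_request_destroy(req);
    hg_request_finalize(rc, NULL);
}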
+ * + * \param request [IN/OUT] pointer to request + * + * \return Pointer to data or NULL if nothing was attached by user + */ +static HG_UTIL_INLINE void *hg_request_get_data(hg_request_t *request); + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE void +hg_request_reset(hg_request_t *request) +{ + hg_atomic_set32(&request->completed, HG_UTIL_FALSE); +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE void +hg_request_complete(hg_request_t *request) +{ + hg_atomic_set32(&request->completed, HG_UTIL_TRUE); +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE int +hg_request_waitall(int count, hg_request_t *request[], unsigned int timeout, unsigned int *flag) +{ + int i; + + for (i = 0; i < count; i++) + hg_request_wait(request[i], timeout, flag); + + return HG_UTIL_SUCCESS; +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE void +hg_request_set_data(hg_request_t *request, void *data) +{ + request->data = data; +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE void * +hg_request_get_data(hg_request_t *request) +{ + return request->data; +} + +#ifdef __cplusplus +} +#endif + +#endif /* MERCURY_REQUEST_H */ diff --git a/src/mercury/include/mercury_string_object.h b/src/mercury/include/mercury_string_object.h new file mode 100644 index 00000000000..5a7492a33ee --- /dev/null +++ b/src/mercury/include/mercury_string_object.h @@ -0,0 +1,102 @@ +/* + * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy, + * UChicago Argonne, LLC and The HDF Group. + * All rights reserved. + * + * The full copyright notice, including terms governing use, modification, + * and redistribution, is contained in the COPYING file that can be + * found at the root of the source code distribution tree. + */ + +#ifndef MERCURY_STRING_OBJECT_H +#define MERCURY_STRING_OBJECT_H + +#include "mercury_types.h" + +/*************************************/ +/* Public Type and Struct Definition */ +/*************************************/ + +typedef struct hg_string_object { + char * data; + hg_bool_t is_const; + hg_bool_t is_owned; +} hg_string_object_t; + +/*****************/ +/* Public Macros */ +/*****************/ + +/*********************/ +/* Public Prototypes */ +/*********************/ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Initialize a string object. + * + * \param string [OUT] pointer to string structure + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t hg_string_object_init(hg_string_object_t *string); + +/** + * Initialize a string object from the string pointed to by s. + * + * \param string [OUT] pointer to string structure + * \param s [IN] pointer to string + * \param is_owned [IN] boolean + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t hg_string_object_init_char(hg_string_object_t *string, char *s, hg_bool_t is_owned); + +/** + * Initialize a string object from the const string pointed to by s. 
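/*
 * Usage sketch (illustrative, not part of the patch): handing ownership of a
 * heap string to a string object so hg_string_object_free() releases it.
 */
#include <string.h>

static void
string_demo(void)
{
    hg_string_object_t s;
    char              *owned = strdup("subfile.0001");

    if (owned == NULL)
        return;

    /* is_owned != 0: the object now owns `owned` and frees it below */
    hg_string_object_init_char(&s, owned, (hg_bool_t)1);

    /* ... use s.data ... */

    hg_string_object_free(&s);
}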
+ * + * \param string [OUT] pointer to string structure + * \param s [IN] pointer to string + * \param is_owned [IN] boolean + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t hg_string_object_init_const_char(hg_string_object_t *string, const char *s, + hg_bool_t is_owned); + +/** + * Free a string object. + * + * \param string [IN/OUT] pointer to string structure + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t hg_string_object_free(hg_string_object_t *string); + +/** + * Duplicate a string object. + * + * \param string [IN] pointer to string structure + * \param new_string [OUT] pointer to string structure + * + * \return HG_SUCCESS or corresponding HG error code + */ +HG_PUBLIC hg_return_t hg_string_object_dup(hg_string_object_t string, hg_string_object_t *new_string); + +/** + * Exchange the content of the string structure by the content of s. + * + * \param string [IN/OUT] pointer to string structure + * + * \return Pointer to string contained by string before the swap + */ +HG_PUBLIC char *hg_string_object_swap(hg_string_object_t *string, char *s); + +#ifdef __cplusplus +} +#endif + +#endif /* MERCURY_STRING_OBJECT_H */ diff --git a/src/mercury/include/mercury_thread.h b/src/mercury/include/mercury_thread.h new file mode 100644 index 00000000000..3317c41c287 --- /dev/null +++ b/src/mercury/include/mercury_thread.h @@ -0,0 +1,228 @@ +/* + * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy, + * UChicago Argonne, LLC and The HDF Group. + * All rights reserved. + * + * The full copyright notice, including terms governing use, modification, + * and redistribution, is contained in the COPYING file that can be + * found at the root of the source code distribution tree. + */ + +#ifndef MERCURY_THREAD_H +#define MERCURY_THREAD_H + +#if !defined(_WIN32) && !defined(_GNU_SOURCE) +#define _GNU_SOURCE +#endif +#include "mercury_util_config.h" + +#ifdef _WIN32 +#include +typedef HANDLE hg_thread_t; +typedef LPTHREAD_START_ROUTINE hg_thread_func_t; +typedef DWORD hg_thread_ret_t; +#define HG_THREAD_RETURN_TYPE hg_thread_ret_t WINAPI +typedef DWORD hg_thread_key_t; +typedef DWORD_PTR hg_cpu_set_t; +#else +#include +typedef pthread_t hg_thread_t; +typedef void *(*hg_thread_func_t)(void *); +typedef void * hg_thread_ret_t; +#define HG_THREAD_RETURN_TYPE hg_thread_ret_t +typedef pthread_key_t hg_thread_key_t; +#ifdef __APPLE__ +/* Size definition for CPU sets. */ +#define HG_CPU_SETSIZE 1024 +#define HG_NCPUBITS (8 * sizeof(hg_cpu_mask_t)) +/* Type for array elements in 'cpu_set_t'. */ +typedef hg_util_uint64_t hg_cpu_mask_t; +typedef struct { + hg_cpu_mask_t bits[HG_CPU_SETSIZE / HG_NCPUBITS]; +} hg_cpu_set_t; +#else +typedef cpu_set_t hg_cpu_set_t; +#endif +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Initialize the thread. + * + * \param thread [IN/OUT] pointer to thread object + */ +HG_UTIL_PUBLIC void hg_thread_init(hg_thread_t *thread); + +/** + * Create a new thread for the given function. + * + * \param thread [IN/OUT] pointer to thread object + * \param f [IN] pointer to function + * \param data [IN] pointer to data than be passed to function f + * + * \return Non-negative on success or negative on failure + */ +HG_UTIL_PUBLIC int hg_thread_create(hg_thread_t *thread, hg_thread_func_t f, void *data); + +/** + * Ends the calling thread. 
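/*
 * Usage sketch (illustrative, not part of the patch): creating and joining a
 * thread with the portable wrappers; `worker` is a hypothetical thread body.
 */
static HG_THREAD_RETURN_TYPE
worker(void *arg)
{
    (void)arg;
    /* ... thread body ... */
    return (hg_thread_ret_t)0;
}

static void
thread_demo(void)
{
    hg_thread_t t;

    hg_thread_init(&t);
    if (hg_thread_create(&t, worker, NULL) < 0)
        return;            /* creation failed */
    hg_thread_join(t);     /* block until `worker` returns */
}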
+ * + * \param ret [IN] exit code for the thread + * + * \return Non-negative on success or negative on failure + */ +HG_UTIL_PUBLIC void hg_thread_exit(hg_thread_ret_t ret); + +/** + * Wait for thread completion. + * + * \param thread [IN] thread object + * + * \return Non-negative on success or negative on failure + */ +HG_UTIL_PUBLIC int hg_thread_join(hg_thread_t thread); + +/** + * Terminate the thread. + * + * \param thread [IN] thread object + * + * \return Non-negative on success or negative on failure + */ +HG_UTIL_PUBLIC int hg_thread_cancel(hg_thread_t thread); + +/** + * Yield the processor. + * + * \return Non-negative on success or negative on failure + */ +HG_UTIL_PUBLIC int hg_thread_yield(void); + +/** + * Obtain handle of the calling thread. + * + * \return + */ +static HG_UTIL_INLINE hg_thread_t hg_thread_self(void); + +/** + * Compare thread IDs. + * + * \return Non-zero if equal, zero if not equal + */ +static HG_UTIL_INLINE int hg_thread_equal(hg_thread_t t1, hg_thread_t t2); + +/** + * Create a thread-specific data key visible to all threads in the process. + * + * \param key [OUT] pointer to thread key object + * + * \return Non-negative on success or negative on failure + */ +HG_UTIL_PUBLIC int hg_thread_key_create(hg_thread_key_t *key); + +/** + * Delete a thread-specific data key previously returned by + * hg_thread_key_create(). + * + * \param key [IN] thread key object + * + * \return Non-negative on success or negative on failure + */ +HG_UTIL_PUBLIC int hg_thread_key_delete(hg_thread_key_t key); + +/** + * Get value from specified key. + * + * \param key [IN] thread key object + * + * \return Pointer to data associated to the key + */ +static HG_UTIL_INLINE void *hg_thread_getspecific(hg_thread_key_t key); + +/** + * Set value to specified key. + * + * \param key [IN] thread key object + * \param value [IN] pointer to data that will be associated + * + * \return Non-negative on success or negative on failure + */ +static HG_UTIL_INLINE int hg_thread_setspecific(hg_thread_key_t key, const void *value); + +/** + * Get affinity mask. + * + * \param thread [IN] thread object + * \param cpu_mask [IN/OUT] cpu mask + * + * \return Non-negative on success or negative on failure + */ +HG_UTIL_PUBLIC int hg_thread_getaffinity(hg_thread_t thread, hg_cpu_set_t *cpu_mask); + +/** + * Set affinity mask. 
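/*
 * Usage sketch (illustrative, not part of the patch): per-thread data through
 * the key API; each thread sees only the value it stored itself.
 */
static hg_thread_key_t my_key;

static void
tls_demo(void)
{
    static int value = 7;
    int       *p;

    hg_thread_key_create(&my_key);
    hg_thread_setspecific(my_key, &value); /* visible to this thread only */

    p = (int *)hg_thread_getspecific(my_key);
    (void)p;                               /* p == &value on this thread */

    hg_thread_key_delete(my_key);
}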
+ * + * \param thread [IN] thread object + * \param cpu_mask [IN] cpu mask + * + * \return Non-negative on success or negative on failure + */ +HG_UTIL_PUBLIC int hg_thread_setaffinity(hg_thread_t thread, const hg_cpu_set_t *cpu_mask); + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE hg_thread_t +hg_thread_self(void) +{ +#ifdef _WIN32 + return GetCurrentThread(); +#else + return pthread_self(); +#endif +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE int +hg_thread_equal(hg_thread_t t1, hg_thread_t t2) +{ +#ifdef _WIN32 + return GetThreadId(t1) == GetThreadId(t2); +#else + return pthread_equal(t1, t2); +#endif +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE void * +hg_thread_getspecific(hg_thread_key_t key) +{ +#ifdef _WIN32 + return TlsGetValue(key); +#else + return pthread_getspecific(key); +#endif +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE int +hg_thread_setspecific(hg_thread_key_t key, const void *value) +{ +#ifdef _WIN32 + if (!TlsSetValue(key, (LPVOID)value)) + return HG_UTIL_FAIL; +#else + if (pthread_setspecific(key, value)) + return HG_UTIL_FAIL; +#endif + + return HG_UTIL_SUCCESS; +} + +#ifdef __cplusplus +} +#endif + +#endif /* MERCURY_THREAD_H */ diff --git a/src/mercury/include/mercury_thread_annotation.h b/src/mercury/include/mercury_thread_annotation.h new file mode 100644 index 00000000000..f8613a4d72b --- /dev/null +++ b/src/mercury/include/mercury_thread_annotation.h @@ -0,0 +1,39 @@ +/* + * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy, + * UChicago Argonne, LLC and The HDF Group. + * All rights reserved. + * + * The full copyright notice, including terms governing use, modification, + * and redistribution, is contained in the COPYING file that can be + * found at the root of the source code distribution tree. + */ + +#ifndef MERCURY_THREAD_ANNOTATION_H +#define MERCURY_THREAD_ANNOTATION_H + +/* Enable thread safety attributes only with clang. + * The attributes can be safely erased when compiling with other compilers. */ +#if defined(__clang__) && (__clang_major__ > 3) +#define HG_THREAD_ANNOTATION_ATTRIBUTE__(x) __attribute__((x)) +#else +#define HG_THREAD_ANNOTATION_ATTRIBUTE__(x) // no-op +#endif + +#define HG_LOCK_CAPABILITY(x) HG_THREAD_ANNOTATION_ATTRIBUTE__(capability(x)) + +#define HG_LOCK_ACQUIRE(...) HG_THREAD_ANNOTATION_ATTRIBUTE__(acquire_capability(__VA_ARGS__)) + +#define HG_LOCK_ACQUIRE_SHARED(...) HG_THREAD_ANNOTATION_ATTRIBUTE__(acquire_shared_capability(__VA_ARGS__)) + +#define HG_LOCK_RELEASE(...) HG_THREAD_ANNOTATION_ATTRIBUTE__(release_capability(__VA_ARGS__)) + +#define HG_LOCK_RELEASE_SHARED(...) HG_THREAD_ANNOTATION_ATTRIBUTE__(release_shared_capability(__VA_ARGS__)) + +#define HG_LOCK_TRY_ACQUIRE(...) HG_THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_capability(__VA_ARGS__)) + +#define HG_LOCK_TRY_ACQUIRE_SHARED(...) 
\ + HG_THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_shared_capability(__VA_ARGS__)) + +#define HG_LOCK_NO_THREAD_SAFETY_ANALYSIS HG_THREAD_ANNOTATION_ATTRIBUTE__(no_thread_safety_analysis) + +#endif /* MERCURY_THREAD_ANNOTATION_H */ diff --git a/src/mercury/include/mercury_thread_condition.h b/src/mercury/include/mercury_thread_condition.h new file mode 100644 index 00000000000..c1a3d61dc0b --- /dev/null +++ b/src/mercury/include/mercury_thread_condition.h @@ -0,0 +1,176 @@ +/* + * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy, + * UChicago Argonne, LLC and The HDF Group. + * All rights reserved. + * + * The full copyright notice, including terms governing use, modification, + * and redistribution, is contained in the COPYING file that can be + * found at the root of the source code distribution tree. + */ + +#ifndef MERCURY_THREAD_CONDITION_H +#define MERCURY_THREAD_CONDITION_H + +#include "mercury_thread_mutex.h" + +#ifdef _WIN32 +typedef CONDITION_VARIABLE hg_thread_cond_t; +#else +#if defined(HG_UTIL_HAS_PTHREAD_CONDATTR_SETCLOCK) && defined(HG_UTIL_HAS_CLOCK_MONOTONIC_COARSE) +#include +#elif defined(HG_UTIL_HAS_SYSTIME_H) +#include +#endif +#include +typedef pthread_cond_t hg_thread_cond_t; +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Initialize the condition. + * + * \param cond [IN/OUT] pointer to condition object + * + * \return Non-negative on success or negative on failure + */ +HG_UTIL_PUBLIC int hg_thread_cond_init(hg_thread_cond_t *cond); + +/** + * Destroy the condition. + * + * \param cond [IN/OUT] pointer to condition object + * + * \return Non-negative on success or negative on failure + */ +HG_UTIL_PUBLIC int hg_thread_cond_destroy(hg_thread_cond_t *cond); + +/** + * Wake one thread waiting for the condition to change. + * + * \param cond [IN/OUT] pointer to condition object + * + * \return Non-negative on success or negative on failure + */ +static HG_UTIL_INLINE int hg_thread_cond_signal(hg_thread_cond_t *cond); + +/** + * Wake all the threads waiting for the condition to change. + * + * \param cond [IN/OUT] pointer to condition object + * + * \return Non-negative on success or negative on failure + */ +static HG_UTIL_INLINE int hg_thread_cond_broadcast(hg_thread_cond_t *cond); + +/** + * Wait for the condition to change. + * + * \param cond [IN/OUT] pointer to condition object + * \param mutex [IN/OUT] pointer to mutex object + * + * \return Non-negative on success or negative on failure + */ +static HG_UTIL_INLINE int hg_thread_cond_wait(hg_thread_cond_t *cond, hg_thread_mutex_t *mutex); + +/** + * Wait timeout ms for the condition to change. 
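/*
 * Usage sketch (illustrative, not part of the patch): the canonical predicate
 * loop around hg_thread_cond_wait(). `ready` is a hypothetical flag guarded
 * by `lock`; re-checking it protects against spurious wakeups.
 */
static hg_thread_mutex_t lock;
static hg_thread_cond_t  cv;
static int               ready;

static void
consumer(void)
{
    hg_thread_mutex_lock(&lock);
    while (!ready)                   /* predicate loop: wakeups may be spurious */
        hg_thread_cond_wait(&cv, &lock);
    hg_thread_mutex_unlock(&lock);
}

static void
producer(void)
{
    hg_thread_mutex_lock(&lock);
    ready = 1;
    hg_thread_cond_signal(&cv);      /* wake one waiter */
    hg_thread_mutex_unlock(&lock);
}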
+ * + * \param cond [IN/OUT] pointer to condition object + * \param mutex [IN/OUT] pointer to mutex object + * \param timeout [IN] timeout (in milliseconds) + * + * \return Non-negative on success or negative on failure + */ +static HG_UTIL_INLINE int hg_thread_cond_timedwait(hg_thread_cond_t *cond, hg_thread_mutex_t *mutex, + unsigned int timeout); + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE int +hg_thread_cond_signal(hg_thread_cond_t *cond) +{ +#ifdef _WIN32 + WakeConditionVariable(cond); +#else + if (pthread_cond_signal(cond)) + return HG_UTIL_FAIL; +#endif + + return HG_UTIL_SUCCESS; +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE int +hg_thread_cond_broadcast(hg_thread_cond_t *cond) +{ +#ifdef _WIN32 + WakeAllConditionVariable(cond); +#else + if (pthread_cond_broadcast(cond)) + return HG_UTIL_FAIL; +#endif + + return HG_UTIL_SUCCESS; +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE int +hg_thread_cond_wait(hg_thread_cond_t *cond, hg_thread_mutex_t *mutex) +{ +#ifdef _WIN32 + if (!SleepConditionVariableCS(cond, mutex, INFINITE)) + return HG_UTIL_FAIL; +#else + if (pthread_cond_wait(cond, mutex)) + return HG_UTIL_FAIL; +#endif + + return HG_UTIL_SUCCESS; +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE int +hg_thread_cond_timedwait(hg_thread_cond_t *cond, hg_thread_mutex_t *mutex, unsigned int timeout) +{ +#ifdef _WIN32 + if (!SleepConditionVariableCS(cond, mutex, timeout)) + return HG_UTIL_FAIL; +#else +#if defined(HG_UTIL_HAS_PTHREAD_CONDATTR_SETCLOCK) && defined(HG_UTIL_HAS_CLOCK_MONOTONIC_COARSE) + struct timespec now; +#else + struct timeval now; +#endif + struct timespec abs_timeout; + ldiv_t ld; + + /* Need to convert timeout (ms) to absolute time */ +#if defined(HG_UTIL_HAS_PTHREAD_CONDATTR_SETCLOCK) && defined(HG_UTIL_HAS_CLOCK_MONOTONIC_COARSE) + clock_gettime(CLOCK_MONOTONIC_COARSE, &now); + + /* Get sec / nsec */ + ld = ldiv(now.tv_nsec + timeout * 1000000L, 1000000000L); + abs_timeout.tv_nsec = ld.rem; +#elif defined(HG_UTIL_HAS_SYSTIME_H) + gettimeofday(&now, NULL); + + /* Get sec / usec */ + ld = ldiv(now.tv_usec + timeout * 1000L, 1000000L); + abs_timeout.tv_nsec = ld.rem * 1000L; +#endif + abs_timeout.tv_sec = now.tv_sec + ld.quot; + + if (pthread_cond_timedwait(cond, mutex, &abs_timeout)) + return HG_UTIL_FAIL; +#endif + + return HG_UTIL_SUCCESS; +} + +#ifdef __cplusplus +} +#endif + +#endif /* MERCURY_THREAD_CONDITION_H */ diff --git a/src/mercury/include/mercury_thread_mutex.h b/src/mercury/include/mercury_thread_mutex.h new file mode 100644 index 00000000000..b400952c884 --- /dev/null +++ b/src/mercury/include/mercury_thread_mutex.h @@ -0,0 +1,124 @@ +/* + * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy, + * UChicago Argonne, LLC and The HDF Group. + * All rights reserved. + * + * The full copyright notice, including terms governing use, modification, + * and redistribution, is contained in the COPYING file that can be + * found at the root of the source code distribution tree. 
+ */ + +#ifndef MERCURY_THREAD_MUTEX_H +#define MERCURY_THREAD_MUTEX_H + +#include "mercury_util_config.h" + +#include "mercury_thread_annotation.h" + +#ifdef _WIN32 +#include +#define HG_THREAD_MUTEX_INITIALIZER NULL +typedef CRITICAL_SECTION hg_thread_mutex_t; +#else +#include +#define HG_THREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER +typedef pthread_mutex_t HG_LOCK_CAPABILITY("mutex") hg_thread_mutex_t; +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Initialize the mutex. + * + * \param mutex [IN/OUT] pointer to mutex object + * + * \return Non-negative on success or negative on failure + */ +HG_UTIL_PUBLIC int hg_thread_mutex_init(hg_thread_mutex_t *mutex); + +/** + * Initialize the mutex, asking for "fast" mutex. + * + * \param mutex [IN/OUT] pointer to mutex object + * + * \return Non-negative on success or negative on failure + */ +HG_UTIL_PUBLIC int hg_thread_mutex_init_fast(hg_thread_mutex_t *mutex); + +/** + * Destroy the mutex. + * + * \param mutex [IN/OUT] pointer to mutex object + * + * \return Non-negative on success or negative on failure + */ +HG_UTIL_PUBLIC int hg_thread_mutex_destroy(hg_thread_mutex_t *mutex); + +/** + * Lock the mutex. + * + * \param mutex [IN/OUT] pointer to mutex object + */ +static HG_UTIL_INLINE void hg_thread_mutex_lock(hg_thread_mutex_t *mutex) HG_LOCK_ACQUIRE(*mutex); + +/** + * Try locking the mutex. + * + * \param mutex [IN/OUT] pointer to mutex object + * + * \return Non-negative on success or negative on failure + */ +static HG_UTIL_INLINE int hg_thread_mutex_try_lock(hg_thread_mutex_t *mutex) + HG_LOCK_TRY_ACQUIRE(HG_UTIL_SUCCESS, *mutex); + +/** + * Unlock the mutex. + * + * \param mutex [IN/OUT] pointer to mutex object + */ +static HG_UTIL_INLINE void hg_thread_mutex_unlock(hg_thread_mutex_t *mutex) HG_LOCK_RELEASE(*mutex); + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE void +hg_thread_mutex_lock(hg_thread_mutex_t *mutex) HG_LOCK_NO_THREAD_SAFETY_ANALYSIS +{ +#ifdef _WIN32 + EnterCriticalSection(mutex); +#else + (void)pthread_mutex_lock(mutex); +#endif +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE int +hg_thread_mutex_try_lock(hg_thread_mutex_t *mutex) HG_LOCK_NO_THREAD_SAFETY_ANALYSIS +{ +#ifdef _WIN32 + if (!TryEnterCriticalSection(mutex)) + return HG_UTIL_FAIL; +#else + if (pthread_mutex_trylock(mutex)) + return HG_UTIL_FAIL; +#endif + + return HG_UTIL_SUCCESS; +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE void +hg_thread_mutex_unlock(hg_thread_mutex_t *mutex) HG_LOCK_NO_THREAD_SAFETY_ANALYSIS +{ +#ifdef _WIN32 + LeaveCriticalSection(mutex); +#else + (void)pthread_mutex_unlock(mutex); +#endif +} + +#ifdef __cplusplus +} +#endif + +#endif /* MERCURY_THREAD_MUTEX_H */ diff --git a/src/mercury/include/mercury_thread_pool.h b/src/mercury/include/mercury_thread_pool.h new file mode 100644 index 00000000000..db973d13937 --- /dev/null +++ b/src/mercury/include/mercury_thread_pool.h @@ -0,0 +1,118 @@ +/* + * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy, + * UChicago Argonne, LLC and The HDF Group. + * All rights reserved. + * + * The full copyright notice, including terms governing use, modification, + * and redistribution, is contained in the COPYING file that can be + * found at the root of the source code distribution tree. 
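/*
 * Usage sketch (illustrative, not part of the patch): non-blocking
 * acquisition with hg_thread_mutex_try_lock().
 */
static hg_thread_mutex_t m;

static void
try_lock_demo(void)
{
    hg_thread_mutex_init(&m);

    if (hg_thread_mutex_try_lock(&m) == HG_UTIL_SUCCESS) {
        /* ... short critical section ... */
        hg_thread_mutex_unlock(&m);
    }
    else {
        /* contended: do other work instead of blocking */
    }

    hg_thread_mutex_destroy(&m);
}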
+ */ + +#ifndef MERCURY_THREAD_POOL_H +#define MERCURY_THREAD_POOL_H + +#include "mercury_queue.h" +#include "mercury_thread.h" +#include "mercury_thread_condition.h" + +/*************************************/ +/* Public Type and Struct Definition */ +/*************************************/ + +typedef struct hg_thread_pool hg_thread_pool_t; + +struct hg_thread_pool { + unsigned int sleeping_worker_count; + HG_QUEUE_HEAD(hg_thread_work) queue; + int shutdown; + hg_thread_mutex_t mutex; + hg_thread_cond_t cond; +}; + +struct hg_thread_work { + hg_thread_func_t func; + void * args; + HG_QUEUE_ENTRY(hg_thread_work) entry; /* Internal */ +}; + +/*****************/ +/* Public Macros */ +/*****************/ + +/*********************/ +/* Public Prototypes */ +/*********************/ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Initialize the thread pool. + * + * \param thread_count [IN] number of threads that will be created at + * initialization + * \param pool [OUT] pointer to pool object + * + * \return Non-negative on success or negative on failure + */ +HG_UTIL_PUBLIC int hg_thread_pool_init(unsigned int thread_count, hg_thread_pool_t **pool); + +/** + * Destroy the thread pool. + * + * \param pool [IN/OUT] pointer to pool object + * + * \return Non-negative on success or negative on failure + */ +HG_UTIL_PUBLIC int hg_thread_pool_destroy(hg_thread_pool_t *pool); + +/** + * Post work to the pool. Note that the operation may be queued depending on + * the number of threads and number of tasks already running. + * + * \param pool [IN/OUT] pointer to pool object + * \param work [IN] pointer to work struct + * + * \return Non-negative on success or negative on failure + */ +static HG_UTIL_INLINE int hg_thread_pool_post(hg_thread_pool_t *pool, struct hg_thread_work *work); + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE int +hg_thread_pool_post(hg_thread_pool_t *pool, struct hg_thread_work *work) +{ + int ret = HG_UTIL_SUCCESS; + + if (!pool || !work) + return HG_UTIL_FAIL; + + if (!work->func) + return HG_UTIL_FAIL; + + hg_thread_mutex_lock(&pool->mutex); + + /* Are we shutting down ? */ + if (pool->shutdown) { + ret = HG_UTIL_FAIL; + goto unlock; + } + + /* Add task to task queue */ + HG_QUEUE_PUSH_TAIL(&pool->queue, work, entry); + + /* Wake up sleeping worker */ + if (pool->sleeping_worker_count && (hg_thread_cond_signal(&pool->cond) != HG_UTIL_SUCCESS)) + ret = HG_UTIL_FAIL; + +unlock: + hg_thread_mutex_unlock(&pool->mutex); + + return ret; +} + +#ifdef __cplusplus +} +#endif + +#endif /* MERCURY_THREAD_POOL_H */ diff --git a/src/mercury/include/mercury_thread_rwlock.h b/src/mercury/include/mercury_thread_rwlock.h new file mode 100644 index 00000000000..f03d2aa3372 --- /dev/null +++ b/src/mercury/include/mercury_thread_rwlock.h @@ -0,0 +1,215 @@ +/* + * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy, + * UChicago Argonne, LLC and The HDF Group. + * All rights reserved. + * + * The full copyright notice, including terms governing use, modification, + * and redistribution, is contained in the COPYING file that can be + * found at the root of the source code distribution tree. + */ + +/* Copyright (C) 2017 Intel Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted for any purpose (including commercial purposes) + * provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions, and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions, and the following disclaimer in the + * documentation and/or materials provided with the distribution. + * + * 3. In addition, redistributions of modified forms of the source or binary + * code must carry prominent notices stating that the original code was + * changed and the date of the change. + * + * 4. All publications or advertising materials mentioning features or use of + * this software are asked, but not required, to acknowledge that it was + * developed by Intel Corporation and credit the contributors. + * + * 5. Neither the name of Intel Corporation, nor the name of any Contributor + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef MERCURY_THREAD_RWLOCK_H +#define MERCURY_THREAD_RWLOCK_H + +#include "mercury_util_config.h" + +#include "mercury_thread_annotation.h" + +#ifdef _WIN32 +#include +typedef PSRWLOCK hg_thread_rwlock_t; +#else +#include +typedef pthread_rwlock_t HG_LOCK_CAPABILITY("rwlock") hg_thread_rwlock_t; +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Initialize the rwlock. + * + * \param rwlock [IN/OUT] pointer to rwlock object + * + * \return Non-negative on success or negative on failure + */ +HG_UTIL_PUBLIC int hg_thread_rwlock_init(hg_thread_rwlock_t *rwlock); + +/** + * Destroy the rwlock. + * + * \param rwlock [IN/OUT] pointer to rwlock object + * + * \return Non-negative on success or negative on failure + */ +HG_UTIL_PUBLIC int hg_thread_rwlock_destroy(hg_thread_rwlock_t *rwlock); + +/** + * Take a read lock for the rwlock. + * + * \param rwlock [IN/OUT] pointer to rwlock object + */ +static HG_UTIL_INLINE void hg_thread_rwlock_rdlock(hg_thread_rwlock_t *rwlock) + HG_LOCK_ACQUIRE_SHARED(*rwlock); + +/** + * Try to take a read lock for the rwlock. + * + * \param rwlock [IN/OUT] pointer to rwlock object + * + * \return Non-negative on success or negative on failure + */ +static HG_UTIL_INLINE int hg_thread_rwlock_try_rdlock(hg_thread_rwlock_t *rwlock) + HG_LOCK_TRY_ACQUIRE_SHARED(HG_UTIL_SUCCESS, *rwlock); + +/** + * Release the read lock of the rwlock. + * + * \param rwlock [IN/OUT] pointer to rwlock object + */ +static HG_UTIL_INLINE void hg_thread_rwlock_release_rdlock(hg_thread_rwlock_t *rwlock) + HG_LOCK_RELEASE_SHARED(*rwlock); + +/** + * Take a write lock for the rwlock. 
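/*
 * Usage sketch (illustrative, not part of the patch): posting a task to the
 * thread pool declared earlier. The hg_thread_work struct is queued by
 * reference, so it must outlive the task.
 */
static HG_THREAD_RETURN_TYPE
pool_task(void *args)
{
    (void)args;
    /* ... task body, run on one of the pool's workers ... */
    return (hg_thread_ret_t)0;
}

static void
pool_demo(void)
{
    hg_thread_pool_t     *pool = NULL;
    struct hg_thread_work work;

    hg_thread_pool_init(4, &pool);    /* spawn four workers */

    work.func = pool_task;
    work.args = NULL;
    hg_thread_pool_post(pool, &work); /* queued, then picked up by a worker */

    /* ... wait for the task to signal completion before tearing down ... */
    hg_thread_pool_destroy(pool);
}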
+ *
+ * \param rwlock [IN/OUT]    pointer to rwlock object
+ */
+static HG_UTIL_INLINE void hg_thread_rwlock_wrlock(hg_thread_rwlock_t *rwlock) HG_LOCK_ACQUIRE(*rwlock);
+
+/**
+ * Try to take a write lock for the rwlock.
+ *
+ * \param rwlock [IN/OUT]    pointer to rwlock object
+ *
+ * \return Non-negative on success or negative on failure
+ */
+static HG_UTIL_INLINE int hg_thread_rwlock_try_wrlock(hg_thread_rwlock_t *rwlock)
+    HG_LOCK_TRY_ACQUIRE(HG_UTIL_SUCCESS, *rwlock);
+
+/**
+ * Release the write lock of the rwlock.
+ *
+ * \param rwlock [IN/OUT]    pointer to rwlock object
+ */
+static HG_UTIL_INLINE void hg_thread_rwlock_release_wrlock(hg_thread_rwlock_t *rwlock)
+    HG_LOCK_RELEASE(*rwlock);
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE void
+hg_thread_rwlock_rdlock(hg_thread_rwlock_t *rwlock) HG_LOCK_NO_THREAD_SAFETY_ANALYSIS
+{
+#ifdef _WIN32
+    AcquireSRWLockShared(rwlock);
+#else
+    (void)pthread_rwlock_rdlock(rwlock);
+#endif
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE int
+hg_thread_rwlock_try_rdlock(hg_thread_rwlock_t *rwlock) HG_LOCK_NO_THREAD_SAFETY_ANALYSIS
+{
+#ifdef _WIN32
+    if (TryAcquireSRWLockShared(rwlock) == 0)
+        return HG_UTIL_FAIL;
+#else
+    if (pthread_rwlock_tryrdlock(rwlock))
+        return HG_UTIL_FAIL;
+#endif
+
+    return HG_UTIL_SUCCESS;
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE void
+hg_thread_rwlock_release_rdlock(hg_thread_rwlock_t *rwlock) HG_LOCK_NO_THREAD_SAFETY_ANALYSIS
+{
+#ifdef _WIN32
+    ReleaseSRWLockShared(rwlock);
+#else
+    (void)pthread_rwlock_unlock(rwlock);
+#endif
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE void
+hg_thread_rwlock_wrlock(hg_thread_rwlock_t *rwlock) HG_LOCK_NO_THREAD_SAFETY_ANALYSIS
+{
+#ifdef _WIN32
+    AcquireSRWLockExclusive(rwlock);
+#else
+    (void)pthread_rwlock_wrlock(rwlock);
+#endif
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE int
+hg_thread_rwlock_try_wrlock(hg_thread_rwlock_t *rwlock) HG_LOCK_NO_THREAD_SAFETY_ANALYSIS
+{
+#ifdef _WIN32
+    if (TryAcquireSRWLockExclusive(rwlock) == 0)
+        return HG_UTIL_FAIL;
+#else
+    if (pthread_rwlock_trywrlock(rwlock))
+        return HG_UTIL_FAIL;
+#endif
+
+    return HG_UTIL_SUCCESS;
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE void
+hg_thread_rwlock_release_wrlock(hg_thread_rwlock_t *rwlock) HG_LOCK_NO_THREAD_SAFETY_ANALYSIS
+{
+#ifdef _WIN32
+    ReleaseSRWLockExclusive(rwlock);
+#else
+    (void)pthread_rwlock_unlock(rwlock);
+#endif
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* MERCURY_THREAD_RWLOCK_H */
diff --git a/src/mercury/include/mercury_thread_spin.h b/src/mercury/include/mercury_thread_spin.h
new file mode 100644
index 00000000000..36ce5f8ef32
--- /dev/null
+++ b/src/mercury/include/mercury_thread_spin.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy,
+ * UChicago Argonne, LLC and The HDF Group.
+ * All rights reserved.
+ *
+ * The full copyright notice, including terms governing use, modification,
+ * and redistribution, is contained in the COPYING file that can be
+ * found at the root of the source code distribution tree.
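/*
 * Usage sketch (illustrative, not part of the patch): readers share the lock,
 * a writer excludes everyone. Assumes hg_thread_rwlock_init(&rw) ran first.
 */
static hg_thread_rwlock_t rw;
static int                shared_value;

static int
read_value(void)
{
    int v;

    hg_thread_rwlock_rdlock(&rw);          /* shared: readers may overlap */
    v = shared_value;
    hg_thread_rwlock_release_rdlock(&rw);

    return v;
}

static void
write_value(int v)
{
    hg_thread_rwlock_wrlock(&rw);          /* exclusive */
    shared_value = v;
    hg_thread_rwlock_release_wrlock(&rw);
}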
+ */ + +#ifndef MERCURY_THREAD_SPIN_H +#define MERCURY_THREAD_SPIN_H + +#include "mercury_util_config.h" + +#include "mercury_thread_annotation.h" + +#if defined(_WIN32) +#include +typedef volatile LONG hg_thread_spin_t; +#elif defined(HG_UTIL_HAS_PTHREAD_SPINLOCK_T) +#include +typedef pthread_spinlock_t HG_LOCK_CAPABILITY("spin") hg_thread_spin_t; +#else +/* Default to hg_thread_mutex_t if pthread_spinlock_t is not supported */ +#include "mercury_thread_mutex.h" +typedef hg_thread_mutex_t HG_LOCK_CAPABILITY("mutex") hg_thread_spin_t; +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Initialize the spin lock. + * + * \param lock [IN/OUT] pointer to lock object + * + * \return Non-negative on success or negative on failure + */ +HG_UTIL_PUBLIC int hg_thread_spin_init(hg_thread_spin_t *lock); + +/** + * Destroy the spin lock. + * + * \param lock [IN/OUT] pointer to lock object + * + * \return Non-negative on success or negative on failure + */ +HG_UTIL_PUBLIC int hg_thread_spin_destroy(hg_thread_spin_t *lock); + +/** + * Lock the spin lock. + * + * \param lock [IN/OUT] pointer to lock object + */ +static HG_UTIL_INLINE void hg_thread_spin_lock(hg_thread_spin_t *lock) HG_LOCK_ACQUIRE(*lock); + +/** + * Try locking the spin lock. + * + * \param mutex [IN/OUT] pointer to lock object + * + * \return Non-negative on success or negative on failure + */ +static HG_UTIL_INLINE int hg_thread_spin_try_lock(hg_thread_spin_t *lock) + HG_LOCK_TRY_ACQUIRE(HG_UTIL_SUCCESS, *lock); + +/** + * Unlock the spin lock. + * + * \param mutex [IN/OUT] pointer to lock object + */ +static HG_UTIL_INLINE void hg_thread_spin_unlock(hg_thread_spin_t *lock) HG_LOCK_RELEASE(*lock); + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE void +hg_thread_spin_lock(hg_thread_spin_t *lock) HG_LOCK_NO_THREAD_SAFETY_ANALYSIS +{ +#if defined(_WIN32) + while (InterlockedExchange(lock, EBUSY)) { + /* Don't lock while waiting */ + while (*lock) { + YieldProcessor(); + + /* Compiler barrier. Prevent caching of *lock */ + MemoryBarrier(); + } + } +#elif defined(HG_UTIL_HAS_PTHREAD_SPINLOCK_T) + (void)pthread_spin_lock(lock); +#else + hg_thread_mutex_lock(lock); +#endif +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE int +hg_thread_spin_try_lock(hg_thread_spin_t *lock) HG_LOCK_NO_THREAD_SAFETY_ANALYSIS +{ +#if defined(_WIN32) + return InterlockedExchange(lock, EBUSY); +#elif defined(HG_UTIL_HAS_PTHREAD_SPINLOCK_T) + if (pthread_spin_trylock(lock)) + return HG_UTIL_FAIL; + + return HG_UTIL_SUCCESS; +#else + return hg_thread_mutex_try_lock(lock); +#endif +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE void +hg_thread_spin_unlock(hg_thread_spin_t *lock) HG_LOCK_NO_THREAD_SAFETY_ANALYSIS +{ +#if defined(_WIN32) + /* Compiler barrier. The store below acts with release semantics */ + MemoryBarrier(); + *lock = 0; +#elif defined(HG_UTIL_HAS_PTHREAD_SPINLOCK_T) + (void)pthread_spin_unlock(lock); +#else + hg_thread_mutex_unlock(lock); +#endif +} + +#ifdef __cplusplus +} +#endif + +#endif /* MERCURY_THREAD_SPIN_H */ diff --git a/src/mercury/include/mercury_time.h b/src/mercury/include/mercury_time.h new file mode 100644 index 00000000000..f158638342c --- /dev/null +++ b/src/mercury/include/mercury_time.h @@ -0,0 +1,503 @@ +/* + * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy, + * UChicago Argonne, LLC and The HDF Group. 
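/*
 * Usage sketch (illustrative, not part of the patch): spin locks suit very
 * short critical sections where parking the thread would cost more than
 * spinning. Assumes hg_thread_spin_init(&counter_lock) ran first.
 */
static hg_thread_spin_t counter_lock;
static unsigned long    counter;

static void
count_one(void)
{
    hg_thread_spin_lock(&counter_lock);   /* busy-waits: keep the section tiny */
    counter++;
    hg_thread_spin_unlock(&counter_lock);
}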
+ * All rights reserved. + * + * The full copyright notice, including terms governing use, modification, + * and redistribution, is contained in the COPYING file that can be + * found at the root of the source code distribution tree. + */ + +#ifndef MERCURY_TIME_H +#define MERCURY_TIME_H + +#include "mercury_util_config.h" + +#if defined(_WIN32) +#include +#elif defined(HG_UTIL_HAS_TIME_H) && defined(HG_UTIL_HAS_CLOCK_GETTIME) +#include +#elif defined(__APPLE__) && defined(HG_UTIL_HAS_SYSTIME_H) +#include +#include +#else +#include +#include +#if defined(HG_UTIL_HAS_SYSTIME_H) +#include +#else +#error "Not supported on this platform." +#endif +#endif + +/*************************************/ +/* Public Type and Struct Definition */ +/*************************************/ + +#if defined(HG_UTIL_HAS_TIME_H) && defined(HG_UTIL_HAS_CLOCK_GETTIME) +typedef struct timespec hg_time_t; +#else +typedef struct hg_time hg_time_t; + +struct hg_time { + long tv_sec; + long tv_usec; +}; +#endif + +/*****************/ +/* Public Macros */ +/*****************/ + +/*********************/ +/* Public Prototypes */ +/*********************/ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Get an elapsed time on the calling processor. + * + * \param tv [OUT] pointer to returned time structure + * + * \return Non-negative on success or negative on failure + */ +static HG_UTIL_INLINE int hg_time_get_current(hg_time_t *tv); + +/** + * Get an elapsed time on the calling processor (resolution is ms). + * + * \param tv [OUT] pointer to returned time structure + * + * \return Non-negative on success or negative on failure + */ +static HG_UTIL_INLINE int hg_time_get_current_ms(hg_time_t *tv); + +/** + * Convert hg_time_t to double. + * + * \param tv [IN] time structure + * + * \return Converted time in seconds + */ +static HG_UTIL_INLINE double hg_time_to_double(hg_time_t tv); + +/** + * Convert double to hg_time_t. + * + * \param d [IN] time in seconds + * + * \return Converted time structure + */ +static HG_UTIL_INLINE hg_time_t hg_time_from_double(double d); + +/** + * Convert (integer) milliseconds to hg_time_t. + * + * \param ms [IN] time in milliseconds + * + * \return Converted time structure + */ +static HG_UTIL_INLINE hg_time_t hg_time_from_ms(unsigned int ms); + +/** + * Convert hg_time_t to (integer) milliseconds. + * + * \param tv [IN] time structure + * + * \return Time in milliseconds + */ +static HG_UTIL_INLINE unsigned int hg_time_to_ms(hg_time_t tv); + +/** + * Compare time values. + * + * \param in1 [IN] time structure + * \param in2 [IN] time structure + * + * \return 1 if in1 < in2, 0 otherwise + */ +static HG_UTIL_INLINE int hg_time_less(hg_time_t in1, hg_time_t in2); + +/** + * Diff time values and return the number of seconds elapsed between + * time \in2 and time \in1. + * + * \param in2 [IN] time structure + * \param in1 [IN] time structure + * + * \return Subtracted time + */ +static HG_UTIL_INLINE double hg_time_diff(hg_time_t in2, hg_time_t in1); + +/** + * Add time values. + * + * \param in1 [IN] time structure + * \param in2 [IN] time structure + * + * \return Summed time structure + */ +static HG_UTIL_INLINE hg_time_t hg_time_add(hg_time_t in1, hg_time_t in2); + +/** + * Subtract time values. + * + * \param in1 [IN] time structure + * \param in2 [IN] time structure + * + * \return Subtracted time structure + */ +static HG_UTIL_INLINE hg_time_t hg_time_subtract(hg_time_t in1, hg_time_t in2); + +/** + * Sleep until the time specified in rqt has elapsed. 
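/*
 * Usage sketch (illustrative, not part of the patch): timing a function with
 * the monotonic-clock wrappers declared above.
 */
static double
time_it(void (*fn)(void))
{
    hg_time_t t1, t2;

    hg_time_get_current(&t1);
    fn();
    hg_time_get_current(&t2);

    return hg_time_diff(t2, t1); /* elapsed seconds as a double */
}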
+ * + * \param reqt [IN] time structure + * + * \return Non-negative on success or negative on failure + */ +static HG_UTIL_INLINE int hg_time_sleep(const hg_time_t rqt); + +/** + * Get a string containing current time/date stamp. + * + * \return Valid string or NULL on failure + */ +static HG_UTIL_INLINE char *hg_time_stamp(void); + +/*---------------------------------------------------------------------------*/ +#ifdef _WIN32 +static HG_UTIL_INLINE LARGE_INTEGER +get_FILETIME_offset(void) +{ + SYSTEMTIME s; + FILETIME f; + LARGE_INTEGER t; + + s.wYear = 1970; + s.wMonth = 1; + s.wDay = 1; + s.wHour = 0; + s.wMinute = 0; + s.wSecond = 0; + s.wMilliseconds = 0; + SystemTimeToFileTime(&s, &f); + t.QuadPart = f.dwHighDateTime; + t.QuadPart <<= 32; + t.QuadPart |= f.dwLowDateTime; + + return t; +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE int +hg_time_get_current(hg_time_t *tv) +{ + LARGE_INTEGER t; + FILETIME f; + double t_usec; + static LARGE_INTEGER offset; + static double freq_to_usec; + static int initialized = 0; + static BOOL use_perf_counter = 0; + + if (!initialized) { + LARGE_INTEGER perf_freq; + initialized = 1; + use_perf_counter = QueryPerformanceFrequency(&perf_freq); + if (use_perf_counter) { + QueryPerformanceCounter(&offset); + freq_to_usec = (double)perf_freq.QuadPart / 1000000.; + } + else { + offset = get_FILETIME_offset(); + freq_to_usec = 10.; + } + } + if (use_perf_counter) { + QueryPerformanceCounter(&t); + } + else { + GetSystemTimeAsFileTime(&f); + t.QuadPart = f.dwHighDateTime; + t.QuadPart <<= 32; + t.QuadPart |= f.dwLowDateTime; + } + + t.QuadPart -= offset.QuadPart; + t_usec = (double)t.QuadPart / freq_to_usec; + t.QuadPart = t_usec; + tv->tv_sec = t.QuadPart / 1000000; + tv->tv_usec = t.QuadPart % 1000000; + + return HG_UTIL_SUCCESS; +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE int +hg_time_get_current_ms(hg_time_t *tv) +{ + return hg_time_get_current(tv); +} + +/*---------------------------------------------------------------------------*/ +#elif defined(HG_UTIL_HAS_TIME_H) && defined(HG_UTIL_HAS_CLOCK_GETTIME) +static HG_UTIL_INLINE int +hg_time_get_current(hg_time_t *tv) +{ + clock_gettime(CLOCK_MONOTONIC, tv); + + return HG_UTIL_SUCCESS; +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE int +hg_time_get_current_ms(hg_time_t *tv) +{ +/* ppc/32 and ppc/64 do not support CLOCK_MONOTONIC_COARSE in vdso */ +#if defined(__ppc64__) || defined(__ppc__) || defined(__PPC64__) || defined(__PPC__) || \ + !defined(HG_UTIL_HAS_CLOCK_MONOTONIC_COARSE) + clock_gettime(CLOCK_MONOTONIC, tv); +#else + /* We don't need fine grain time stamps, _COARSE resolution is 1ms */ + clock_gettime(CLOCK_MONOTONIC_COARSE, tv); +#endif + return HG_UTIL_SUCCESS; +} + +/*---------------------------------------------------------------------------*/ +#elif defined(__APPLE__) && defined(HG_UTIL_HAS_SYSTIME_H) +static HG_UTIL_INLINE int +hg_time_get_current(hg_time_t *tv) +{ + static uint64_t monotonic_timebase_factor = 0; + uint64_t monotonic_nsec; + + if (monotonic_timebase_factor == 0) { + mach_timebase_info_data_t timebase_info; + + (void)mach_timebase_info(&timebase_info); + monotonic_timebase_factor = timebase_info.numer / timebase_info.denom; + } + monotonic_nsec = (mach_absolute_time() * monotonic_timebase_factor); + tv->tv_sec = (long)(monotonic_nsec / 1000000000); + tv->tv_usec = (long)((monotonic_nsec 
- (uint64_t)tv->tv_sec * 1000000000) / 1000);
+
+    return HG_UTIL_SUCCESS;
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE int
+hg_time_get_current_ms(hg_time_t *tv)
+{
+    return hg_time_get_current(tv);
+}
+
+#else
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE int
+hg_time_get_current(hg_time_t *tv)
+{
+    gettimeofday((struct timeval *)tv, NULL);
+
+    return HG_UTIL_SUCCESS;
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE int
+hg_time_get_current_ms(hg_time_t *tv)
+{
+    return hg_time_get_current(tv);
+}
+
+#endif
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE double
+hg_time_to_double(hg_time_t tv)
+{
+#if defined(HG_UTIL_HAS_TIME_H) && defined(HG_UTIL_HAS_CLOCK_GETTIME)
+    return (double)tv.tv_sec + (double)(tv.tv_nsec) * 0.000000001;
+#else
+    return (double)tv.tv_sec + (double)(tv.tv_usec) * 0.000001;
+#endif
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE hg_time_t
+hg_time_from_double(double d)
+{
+    hg_time_t tv;
+
+    tv.tv_sec = (long)d;
+#if defined(HG_UTIL_HAS_TIME_H) && defined(HG_UTIL_HAS_CLOCK_GETTIME)
+    tv.tv_nsec = (long)((d - (double)(tv.tv_sec)) * 1000000000);
+#else
+    tv.tv_usec = (long)((d - (double)(tv.tv_sec)) * 1000000);
+#endif
+
+    return tv;
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE unsigned int
+hg_time_to_ms(hg_time_t tv)
+{
+#if defined(HG_UTIL_HAS_TIME_H) && defined(HG_UTIL_HAS_CLOCK_GETTIME)
+    return (unsigned int)(tv.tv_sec * 1000 + tv.tv_nsec / 1000000);
+#else
+    return (unsigned int)(tv.tv_sec * 1000 + tv.tv_usec / 1000);
+#endif
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE hg_time_t
+hg_time_from_ms(unsigned int ms)
+{
+#if defined(HG_UTIL_HAS_TIME_H) && defined(HG_UTIL_HAS_CLOCK_GETTIME)
+    return (hg_time_t){.tv_sec = ms / 1000, .tv_nsec = (ms - (ms / 1000) * 1000) * 1000000};
+#else
+    return (hg_time_t){.tv_sec = ms / 1000, .tv_usec = (ms - (ms / 1000) * 1000) * 1000};
+#endif
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE int
+hg_time_less(hg_time_t in1, hg_time_t in2)
+{
+    return ((in1.tv_sec < in2.tv_sec) || ((in1.tv_sec == in2.tv_sec) &&
+#if defined(HG_UTIL_HAS_TIME_H) && defined(HG_UTIL_HAS_CLOCK_GETTIME)
+                                          (in1.tv_nsec < in2.tv_nsec)));
+#else
+                                          (in1.tv_usec < in2.tv_usec)));
+#endif
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE double
+hg_time_diff(hg_time_t in2, hg_time_t in1)
+{
+#if defined(HG_UTIL_HAS_TIME_H) && defined(HG_UTIL_HAS_CLOCK_GETTIME)
+    return ((double)in2.tv_sec + (double)(in2.tv_nsec) * 0.000000001) -
+           ((double)in1.tv_sec + (double)(in1.tv_nsec) * 0.000000001);
+#else
+    return ((double)in2.tv_sec + (double)(in2.tv_usec) * 0.000001) -
+           ((double)in1.tv_sec + (double)(in1.tv_usec) * 0.000001);
+#endif
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE hg_time_t
+hg_time_add(hg_time_t in1, hg_time_t in2)
+{
+    hg_time_t out;
+
+    out.tv_sec = in1.tv_sec + in2.tv_sec;
+#if defined(HG_UTIL_HAS_TIME_H) && defined(HG_UTIL_HAS_CLOCK_GETTIME)
+    out.tv_nsec = in1.tv_nsec + in2.tv_nsec;
+    if (out.tv_nsec > 1000000000) {
+        out.tv_nsec -= 1000000000;
+        out.tv_sec += 1;
+    }
+#else
+    out.tv_usec = in1.tv_usec + in2.tv_usec;
+    if (out.tv_usec > 1000000) {
+        out.tv_usec -= 1000000;
+        out.tv_sec += 1;
+    }
+#endif
+
+    return out;
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE hg_time_t
+hg_time_subtract(hg_time_t in1, hg_time_t in2)
+{
+    hg_time_t out;
+
+    out.tv_sec = in1.tv_sec - in2.tv_sec;
+#if defined(HG_UTIL_HAS_TIME_H) && defined(HG_UTIL_HAS_CLOCK_GETTIME)
+    out.tv_nsec = in1.tv_nsec - in2.tv_nsec;
+    if (out.tv_nsec < 0) {
+        out.tv_nsec += 1000000000;
+        out.tv_sec -= 1;
+    }
+#else
+    out.tv_usec = in1.tv_usec - in2.tv_usec;
+    if (out.tv_usec < 0) {
+        out.tv_usec += 1000000;
+        out.tv_sec -= 1;
+    }
+#endif
+
+    return out;
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE int
+hg_time_sleep(const hg_time_t rqt)
+{
+#ifdef _WIN32
+    /* Convert seconds to milliseconds */
+    DWORD dwMilliseconds = (DWORD)(hg_time_to_double(rqt) * 1000);
+
+    Sleep(dwMilliseconds);
+#elif defined(HG_UTIL_HAS_TIME_H) && defined(HG_UTIL_HAS_CLOCK_GETTIME)
+    if (nanosleep(&rqt, NULL))
+        return HG_UTIL_FAIL;
+#else
+    useconds_t usec = (useconds_t)rqt.tv_sec * 1000000 + (useconds_t)rqt.tv_usec;
+
+    if (usleep(usec))
+        return HG_UTIL_FAIL;
+#endif
+
+    return HG_UTIL_SUCCESS;
+}
+
+/*---------------------------------------------------------------------------*/
+#define HG_UTIL_STAMP_MAX 128
+static HG_UTIL_INLINE char *
+hg_time_stamp(void)
+{
+    static char buf[HG_UTIL_STAMP_MAX] = {'\0'};
+
+#if defined(_WIN32)
+    /* TODO not implemented */
+#elif defined(HG_UTIL_HAS_TIME_H) && defined(HG_UTIL_HAS_CLOCK_GETTIME)
+    struct tm *local_time;
+    time_t     t;
+
+    t          = time(NULL);
+    local_time = localtime(&t);
+    if (local_time == NULL)
+        return NULL;
+
+    if (strftime(buf, HG_UTIL_STAMP_MAX, "%a, %d %b %Y %T %Z", local_time) == 0)
+        return NULL;
+#else
+    struct timeval  tv;
+    struct timezone tz;
+    unsigned long   days, hours, minutes, seconds;
+
+    gettimeofday(&tv, &tz);
+    days    = (unsigned long)tv.tv_sec / (3600 * 24);
+    hours   = ((unsigned long)tv.tv_sec - days * 24 * 3600) / 3600;
+    minutes = ((unsigned long)tv.tv_sec - days * 24 * 3600 - hours * 3600) / 60;
+    seconds = (unsigned long)tv.tv_sec - days * 24 * 3600 - hours * 3600 - minutes * 60;
+    hours -= (unsigned long)tz.tz_minuteswest / 60;
+
+    snprintf(buf, HG_UTIL_STAMP_MAX, "%02lu:%02lu:%02lu (GMT-%d)", hours, minutes, seconds,
+             tz.tz_minuteswest / 60);
+#endif
+
+    return buf;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* MERCURY_TIME_H */
diff --git a/src/mercury/include/mercury_types.h b/src/mercury/include/mercury_types.h
new file mode 100644
index 00000000000..7ea6b174ed1
--- /dev/null
+++ b/src/mercury/include/mercury_types.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy,
+ * UChicago Argonne, LLC and The HDF Group.
+ * All rights reserved.
+ *
+ * The full copyright notice, including terms governing use, modification,
+ * and redistribution, is contained in the COPYING file that can be
+ * found at the root of the source code distribution tree.
+ */ + +#ifndef MERCURY_TYPES_H +#define MERCURY_TYPES_H + +#include "mercury_core_types.h" + +/*************************************/ +/* Public Type and Struct Definition */ +/*************************************/ + +typedef struct hg_class hg_class_t; /* Opaque HG class */ +typedef struct hg_context hg_context_t; /* Opaque HG context */ +typedef struct hg_addr * hg_addr_t; /* Abstract HG address */ +typedef struct hg_handle *hg_handle_t; /* Abstract RPC handle */ +typedef struct hg_bulk * hg_bulk_t; /* Abstract bulk data handle */ +typedef struct hg_proc * hg_proc_t; /* Abstract serialization processor */ +typedef struct hg_op_id * hg_op_id_t; /* Abstract operation id */ + +/* HG info struct */ +struct hg_info { + hg_class_t * hg_class; /* HG class */ + hg_context_t *context; /* HG context */ + hg_addr_t addr; /* HG address at target/origin */ + hg_id_t id; /* RPC ID */ + hg_uint8_t context_id; /* Context ID at target/origin */ +}; + +/** + * Bulk transfer operators. + */ +typedef enum { + HG_BULK_PUSH, /*!< push data to origin */ + HG_BULK_PULL /*!< pull data from origin */ +} hg_bulk_op_t; + +/* Callback info structs */ +struct hg_cb_info_lookup { + hg_addr_t addr; /* HG address */ +}; + +struct hg_cb_info_forward { + hg_handle_t handle; /* HG handle */ +}; + +struct hg_cb_info_respond { + hg_handle_t handle; /* HG handle */ +}; + +struct hg_cb_info_bulk { + hg_bulk_t origin_handle; /* HG Bulk origin handle */ + hg_bulk_t local_handle; /* HG Bulk local handle */ + hg_bulk_op_t op; /* Operation type */ + hg_size_t size; /* Total size transferred */ +}; + +struct hg_cb_info { + union { /* Union of callback info structures */ + struct hg_cb_info_lookup lookup; + struct hg_cb_info_forward forward; + struct hg_cb_info_respond respond; + struct hg_cb_info_bulk bulk; + } info; + void * arg; /* User data */ + hg_cb_type_t type; /* Callback type */ + hg_return_t ret; /* Return value */ +}; + +/* RPC / HG callbacks */ +typedef hg_return_t (*hg_rpc_cb_t)(hg_handle_t handle); +typedef hg_return_t (*hg_cb_t)(const struct hg_cb_info *callback_info); + +/* Proc callback for serializing/deserializing parameters */ +typedef hg_return_t (*hg_proc_cb_t)(hg_proc_t proc, void *data); + +/*****************/ +/* Public Macros */ +/*****************/ + +/* Constant values */ +#define HG_ADDR_NULL ((hg_addr_t)0) +#define HG_HANDLE_NULL ((hg_handle_t)0) +#define HG_BULK_NULL ((hg_bulk_t)0) +#define HG_PROC_NULL ((hg_proc_t)0) +#define HG_OP_ID_NULL ((hg_op_id_t)0) +#define HG_OP_ID_IGNORE ((hg_op_id_t *)1) + +#endif /* MERCURY_TYPES_H */ diff --git a/src/mercury/include/mercury_util.h b/src/mercury/include/mercury_util.h new file mode 100644 index 00000000000..1e36e266049 --- /dev/null +++ b/src/mercury/include/mercury_util.h @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy, + * UChicago Argonne, LLC and The HDF Group. + * All rights reserved. + * + * The full copyright notice, including terms governing use, modification, + * and redistribution, is contained in the COPYING file that can be + * found at the root of the source code distribution tree. 
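/*
 * Usage sketch (illustrative, not part of the patch): a completion callback
 * matching hg_cb_t, dispatching on the type carried in hg_cb_info.
 * HG_SUCCESS and HG_CB_FORWARD are assumed to come from mercury_core_types.h,
 * which is included above but not shown in this hunk.
 */
static hg_return_t
my_forward_cb(const struct hg_cb_info *callback_info)
{
    if (callback_info->ret != HG_SUCCESS)
        return callback_info->ret;  /* propagate the transport error */

    if (callback_info->type == HG_CB_FORWARD) {
        hg_handle_t handle = callback_info->info.forward.handle;

        /* ... decode the RPC response from `handle`, then release it ... */
        (void)handle;
    }

    return HG_SUCCESS;
}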
+ */
+
+#ifndef MERCURY_UTIL_LOG_H
+#define MERCURY_UTIL_LOG_H
+
+#include "mercury_util_config.h"
+
+/*************************************/
+/* Public Type and Struct Definition */
+/*************************************/
+
+/*****************/
+/* Public Macros */
+/*****************/
+
+/*********************/
+/* Public Prototypes */
+/*********************/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Set the log level for HG util. That setting is valid for all HG classes.
+ *
+ * \param level [IN] level string, valid values are:
+ *                   "none", "error", "warning", "debug"
+ */
+HG_UTIL_PUBLIC void HG_Util_set_log_level(const char *level);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* MERCURY_UTIL_LOG_H */
diff --git a/src/mercury/include/mercury_util_config.h b/src/mercury/include/mercury_util_config.h
new file mode 100644
index 00000000000..8237b4df409
--- /dev/null
+++ b/src/mercury/include/mercury_util_config.h
@@ -0,0 +1,154 @@
+/*
+ * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy,
+ * UChicago Argonne, LLC and The HDF Group.
+ * All rights reserved.
+ *
+ * The full copyright notice, including terms governing use, modification,
+ * and redistribution, is contained in the COPYING file that can be
+ * found at the root of the source code distribution tree.
+ */
+
+/* Generated file. Only edit mercury_util_config.h.in. */
+
+#ifndef MERCURY_UTIL_CONFIG_H
+#define MERCURY_UTIL_CONFIG_H
+
+/*************************************/
+/* Public Type and Struct Definition */
+/*************************************/
+
+/* Type definitions */
+#ifdef _WIN32
+typedef signed __int64 hg_util_int64_t;
+typedef signed __int32 hg_util_int32_t;
+typedef signed __int16 hg_util_int16_t;
+typedef signed __int8 hg_util_int8_t;
+typedef unsigned __int64 hg_util_uint64_t;
+typedef unsigned __int32 hg_util_uint32_t;
+typedef unsigned __int16 hg_util_uint16_t;
+typedef unsigned __int8 hg_util_uint8_t;
+#else
+#include <stddef.h>
+#include <stdint.h>
+typedef int64_t hg_util_int64_t;
+typedef int32_t hg_util_int32_t;
+typedef int16_t hg_util_int16_t;
+typedef int8_t hg_util_int8_t;
+typedef uint64_t hg_util_uint64_t;
+typedef uint32_t hg_util_uint32_t;
+typedef uint16_t hg_util_uint16_t;
+typedef uint8_t hg_util_uint8_t;
+#endif
+typedef hg_util_uint8_t hg_util_bool_t;
+typedef hg_util_uint64_t hg_util_ptr_t;
+
+/* True / false */
+#define HG_UTIL_TRUE 1
+#define HG_UTIL_FALSE 0
+
+/* Return codes */
+#define HG_UTIL_SUCCESS 0
+#define HG_UTIL_FAIL -1
+
+/*****************/
+/* Public Macros */
+/*****************/
+
+/* Visibility of symbols */
+#if defined(_WIN32)
+#define HG_UTIL_ABI_IMPORT __declspec(dllimport)
+#define HG_UTIL_ABI_EXPORT __declspec(dllexport)
+#define HG_UTIL_ABI_HIDDEN
+#elif defined(__GNUC__) && (__GNUC__ >= 4)
+#define HG_UTIL_ABI_IMPORT __attribute__((visibility("default")))
+#define HG_UTIL_ABI_EXPORT __attribute__((visibility("default")))
+#define HG_UTIL_ABI_HIDDEN __attribute__((visibility("hidden")))
+#else
+#define HG_UTIL_ABI_IMPORT
+#define HG_UTIL_ABI_EXPORT
+#define HG_UTIL_ABI_HIDDEN
+#endif
+
+/* Inline macro */
+#ifdef _WIN32
+#define HG_UTIL_INLINE __inline
+#else
+#define HG_UTIL_INLINE __inline__
+#endif
+
+/* Check format arguments */
+#if defined(__GNUC__)
+#define HG_UTIL_PRINTF_LIKE(_fmt, _firstarg) __attribute__((format(printf, _fmt, _firstarg)))
+#else
+#define HG_UTIL_PRINTF_LIKE(_fmt, _firstarg)
+#endif
+
+/* Shared libraries */
+/* #undef HG_UTIL_BUILD_SHARED_LIBS */
+#ifdef HG_UTIL_BUILD_SHARED_LIBS
+#ifdef mercury_util_EXPORTS
+#define HG_UTIL_PUBLIC HG_UTIL_ABI_EXPORT
+#else
+#define HG_UTIL_PUBLIC HG_UTIL_ABI_IMPORT
+#endif
+#define HG_UTIL_PRIVATE HG_UTIL_ABI_HIDDEN
+#else
+#define HG_UTIL_PUBLIC
+#define HG_UTIL_PRIVATE
+#endif
+
+/* Define if has __attribute__((constructor)) */
+#define HG_UTIL_HAS_ATTR_CONSTRUCTOR
+
+/* Define if has __attribute__((constructor(priority))) */
+#define HG_UTIL_HAS_ATTR_CONSTRUCTOR_PRIORITY
+
+/* Define if has 'clock_gettime()' */
+#define HG_UTIL_HAS_CLOCK_GETTIME
+
+/* Define if has CLOCK_MONOTONIC_COARSE */
+#define HG_UTIL_HAS_CLOCK_MONOTONIC_COARSE
+
+/* Define if has debug */
+/* #undef HG_UTIL_HAS_DEBUG */
+
+/* Define if has eventfd_t type */
+#define HG_UTIL_HAS_EVENTFD_T
+
+/* Define if has colored output */
+/* #undef HG_UTIL_HAS_LOG_COLOR */
+
+/* Define if has <opa_primitives.h> */
+/* #undef HG_UTIL_HAS_OPA_PRIMITIVES_H */
+
+/* Define if has 'pthread_condattr_setclock()' */
+#define HG_UTIL_HAS_PTHREAD_CONDATTR_SETCLOCK
+
+/* Define if has PTHREAD_MUTEX_ADAPTIVE_NP */
+#define HG_UTIL_HAS_PTHREAD_MUTEX_ADAPTIVE_NP
+
+/* Define if has pthread_spinlock_t type */
+#define HG_UTIL_HAS_PTHREAD_SPINLOCK_T
+
+/* Define if has <stdatomic.h> */
+#define HG_UTIL_HAS_STDATOMIC_H
+
+/* Define type size of atomic_long */
+#define HG_UTIL_ATOMIC_LONG_WIDTH 8
+
+/* Define if has <sys/epoll.h> */
+#define HG_UTIL_HAS_SYSEPOLL_H
+
+/* Define if has <sys/event.h> */
+/* #undef HG_UTIL_HAS_SYSEVENT_H */
+
+/* Define if has <sys/eventfd.h> */
+#define HG_UTIL_HAS_SYSEVENTFD_H
+
+/* Define if has <sys/time.h> */
+#define HG_UTIL_HAS_SYSTIME_H
+
+/* Define if has <time.h> */
+#define HG_UTIL_HAS_TIME_H
+
+#endif /* MERCURY_UTIL_CONFIG_H */
diff --git a/src/mercury/include/na.h b/src/mercury/include/na.h
new file mode 100644
index 00000000000..6f75b283ed8
--- /dev/null
+++ b/src/mercury/include/na.h
@@ -0,0 +1,1064 @@
+/*
+ * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy,
+ * UChicago Argonne, LLC and The HDF Group.
+ * All rights reserved.
+ *
+ * The full copyright notice, including terms governing use, modification,
+ * and redistribution, is contained in the COPYING file that can be
+ * found at the root of the source code distribution tree.
+ */
+
+#ifndef NA_H
+#define NA_H
+
+#include "na_types.h"
+
+/*************************************/
+/* Public Type and Struct Definition */
+/*************************************/
+
+/* See na_types.h */
+
+/*****************/
+/* Public Macros */
+/*****************/
+
+/* See na_types.h */
+
+/*********************/
+/* Public Prototypes */
+/*********************/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Initialize the network abstraction layer.
+ * Must be finalized with NA_Finalize().
+ *
+ * \param info_string [IN] host address with port number (e.g.,
+ *                         "tcp://localhost:3344" or
+ *                         "bmi+tcp://localhost:3344")
+ * \param listen [IN] listen for incoming connections
+ *
+ * \return Pointer to NA class or NULL in case of failure
+ */
+NA_PUBLIC na_class_t *NA_Initialize(const char *info_string, na_bool_t listen) NA_WARN_UNUSED_RESULT;
+
+/**
+ * Initialize the network abstraction layer with options provided by init_info.
+ * Must be finalized with NA_Finalize().
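+ *
+ * A minimal lifecycle sketch (illustrative only; error handling elided):
+ * \code
+ * na_class_t *na_class =
+ *     NA_Initialize_opt("tcp://localhost:3344", NA_TRUE, NULL);
+ * if (na_class != NULL)
+ *     NA_Finalize(na_class);
+ * \endcode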
+ * + * \param info_string [IN] host address with port number (e.g., + * "tcp://localhost:3344" or + * "bmi+tcp://localhost:3344") + * \param listen [IN] listen for incoming connections + * \param na_init_info [IN] (Optional) NA init info, NULL if no info + * + * \return Pointer to NA class or NULL in case of failure + */ +NA_PUBLIC na_class_t *NA_Initialize_opt(const char *info_string, na_bool_t listen, + const struct na_init_info *na_init_info) NA_WARN_UNUSED_RESULT; + +/** + * Finalize the network abstraction layer. + * + * \param na_class [IN/OUT] pointer to NA class + * + * \return NA_SUCCESS or corresponding NA error code + */ +NA_PUBLIC na_return_t NA_Finalize(na_class_t *na_class); + +/** + * Clean up all temporary files that were created in previous NA instances. + * While temporary resources (e.g., tmp files) are cleaned up on a call + * to NA_Finalize(), this routine gives a chance to programs that terminate + * abnormally to easily clean up those resources. This includes instances + * from all plugins. + */ +NA_PUBLIC void NA_Cleanup(void); + +/** + * Set the log level for NA. That setting is valid for all NA classes. + * + * \param level [IN] level string, valid values are: + * "none", "error", "warning", "debug" + */ +NA_PUBLIC void NA_Set_log_level(const char *level); + +/** + * Return the name of the NA class. + * + * \param na_class [IN] pointer to NA class + * + * \return Pointer to NA class name or NULL in case of failure + */ +static NA_INLINE const char *NA_Get_class_name(const na_class_t *na_class) NA_WARN_UNUSED_RESULT; + +/** + * Return the protocol of the NA class. + * + * \param na_class [IN] pointer to NA class + * + * \return Pointer to NA class protocol or NULL in case of failure + */ +static NA_INLINE const char *NA_Get_class_protocol(const na_class_t *na_class) NA_WARN_UNUSED_RESULT; + +/** + * Test whether class is listening or not. + * + * \param na_class [IN] pointer to NA class + * + * \return NA_TRUE if listening or NA_FALSE if not + */ +static NA_INLINE na_bool_t NA_Is_listening(const na_class_t *na_class) NA_WARN_UNUSED_RESULT; + +/** + * Create a new context. + * + * \param na_class [IN/OUT] pointer to NA class + * + * \return Pointer to NA context or NULL in case of failure + */ +NA_PUBLIC na_context_t *NA_Context_create(na_class_t *na_class) NA_WARN_UNUSED_RESULT; + +/** + * Create a new context with a specific ID. + * + * \param na_class [IN/OUT] pointer to NA class + * \param id [IN] context ID + * + * \return Pointer to NA context or NULL in case of failure + */ +NA_PUBLIC na_context_t *NA_Context_create_id(na_class_t *na_class, na_uint8_t id) NA_WARN_UNUSED_RESULT; + +/** + * Destroy a context created by using NA_Context_create(). + * + * \param na_class [IN/OUT] pointer to NA class + * \param context [IN/OUT] pointer to context of execution + * + * \return NA_SUCCESS or corresponding NA error code + */ +NA_PUBLIC na_return_t NA_Context_destroy(na_class_t *na_class, na_context_t *context); + +/** + * Allocate an operation ID for the higher level layer to save and + * pass back to the NA layer rather than have the NA layer allocate operation + * IDs all the time. + * Allocating an operation ID gives ownership of that ID to the higher level + * layer, hence it must be explicitly released with NA_Op_destroy() when it + * is no longer needed. 
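+ *
+ * For example (a sketch; na_class is assumed to come from NA_Initialize()
+ * and error checking is elided):
+ * \code
+ * na_op_id_t *op_id = NA_Op_create(na_class);
+ * // ... pass op_id to NA_Msg_send_unexpected(), NA_Put(), etc. ...
+ * NA_Op_destroy(na_class, op_id);
+ * \endcode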
+ *
+ * \param na_class [IN/OUT] pointer to NA class
+ *
+ * \return valid pointer to operation ID or NULL
+ */
+NA_PUBLIC na_op_id_t *NA_Op_create(na_class_t *na_class);
+
+/**
+ * Destroy operation ID created with NA_Op_create().
+ * Reference counting prevents involuntary free.
+ *
+ * \param na_class [IN/OUT] pointer to NA class
+ * \param op_id [IN] pointer to operation ID
+ *
+ * \return NA_SUCCESS or corresponding NA error code
+ */
+NA_PUBLIC na_return_t NA_Op_destroy(na_class_t *na_class, na_op_id_t *op_id);
+
+/**
+ * Look up an addr from a peer address/name. Addresses need to be
+ * freed by calling NA_Addr_free().
+ *
+ * \param na_class [IN/OUT] pointer to NA class
+ * \param name [IN] lookup name
+ * \param addr [OUT] pointer to abstract address
+ *
+ * \return NA_SUCCESS or corresponding NA error code
+ */
+NA_PUBLIC na_return_t NA_Addr_lookup(na_class_t *na_class, const char *name, na_addr_t *addr);
+
+/**
+ * Free the addr from the list of peers.
+ *
+ * \param na_class [IN/OUT] pointer to NA class
+ * \param addr [IN] abstract address
+ *
+ * \return NA_SUCCESS or corresponding NA error code
+ */
+NA_PUBLIC na_return_t NA_Addr_free(na_class_t *na_class, na_addr_t addr);
+
+/**
+ * Hint that the address is no longer valid. This may happen if the peer is
+ * no longer responding. This can be used to force removal of the
+ * peer address from the list of the peers, before freeing it and reclaiming
+ * resources.
+ *
+ * \param na_class [IN/OUT] pointer to NA class
+ * \param addr [IN] abstract address
+ *
+ * \return NA_SUCCESS or corresponding NA error code
+ */
+NA_PUBLIC na_return_t NA_Addr_set_remove(na_class_t *na_class, na_addr_t addr);
+
+/**
+ * Access self address.
+ *
+ * \param na_class [IN/OUT] pointer to NA class
+ * \param addr [OUT] pointer to abstract address
+ *
+ * \return NA_SUCCESS or corresponding NA error code
+ */
+NA_PUBLIC na_return_t NA_Addr_self(na_class_t *na_class, na_addr_t *addr);
+
+/**
+ * Duplicate an existing NA abstract address. The duplicated address can be
+ * stored for later use and the origin address freed safely. The duplicated
+ * address must be freed with NA_Addr_free().
+ *
+ * \param na_class [IN/OUT] pointer to NA class
+ * \param addr [IN] abstract address
+ * \param new_addr [OUT] pointer to abstract address
+ *
+ * \return NA_SUCCESS or corresponding NA error code
+ */
+NA_PUBLIC na_return_t NA_Addr_dup(na_class_t *na_class, na_addr_t addr, na_addr_t *new_addr);
+
+/**
+ * Compare two addresses.
+ *
+ * \param na_class [IN/OUT] pointer to NA class
+ * \param addr1 [IN] abstract address
+ * \param addr2 [IN] abstract address
+ *
+ * \return NA_TRUE if addresses are determined to be equal, NA_FALSE otherwise
+ */
+NA_PUBLIC na_bool_t NA_Addr_cmp(na_class_t *na_class, na_addr_t addr1, na_addr_t addr2);
+
+/**
+ * Test whether address is self or not.
+ *
+ * \param na_class [IN/OUT] pointer to NA class
+ * \param addr [IN] abstract address
+ *
+ * \return NA_TRUE if self or NA_FALSE if not
+ */
+static NA_INLINE na_bool_t NA_Addr_is_self(na_class_t *na_class, na_addr_t addr);
+
+/**
+ * Convert an addr to a string (returned string includes the terminating
+ * null byte '\0'). If buf is NULL, the address is not converted and only
+ * the required size of the buffer is returned. If the input value passed
+ * through buf_size is too small, NA_OVERFLOW is returned and the buf_size
+ * output is set to the minimum size required.
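+ *
+ * The usual two-pass pattern (a sketch; assumes a valid addr, <stdlib.h>,
+ * and that malloc() succeeds):
+ * \code
+ * na_size_t buf_size = 0;
+ * NA_Addr_to_string(na_class, NULL, &buf_size, addr); // query required size
+ * char *buf = (char *) malloc(buf_size);
+ * NA_Addr_to_string(na_class, buf, &buf_size, addr);  // convert
+ * \endcode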
+ * + * \param na_class [IN/OUT] pointer to NA class + * \param buf [IN/OUT] pointer to destination buffer + * \param buf_size [IN/OUT] pointer to buffer size + * \param addr [IN] abstract address + * + * \return NA_SUCCESS or corresponding NA error code + */ +NA_PUBLIC na_return_t NA_Addr_to_string(na_class_t *na_class, char *buf, na_size_t *buf_size, na_addr_t addr); + +/** + * Get size required to serialize address. + * + * \param na_class [IN/OUT] pointer to NA class + * \param addr [IN] abstract address + * + * \return Non-negative value + */ +static NA_INLINE na_size_t NA_Addr_get_serialize_size(na_class_t *na_class, + na_addr_t addr) NA_WARN_UNUSED_RESULT; + +/** + * Serialize address into a buffer. + * + * \param na_class [IN/OUT] pointer to NA class + * \param buf [IN/OUT] pointer to buffer used for serialization + * \param buf_size [IN] buffer size + * \param addr [IN] abstract address + * + * \return NA_SUCCESS or corresponding NA error code + */ +NA_PUBLIC na_return_t NA_Addr_serialize(na_class_t *na_class, void *buf, na_size_t buf_size, na_addr_t addr); + +/** + * Deserialize address from a buffer. The returned address must be freed with + * NA_Addr_free(). + * + * \param na_class [IN/OUT] pointer to NA class + * \param addr [OUT] pointer to abstract address + * \param buf [IN] pointer to buffer used for deserialization + * \param buf_size [IN] buffer size + * + * \return NA_SUCCESS or corresponding NA error code + */ +NA_PUBLIC na_return_t NA_Addr_deserialize(na_class_t *na_class, na_addr_t *addr, const void *buf, + na_size_t buf_size); + +/** + * Get the maximum size of messages supported by unexpected send/recv. + * Small message size. + * + * \param na_class [IN] pointer to NA class + * + * \return Non-negative value + */ +static NA_INLINE na_size_t NA_Msg_get_max_unexpected_size(const na_class_t *na_class) NA_WARN_UNUSED_RESULT; + +/** + * Get the maximum size of messages supported by expected send/recv. + * Small message size that may differ from the unexpected message size. + * + * \param na_class [IN] pointer to NA class + * + * \return Non-negative value + */ +static NA_INLINE na_size_t NA_Msg_get_max_expected_size(const na_class_t *na_class) NA_WARN_UNUSED_RESULT; + +/** + * Get the header size for unexpected messages. Plugins may use that header + * to encode specific information (such as source addr, etc). + * + * \param na_class [IN] pointer to NA class + * + * \return Non-negative value + */ +static NA_INLINE na_size_t NA_Msg_get_unexpected_header_size(const na_class_t *na_class) + NA_WARN_UNUSED_RESULT; + +/** + * Get the header size for expected messages. Plugins may use that header + * to encode specific information. + * + * \param na_class [IN] pointer to NA class + * + * \return Non-negative value + */ +static NA_INLINE na_size_t NA_Msg_get_expected_header_size(const na_class_t *na_class) NA_WARN_UNUSED_RESULT; + +/** + * Get the maximum tag value that can be used by send/recv (both expected and + * unexpected). + * + * \param na_class [IN] pointer to NA class + * + * \return Non-negative value + */ +static NA_INLINE na_tag_t NA_Msg_get_max_tag(const na_class_t *na_class) NA_WARN_UNUSED_RESULT; + +/** + * Allocate buf_size bytes and return a pointer to the allocated memory. + * If size is 0, NA_Msg_buf_alloc() returns NULL. The plugin_data output + * parameter can be used by the underlying plugin implementation to store + * internal memory information. 
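+ *
+ * A typical unexpected-send setup built on this call (a sketch; send_cb,
+ * dest_addr, tag and op_id are assumed to exist and error checking is
+ * elided):
+ * \code
+ * void     *plugin_data = NULL;
+ * na_size_t buf_size    = NA_Msg_get_max_unexpected_size(na_class);
+ * void     *buf         = NA_Msg_buf_alloc(na_class, buf_size, &plugin_data);
+ * NA_Msg_init_unexpected(na_class, buf, buf_size);
+ * NA_Msg_send_unexpected(na_class, context, send_cb, NULL, buf, buf_size,
+ *     plugin_data, dest_addr, 0, tag, op_id);
+ * \endcode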
+ * + * \param na_class [IN/OUT] pointer to NA class + * \param buf_size [IN] buffer size + * \param plugin_data [OUT] pointer to internal plugin data + * + * \return Pointer to allocated memory or NULL in case of failure + */ +NA_PUBLIC void *NA_Msg_buf_alloc(na_class_t *na_class, na_size_t buf_size, + void **plugin_data) NA_WARN_UNUSED_RESULT; + +/** + * The NA_Msg_buf_free() function releases the memory space pointed to by buf, + * which must have been returned by a previous call to NA_Msg_buf_alloc(). + * If buf is NULL, no operation is performed. + * + * \param na_class [IN/OUT] pointer to NA class + * \param buf [IN] pointer to buffer + * \param plugin_data [IN] pointer to internal plugin data + * + * \return NA_SUCCESS or corresponding NA error code + */ +NA_PUBLIC na_return_t NA_Msg_buf_free(na_class_t *na_class, void *buf, void *plugin_data); + +/** + * Initialize a buffer so that it can be safely passed to the + * NA_Msg_send_unexpected() call. In the case the underlying plugin adds its + * own header to that buffer, the header will be written at this time and the + * usable buffer payload will be buf + NA_Msg_get_unexpected_header_size(). + * + * \param na_class [IN/OUT] pointer to NA class + * \param buf [IN] pointer to buffer + * \param buf_size [IN] buffer size + * + * \return NA_SUCCESS or corresponding NA error code + */ +NA_PUBLIC na_return_t NA_Msg_init_unexpected(na_class_t *na_class, void *buf, na_size_t buf_size); + +/** + * Send an unexpected message to dest_addr. Unexpected sends do not require a + * matching receive to complete. After completion, the user callback is + * placed into the context completion queue and can be triggered using + * NA_Trigger(). + * The plugin_data parameter returned from the NA_Msg_buf_alloc() call must + * be passed along with the buffer, it allows plugins to store and retrieve + * additional buffer information such as memory descriptors. + * \remark Note also that unexpected messages do not require an unexpected + * receive to be posted at the destination before sending the message and the + * destination is allowed to drop the message without notification. However, + * in general, NA plugins are encouraged to remain reliable to avoid unnecessary + * timeouts and cancellations. + * + * Users must manually create an operation ID through NA_Op_create() and pass + * it through op_id for future use and prevent multiple ID creation. + * + * \param na_class [IN/OUT] pointer to NA class + * \param context [IN/OUT] pointer to context of execution + * \param callback [IN] pointer to function callback + * \param arg [IN] pointer to data passed to callback + * \param buf [IN] pointer to send buffer + * \param buf_size [IN] buffer size + * \param plugin_data [IN] pointer to internal plugin data + * \param dest_addr [IN] abstract address of destination + * \param dest_id [IN] destination context ID + * \param tag [IN] tag attached to message + * \param op_id [IN/OUT] pointer to operation ID + * + * \return NA_SUCCESS or corresponding NA error code + */ +static NA_INLINE na_return_t NA_Msg_send_unexpected(na_class_t *na_class, na_context_t *context, + na_cb_t callback, void *arg, const void *buf, + na_size_t buf_size, void *plugin_data, + na_addr_t dest_addr, na_uint8_t dest_id, na_tag_t tag, + na_op_id_t *op_id); + +/** + * Receive an unexpected message. Unexpected receives may wait on any tag and + * any source depending on the implementation. 
After completion, the user + * callback parameter is placed into the context completion queue and can be + * triggered using NA_Trigger(). + * The plugin_data parameter returned from the NA_Msg_buf_alloc() call must + * be passed along with the buffer, it allows plugins to store and retrieve + * additional buffer information such as memory descriptors. + * + * Users must manually create an operation ID through NA_Op_create() and pass + * it through op_id for future use and prevent multiple ID creation. + * + * \param na_class [IN/OUT] pointer to NA class + * \param context [IN/OUT] pointer to context of execution + * \param callback [IN] pointer to function callback + * \param arg [IN] pointer to data passed to callback + * \param buf [IN] pointer to send buffer + * \param buf_size [IN] buffer size + * \param plugin_data [IN] pointer to internal plugin data + * \param op_id [IN/OUT] pointer to operation ID + * + * \return NA_SUCCESS or corresponding NA error code + */ +static NA_INLINE na_return_t NA_Msg_recv_unexpected(na_class_t *na_class, na_context_t *context, + na_cb_t callback, void *arg, void *buf, + na_size_t buf_size, void *plugin_data, na_op_id_t *op_id); + +/** + * Initialize a buffer so that it can be safely passed to the + * NA_Msg_send_expected() call. In the case the underlying plugin adds its + * own header to that buffer, the header will be written at this time and the + * usable buffer payload will be buf + NA_Msg_get_expected_header_size(). + * + * \param na_class [IN/OUT] pointer to NA class + * \param buf [IN] pointer to buffer + * \param buf_size [IN] buffer size + * + * \return NA_SUCCESS or corresponding NA error code + */ +NA_PUBLIC na_return_t NA_Msg_init_expected(na_class_t *na_class, void *buf, na_size_t buf_size); + +/** + * Send an expected message to dest_addr. After completion, the user callback is + * placed into the context completion queue and can be triggered using + * NA_Trigger(). + * The plugin_data parameter returned from the NA_Msg_buf_alloc() call must + * be passed along with the buffer, it allows plugins to store and retrieve + * additional buffer information such as memory descriptors. + * \remark Note that expected messages require an expected receive to be posted + * at the destination before sending the message, otherwise the destination is + * allowed to drop the message without notification. + * + * Users must manually create an operation ID through NA_Op_create() and pass + * it through op_id for future use and prevent multiple ID creation. + * + * \param na_class [IN/OUT] pointer to NA class + * \param context [IN/OUT] pointer to context of execution + * \param callback [IN] pointer to function callback + * \param arg [IN] pointer to data passed to callback + * \param buf [IN] pointer to send buffer + * \param buf_size [IN] buffer size + * \param plugin_data [IN] pointer to internal plugin data + * \param dest_addr [IN] abstract address of destination + * \param dest_id [IN] destination context ID + * \param tag [IN] tag attached to message + * \param op_id [IN/OUT] pointer to operation ID + * + * \return NA_SUCCESS or corresponding NA error code + */ +static NA_INLINE na_return_t NA_Msg_send_expected(na_class_t *na_class, na_context_t *context, + na_cb_t callback, void *arg, const void *buf, + na_size_t buf_size, void *plugin_data, na_addr_t dest_addr, + na_uint8_t dest_id, na_tag_t tag, na_op_id_t *op_id); + +/** + * Receive an expected message from source_addr. 
After completion, the user + * callback is placed into the context completion queue and can be triggered + * using NA_Trigger(). + * The plugin_data parameter returned from the NA_Msg_buf_alloc() call must + * be passed along with the buffer, it allows plugins to store and retrieve + * additional buffer information such as memory descriptors. + * + * Users must manually create an operation ID through NA_Op_create() and pass + * it through op_id for future use and prevent multiple ID creation. + * + * \param na_class [IN/OUT] pointer to NA class + * \param context [IN/OUT] pointer to context of execution + * \param callback [IN] pointer to function callback + * \param arg [IN] pointer to data passed to callback + * \param buf [IN] pointer to receive buffer + * \param buf_size [IN] buffer size + * \param plugin_data [IN] pointer to internal plugin data + * \param source_addr [IN] abstract address of source + * \param source_id [IN] source context ID + * \param tag [IN] matching tag used to receive message + * \param op_id [IN/OUT] pointer to operation ID + * + * \return NA_SUCCESS or corresponding NA error code + */ +static NA_INLINE na_return_t NA_Msg_recv_expected(na_class_t *na_class, na_context_t *context, + na_cb_t callback, void *arg, void *buf, na_size_t buf_size, + void *plugin_data, na_addr_t source_addr, + na_uint8_t source_id, na_tag_t tag, na_op_id_t *op_id); + +/** + * Create memory handle for RMA operations. + * For non-contiguous memory, use NA_Mem_handle_create_segments() instead. + * + * \remark Note to plugin developers: NA_Mem_handle_create() may be called + * multiple times on the same memory region. + * + * \param na_class [IN/OUT] pointer to NA class + * \param buf [IN] pointer to buffer that needs to be registered + * \param buf_size [IN] buffer size + * \param flags [IN] permission flag: + * - NA_MEM_READWRITE + * - NA_MEM_READ_ONLY + * \param mem_handle [OUT] pointer to returned abstract memory handle + * + * \return NA_SUCCESS or corresponding NA error code + */ +NA_PUBLIC na_return_t NA_Mem_handle_create(na_class_t *na_class, void *buf, na_size_t buf_size, + unsigned long flags, na_mem_handle_t *mem_handle); + +/** + * Create memory handle for RMA operations. + * Create_segments can be used to register scatter-gather lists and get a single + * memory handle. + * \remark Implemented only if the network transport or hardware supports it. + * + * \param na_class [IN/OUT] pointer to NA class + * \param segments [IN] pointer to array of segments composed of: + * - address of the segment that needs to be + * registered + * - size of the segment in bytes + * \param segment_count [IN] segment count + * \param flags [IN] permission flag: + * - NA_MEM_READWRITE + * - NA_MEM_READ_ONLY + * \param mem_handle [OUT] pointer to returned abstract memory handle + * + * \return NA_SUCCESS or corresponding NA error code + */ +NA_PUBLIC na_return_t NA_Mem_handle_create_segments(na_class_t *na_class, struct na_segment *segments, + na_size_t segment_count, unsigned long flags, + na_mem_handle_t *mem_handle); + +/** + * Free memory handle. + * + * \param na_class [IN/OUT] pointer to NA class + * \param mem_handle [IN] abstract memory handle + * + * \return NA_SUCCESS or corresponding NA error code + */ +NA_PUBLIC na_return_t NA_Mem_handle_free(na_class_t *na_class, na_mem_handle_t mem_handle); + +/** + * Get the maximum segment count that can be passed to + * NA_Mem_handle_create_segments(). 
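+ *
+ * For example (a sketch; segments, segment_count and mem_handle are assumed
+ * to exist):
+ * \code
+ * if (segment_count <= NA_Mem_handle_get_max_segments(na_class))
+ *     NA_Mem_handle_create_segments(na_class, segments, segment_count,
+ *         NA_MEM_READ_ONLY, &mem_handle);
+ * \endcode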
+ * + * \param na_class [IN] pointer to NA class + * + * \return Non-negative value + */ +static NA_INLINE na_size_t NA_Mem_handle_get_max_segments(const na_class_t *na_class) NA_WARN_UNUSED_RESULT; + +/** + * Register memory for RMA operations. + * Memory pieces must be registered before one-sided transfers can be + * initiated. + * + * \param na_class [IN/OUT] pointer to NA class + * \param mem_handle [IN] pointer to abstract memory handle + * + * \return NA_SUCCESS or corresponding NA error code + */ +NA_PUBLIC na_return_t NA_Mem_register(na_class_t *na_class, na_mem_handle_t mem_handle); + +/** + * Unregister memory. + * + * \param na_class [IN/OUT] pointer to NA class + * \param mem_handle [IN] abstract memory handle + * + * \return NA_SUCCESS or corresponding NA error code + */ +NA_PUBLIC na_return_t NA_Mem_deregister(na_class_t *na_class, na_mem_handle_t mem_handle); + +/** + * Get size required to serialize handle. + * + * \param na_class [IN/OUT] pointer to NA class + * \param mem_handle [IN] abstract memory handle + * + * \return Non-negative value + */ +static NA_INLINE na_size_t NA_Mem_handle_get_serialize_size(na_class_t * na_class, + na_mem_handle_t mem_handle) NA_WARN_UNUSED_RESULT; + +/** + * Serialize memory handle into a buffer. + * One-sided transfers require prior exchange of memory handles between + * peers, serialization callbacks can be used to "pack" a memory handle and + * send it across the network. + * \remark Memory handles can be variable size, therefore the space required + * to serialize a handle into a buffer can be obtained using + * NA_Mem_handle_get_serialize_size(). + * + * \param na_class [IN/OUT] pointer to NA class + * \param buf [IN/OUT] pointer to buffer used for serialization + * \param buf_size [IN] buffer size + * \param mem_handle [IN] abstract memory handle + * + * \return NA_SUCCESS or corresponding NA error code + */ +NA_PUBLIC na_return_t NA_Mem_handle_serialize(na_class_t *na_class, void *buf, na_size_t buf_size, + na_mem_handle_t mem_handle); + +/** + * Deserialize memory handle from buffer. + * + * \param na_class [IN/OUT] pointer to NA class + * \param mem_handle [OUT] pointer to abstract memory handle + * \param buf [IN] pointer to buffer used for deserialization + * \param buf_size [IN] buffer size + * + * \return NA_SUCCESS or corresponding NA error code + */ +NA_PUBLIC na_return_t NA_Mem_handle_deserialize(na_class_t *na_class, na_mem_handle_t *mem_handle, + const void *buf, na_size_t buf_size); + +/** + * Put data to remote address. + * Initiate a put to the registered memory regions with the given offset/size. + * After completion, the user callback is placed into a completion queue and + * can be triggered using NA_Trigger(). + * \remark Memory must be registered and handles exchanged between peers. + * + * Users must manually create an operation ID through NA_Op_create() and pass + * it through op_id for future use and prevent multiple ID creation. 
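+ *
+ * A typical register-and-put sequence (a sketch; remote_mh, remote_addr,
+ * put_cb and op_id are assumed to have been obtained already, e.g. the
+ * remote handle via NA_Mem_handle_deserialize(), and error checking is
+ * elided):
+ * \code
+ * NA_Mem_handle_create(na_class, buf, buf_size, NA_MEM_READWRITE, &local_mh);
+ * NA_Mem_register(na_class, local_mh);
+ * NA_Put(na_class, context, put_cb, NULL, local_mh, 0, remote_mh, 0,
+ *     buf_size, remote_addr, 0, op_id);
+ * \endcode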
+ * + * \param na_class [IN/OUT] pointer to NA class + * \param context [IN/OUT] pointer to context of execution + * \param callback [IN] pointer to function callback + * \param arg [IN] pointer to data passed to callback + * \param local_mem_handle [IN] abstract local memory handle + * \param local_offset [IN] local offset + * \param remote_mem_handle [IN] abstract remote memory handle + * \param remote_offset [IN] remote offset + * \param data_size [IN] size of data that needs to be transferred + * \param remote_addr [IN] abstract address of remote destination + * \param remote_id [IN] target ID of remote destination + * \param op_id [IN/OUT] pointer to operation ID + * + * \return NA_SUCCESS or corresponding NA error code + */ +static NA_INLINE na_return_t NA_Put(na_class_t *na_class, na_context_t *context, na_cb_t callback, void *arg, + na_mem_handle_t local_mem_handle, na_offset_t local_offset, + na_mem_handle_t remote_mem_handle, na_offset_t remote_offset, + na_size_t data_size, na_addr_t remote_addr, na_uint8_t remote_id, + na_op_id_t *op_id); + +/** + * Get data from remote address. + * Initiate a get to the registered memory regions with the given offset/size. + * After completion, the user callback is placed into a completion queue and + * can be triggered using NA_Trigger(). + * + * Users must manually create an operation ID through NA_Op_create() and pass + * it through op_id for future use and prevent multiple ID creation. + * + * \param na_class [IN/OUT] pointer to NA class + * \param context [IN/OUT] pointer to context of execution + * \param callback [IN] pointer to function callback + * \param arg [IN] pointer to data passed to callback + * \param local_mem_handle [IN] abstract local memory handle + * \param local_offset [IN] local offset + * \param remote_mem_handle [IN] abstract remote memory handle + * \param remote_offset [IN] remote offset + * \param data_size [IN] size of data that needs to be transferred + * \param remote_addr [IN] abstract address of remote source + * \param remote_id [IN] target ID of remote source + * \param op_id [IN/OUT] pointer to operation ID + * + * \return NA_SUCCESS or corresponding NA error code + */ +static NA_INLINE na_return_t NA_Get(na_class_t *na_class, na_context_t *context, na_cb_t callback, void *arg, + na_mem_handle_t local_mem_handle, na_offset_t local_offset, + na_mem_handle_t remote_mem_handle, na_offset_t remote_offset, + na_size_t data_size, na_addr_t remote_addr, na_uint8_t remote_id, + na_op_id_t *op_id); + +/** + * Retrieve file descriptor from NA plugin when supported. The descriptor + * can be used by upper layers for manual polling through the usual + * OS select/poll/epoll calls. + * + * \param na_class [IN/OUT] pointer to NA class + * \param context [IN/OUT] pointer to context of execution + * + * \return Non-negative integer if supported, 0 if not implemented and negative + * in case of error. + */ +static NA_INLINE int NA_Poll_get_fd(na_class_t *na_class, na_context_t *context) NA_WARN_UNUSED_RESULT; + +/** + * Used to signal when it is safe to block on the class/context poll descriptor + * or if there is already work that can be progressed. 
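+ *
+ * A typical progress loop built on these calls (a sketch; done is an
+ * application flag, NULL is passed because per-callback return values are
+ * not collected here, and return values would normally be checked):
+ * \code
+ * while (!done) {
+ *     unsigned int actual_count = 0;
+ *     NA_Progress(na_class, context, 100);
+ *     NA_Trigger(context, 0, 1, NULL, &actual_count);
+ * }
+ * \endcode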
+ * + * \param na_class [IN/OUT] pointer to NA class + * \param context [IN/OUT] pointer to context of execution + * + * \return NA_TRUE if it is safe to block or NA_FALSE otherwise + */ +NA_PUBLIC na_bool_t NA_Poll_try_wait(na_class_t *na_class, na_context_t *context); + +/** + * Try to progress communication for at most timeout until timeout is reached or + * any completion has occurred. + * Progress should not be considered as wait, in the sense that it cannot be + * assumed that completion of a specific operation will occur only when + * progress is called. + * + * \param na_class [IN/OUT] pointer to NA class + * \param context [IN/OUT] pointer to context of execution + * \param timeout [IN] timeout (in milliseconds) + * + * \return NA_SUCCESS if any completion has occurred / NA error code otherwise + */ +NA_PUBLIC na_return_t NA_Progress(na_class_t *na_class, na_context_t *context, unsigned int timeout); + +/** + * Execute at most max_count callbacks. If timeout is non-zero, wait up to + * timeout before returning. Function can return when at least one or more + * callbacks are triggered (at most max_count). + * + * \param context [IN/OUT] pointer to context of execution + * \param timeout [IN] timeout (in milliseconds) + * \param max_count [IN] maximum number of callbacks triggered + * \param callback_ret [IN/OUT] array of callback return values + * \param actual_count [OUT] actual number of callbacks triggered + * + * \return NA_SUCCESS or corresponding NA error code + */ +NA_PUBLIC na_return_t NA_Trigger(na_context_t *context, unsigned int timeout, unsigned int max_count, + int callback_ret[], unsigned int *actual_count); + +/** + * Cancel an ongoing operation. + * + * \param na_class [IN/OUT] pointer to NA class + * \param context [IN/OUT] pointer to context of execution + * \param op_id [IN] pointer to operation ID + * + * \return NA_SUCCESS or corresponding NA error code + */ +NA_PUBLIC na_return_t NA_Cancel(na_class_t *na_class, na_context_t *context, na_op_id_t *op_id); + +/** + * Convert error return code to string (null terminated). 
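+ *
+ * For example (a sketch; assumes <stdio.h>):
+ * \code
+ * na_return_t ret = NA_Progress(na_class, context, 100);
+ * if (ret != NA_SUCCESS && ret != NA_TIMEOUT)
+ *     fprintf(stderr, "NA_Progress() failed: %s\n", NA_Error_to_string(ret));
+ * \endcode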
+ * + * \param errnum [IN] error return code + * + * \return String + */ +NA_PUBLIC const char *NA_Error_to_string(na_return_t errnum) NA_WARN_UNUSED_RESULT; + +/************************************/ +/* Local Type and Struct Definition */ +/************************************/ + +/* NA info definition */ +struct na_info { + char *class_name; /* Class name (e.g., bmi) */ + char *protocol_name; /* Protocol (e.g., tcp, ib) */ + char *host_name; /* Host (may be NULL in anonymous mode) */ + /* Additional init info (NULL if no info) */ + const struct na_init_info *na_init_info; +}; + +/* NA class definition */ +struct na_class { + const struct na_class_ops *ops; /* Class operations */ + void * plugin_class; /* Plugin private class */ + char * protocol_name; /* Name of protocol */ + na_uint32_t progress_mode; /* NA progress mode */ + na_bool_t listen; /* Listen for connections */ +}; + +/* NA context definition */ +struct na_context { + void *plugin_context; /* Plugin private context */ +}; + +/* NA plugin callbacks */ +struct na_class_ops { + const char *class_name; + na_bool_t (*check_protocol)(const char *protocol_name); + na_return_t (*initialize)(na_class_t *na_class, const struct na_info *na_info, na_bool_t listen); + na_return_t (*finalize)(na_class_t *na_class); + void (*cleanup)(void); + na_return_t (*context_create)(na_class_t *na_class, void **plugin_context, na_uint8_t id); + na_return_t (*context_destroy)(na_class_t *na_class, void *plugin_context); + na_op_id_t *(*op_create)(na_class_t *na_class); + na_return_t (*op_destroy)(na_class_t *na_class, na_op_id_t *op_id); + na_return_t (*addr_lookup)(na_class_t *na_class, const char *name, na_addr_t *addr); + na_return_t (*addr_free)(na_class_t *na_class, na_addr_t addr); + na_return_t (*addr_set_remove)(na_class_t *na_class, na_addr_t addr); + na_return_t (*addr_self)(na_class_t *na_class, na_addr_t *addr); + na_return_t (*addr_dup)(na_class_t *na_class, na_addr_t addr, na_addr_t *new_addr); + na_bool_t (*addr_cmp)(na_class_t *na_class, na_addr_t addr1, na_addr_t addr2); + na_bool_t (*addr_is_self)(na_class_t *na_class, na_addr_t addr); + na_return_t (*addr_to_string)(na_class_t *na_class, char *buf, na_size_t *buf_size, na_addr_t addr); + na_size_t (*addr_get_serialize_size)(na_class_t *na_class, na_addr_t addr); + na_return_t (*addr_serialize)(na_class_t *na_class, void *buf, na_size_t buf_size, na_addr_t addr); + na_return_t (*addr_deserialize)(na_class_t *na_class, na_addr_t *addr, const void *buf, + na_size_t buf_size); + na_size_t (*msg_get_max_unexpected_size)(const na_class_t *na_class); + na_size_t (*msg_get_max_expected_size)(const na_class_t *na_class); + na_size_t (*msg_get_unexpected_header_size)(const na_class_t *na_class); + na_size_t (*msg_get_expected_header_size)(const na_class_t *na_class); + na_tag_t (*msg_get_max_tag)(const na_class_t *na_class); + void *(*msg_buf_alloc)(na_class_t *na_class, na_size_t buf_size, void **plugin_data); + na_return_t (*msg_buf_free)(na_class_t *na_class, void *buf, void *plugin_data); + na_return_t (*msg_init_unexpected)(na_class_t *na_class, void *buf, na_size_t buf_size); + na_return_t (*msg_send_unexpected)(na_class_t *na_class, na_context_t *context, na_cb_t callback, + void *arg, const void *buf, na_size_t buf_size, void *plugin_data, + na_addr_t dest_addr, na_uint8_t dest_id, na_tag_t tag, + na_op_id_t *op_id); + na_return_t (*msg_recv_unexpected)(na_class_t *na_class, na_context_t *context, na_cb_t callback, + void *arg, void *buf, na_size_t buf_size, void *plugin_data, + 
na_op_id_t *op_id); + na_return_t (*msg_init_expected)(na_class_t *na_class, void *buf, na_size_t buf_size); + na_return_t (*msg_send_expected)(na_class_t *na_class, na_context_t *context, na_cb_t callback, void *arg, + const void *buf, na_size_t buf_size, void *plugin_data, + na_addr_t dest_addr, na_uint8_t dest_id, na_tag_t tag, + na_op_id_t *op_id); + na_return_t (*msg_recv_expected)(na_class_t *na_class, na_context_t *context, na_cb_t callback, void *arg, + void *buf, na_size_t buf_size, void *plugin_data, na_addr_t source_addr, + na_uint8_t source_id, na_tag_t tag, na_op_id_t *op_id); + na_return_t (*mem_handle_create)(na_class_t *na_class, void *buf, na_size_t buf_size, unsigned long flags, + na_mem_handle_t *mem_handle); + na_return_t (*mem_handle_create_segments)(na_class_t *na_class, struct na_segment *segments, + na_size_t segment_count, unsigned long flags, + na_mem_handle_t *mem_handle); + na_return_t (*mem_handle_free)(na_class_t *na_class, na_mem_handle_t mem_handle); + na_size_t (*mem_handle_get_max_segments)(const na_class_t *na_class); + na_return_t (*mem_register)(na_class_t *na_class, na_mem_handle_t mem_handle); + na_return_t (*mem_deregister)(na_class_t *na_class, na_mem_handle_t mem_handle); + na_size_t (*mem_handle_get_serialize_size)(na_class_t *na_class, na_mem_handle_t mem_handle); + na_return_t (*mem_handle_serialize)(na_class_t *na_class, void *buf, na_size_t buf_size, + na_mem_handle_t mem_handle); + na_return_t (*mem_handle_deserialize)(na_class_t *na_class, na_mem_handle_t *mem_handle, const void *buf, + na_size_t buf_size); + na_return_t (*put)(na_class_t *na_class, na_context_t *context, na_cb_t callback, void *arg, + na_mem_handle_t local_mem_handle, na_offset_t local_offset, + na_mem_handle_t remote_mem_handle, na_offset_t remote_offset, na_size_t length, + na_addr_t remote_addr, na_uint8_t remote_id, na_op_id_t *op_id); + na_return_t (*get)(na_class_t *na_class, na_context_t *context, na_cb_t callback, void *arg, + na_mem_handle_t local_mem_handle, na_offset_t local_offset, + na_mem_handle_t remote_mem_handle, na_offset_t remote_offset, na_size_t length, + na_addr_t remote_addr, na_uint8_t remote_id, na_op_id_t *op_id); + int (*na_poll_get_fd)(na_class_t *na_class, na_context_t *context); + na_bool_t (*na_poll_try_wait)(na_class_t *na_class, na_context_t *context); + na_return_t (*progress)(na_class_t *na_class, na_context_t *context, unsigned int timeout); + na_return_t (*cancel)(na_class_t *na_class, na_context_t *context, na_op_id_t *op_id); +}; + +/*---------------------------------------------------------------------------*/ +static NA_INLINE const char * +NA_Get_class_name(const na_class_t *na_class) +{ + return na_class->ops->class_name; +} + +/*---------------------------------------------------------------------------*/ +static NA_INLINE const char * +NA_Get_class_protocol(const na_class_t *na_class) +{ + return na_class->protocol_name; +} + +/*---------------------------------------------------------------------------*/ +static NA_INLINE na_bool_t +NA_Is_listening(const na_class_t *na_class) +{ + return na_class->listen; +} + +/*---------------------------------------------------------------------------*/ +static NA_INLINE na_bool_t +NA_Addr_is_self(na_class_t *na_class, na_addr_t addr) +{ + return na_class->ops->addr_is_self(na_class, addr); +} + +/*---------------------------------------------------------------------------*/ +static NA_INLINE na_size_t +NA_Addr_get_serialize_size(na_class_t *na_class, na_addr_t addr) +{ + return 
(na_class->ops->addr_get_serialize_size) ? na_class->ops->addr_get_serialize_size(na_class, addr) + : 0; +} + +/*---------------------------------------------------------------------------*/ +static NA_INLINE na_size_t +NA_Msg_get_max_unexpected_size(const na_class_t *na_class) +{ + return na_class->ops->msg_get_max_unexpected_size(na_class); +} + +/*---------------------------------------------------------------------------*/ +static NA_INLINE na_size_t +NA_Msg_get_max_expected_size(const na_class_t *na_class) +{ + return na_class->ops->msg_get_max_expected_size(na_class); +} + +/*---------------------------------------------------------------------------*/ +static NA_INLINE na_size_t +NA_Msg_get_unexpected_header_size(const na_class_t *na_class) +{ + return (na_class->ops->msg_get_unexpected_header_size) + ? na_class->ops->msg_get_unexpected_header_size(na_class) + : 0; +} + +/*---------------------------------------------------------------------------*/ +static NA_INLINE na_size_t +NA_Msg_get_expected_header_size(const na_class_t *na_class) +{ + return (na_class->ops->msg_get_expected_header_size) + ? na_class->ops->msg_get_expected_header_size(na_class) + : 0; +} + +/*---------------------------------------------------------------------------*/ +static NA_INLINE na_tag_t +NA_Msg_get_max_tag(const na_class_t *na_class) +{ + return na_class->ops->msg_get_max_tag(na_class); +} + +/*---------------------------------------------------------------------------*/ +static NA_INLINE na_return_t +NA_Msg_send_unexpected(na_class_t *na_class, na_context_t *context, na_cb_t callback, void *arg, + const void *buf, na_size_t buf_size, void *plugin_data, na_addr_t dest_addr, + na_uint8_t dest_id, na_tag_t tag, na_op_id_t *op_id) +{ + return na_class->ops->msg_send_unexpected(na_class, context, callback, arg, buf, buf_size, plugin_data, + dest_addr, dest_id, tag, op_id); +} + +/*---------------------------------------------------------------------------*/ +static NA_INLINE na_return_t +NA_Msg_recv_unexpected(na_class_t *na_class, na_context_t *context, na_cb_t callback, void *arg, void *buf, + na_size_t buf_size, void *plugin_data, na_op_id_t *op_id) +{ + return na_class->ops->msg_recv_unexpected(na_class, context, callback, arg, buf, buf_size, plugin_data, + op_id); +} + +/*---------------------------------------------------------------------------*/ +static NA_INLINE na_return_t +NA_Msg_send_expected(na_class_t *na_class, na_context_t *context, na_cb_t callback, void *arg, + const void *buf, na_size_t buf_size, void *plugin_data, na_addr_t dest_addr, + na_uint8_t dest_id, na_tag_t tag, na_op_id_t *op_id) +{ + return na_class->ops->msg_send_expected(na_class, context, callback, arg, buf, buf_size, plugin_data, + dest_addr, dest_id, tag, op_id); +} + +/*---------------------------------------------------------------------------*/ +static NA_INLINE na_return_t +NA_Msg_recv_expected(na_class_t *na_class, na_context_t *context, na_cb_t callback, void *arg, void *buf, + na_size_t buf_size, void *plugin_data, na_addr_t source_addr, na_uint8_t source_id, + na_tag_t tag, na_op_id_t *op_id) +{ + return na_class->ops->msg_recv_expected(na_class, context, callback, arg, buf, buf_size, plugin_data, + source_addr, source_id, tag, op_id); +} + +/*---------------------------------------------------------------------------*/ +static NA_INLINE na_size_t +NA_Mem_handle_get_max_segments(const na_class_t *na_class) +{ + return (na_class->ops->mem_handle_get_max_segments) ? 
na_class->ops->mem_handle_get_max_segments(na_class) + : 1; +} + +/*---------------------------------------------------------------------------*/ +static NA_INLINE na_size_t +NA_Mem_handle_get_serialize_size(na_class_t *na_class, na_mem_handle_t mem_handle) +{ + return na_class->ops->mem_handle_get_serialize_size(na_class, mem_handle); +} + +/*---------------------------------------------------------------------------*/ +static NA_INLINE na_return_t +NA_Put(na_class_t *na_class, na_context_t *context, na_cb_t callback, void *arg, + na_mem_handle_t local_mem_handle, na_offset_t local_offset, na_mem_handle_t remote_mem_handle, + na_offset_t remote_offset, na_size_t data_size, na_addr_t remote_addr, na_uint8_t remote_id, + na_op_id_t *op_id) +{ + return na_class->ops->put(na_class, context, callback, arg, local_mem_handle, local_offset, + remote_mem_handle, remote_offset, data_size, remote_addr, remote_id, op_id); +} + +/*---------------------------------------------------------------------------*/ +static NA_INLINE na_return_t +NA_Get(na_class_t *na_class, na_context_t *context, na_cb_t callback, void *arg, + na_mem_handle_t local_mem_handle, na_offset_t local_offset, na_mem_handle_t remote_mem_handle, + na_offset_t remote_offset, na_size_t data_size, na_addr_t remote_addr, na_uint8_t remote_id, + na_op_id_t *op_id) +{ + return na_class->ops->get(na_class, context, callback, arg, local_mem_handle, local_offset, + remote_mem_handle, remote_offset, data_size, remote_addr, remote_id, op_id); +} + +/*---------------------------------------------------------------------------*/ +static NA_INLINE int +NA_Poll_get_fd(na_class_t *na_class, na_context_t *context) +{ + return (na_class->ops->na_poll_get_fd) ? na_class->ops->na_poll_get_fd(na_class, context) : -1; +} + +#ifdef __cplusplus +} +#endif + +#endif /* NA_H */ diff --git a/src/mercury/include/na_config.h b/src/mercury/include/na_config.h new file mode 100644 index 00000000000..579ba63d2d0 --- /dev/null +++ b/src/mercury/include/na_config.h @@ -0,0 +1,134 @@ +/* + * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy, + * UChicago Argonne, LLC and The HDF Group. + * All rights reserved. + * + * The full copyright notice, including terms governing use, modification, + * and redistribution, is contained in the COPYING file that can be + * found at the root of the source code distribution tree. + */ + +/* Generated file. Only edit na_config.h.in. 
*/
+
+#ifndef NA_CONFIG_H
+#define NA_CONFIG_H
+
+/*************************************/
+/* Public Type and Struct Definition */
+/*************************************/
+
+/* Type definitions */
+#ifdef _WIN32
+typedef signed __int64 na_int64_t;
+typedef signed __int32 na_int32_t;
+typedef signed __int16 na_int16_t;
+typedef signed __int8 na_int8_t;
+typedef unsigned __int64 na_uint64_t;
+typedef unsigned __int32 na_uint32_t;
+typedef unsigned __int16 na_uint16_t;
+typedef unsigned __int8 na_uint8_t;
+#else
+#include <stddef.h>
+#include <stdint.h>
+typedef int64_t na_int64_t;
+typedef int32_t na_int32_t;
+typedef int16_t na_int16_t;
+typedef int8_t na_int8_t;
+typedef uint64_t na_uint64_t;
+typedef uint32_t na_uint32_t;
+typedef uint16_t na_uint16_t;
+typedef uint8_t na_uint8_t;
+#endif
+typedef na_uint8_t na_bool_t;
+typedef na_uint64_t na_ptr_t;
+
+/* True / false */
+#define NA_TRUE 1
+#define NA_FALSE 0
+
+/*****************/
+/* Public Macros */
+/*****************/
+
+/* Visibility of symbols */
+#if defined(_WIN32)
+#define NA_ABI_IMPORT __declspec(dllimport)
+#define NA_ABI_EXPORT __declspec(dllexport)
+#define NA_ABI_HIDDEN
+#elif defined(__GNUC__) && (__GNUC__ >= 4)
+#define NA_ABI_IMPORT __attribute__((visibility("default")))
+#define NA_ABI_EXPORT __attribute__((visibility("default")))
+#define NA_ABI_HIDDEN __attribute__((visibility("hidden")))
+#else
+#define NA_ABI_IMPORT
+#define NA_ABI_EXPORT
+#define NA_ABI_HIDDEN
+#endif
+
+/* Inline macro */
+#ifdef _WIN32
+#define NA_INLINE __inline
+#else
+#define NA_INLINE __inline__
+#endif
+
+/* Unused return values */
+#if defined(__GNUC__)
+#define NA_WARN_UNUSED_RESULT __attribute__((warn_unused_result))
+#else
+#define NA_WARN_UNUSED_RESULT
+#endif
+
+/* Fallthrough macro */
+#if defined(__GNUC__) && (__GNUC__ >= 7)
+#define NA_FALLTHROUGH() __attribute__((fallthrough))
+#else
+#define NA_FALLTHROUGH()
+#endif
+
+/* Shared libraries */
+/* #undef NA_BUILD_SHARED_LIBS */
+#ifdef NA_BUILD_SHARED_LIBS
+#ifdef na_EXPORTS
+#define NA_PUBLIC NA_ABI_EXPORT
+#else
+#define NA_PUBLIC NA_ABI_IMPORT
+#endif
+#define NA_PRIVATE NA_ABI_HIDDEN
+#else
+#define NA_PUBLIC
+#define NA_PRIVATE
+#endif
+
+/* Build Options */
+#define NA_HAS_MULTI_PROGRESS
+/* #undef NA_HAS_DEBUG */
+
+/* BMI */
+/* #undef NA_HAS_BMI */
+
+/* MPI */
+/* #undef NA_HAS_MPI */
+/* #undef NA_MPI_HAS_GNI_SETUP */
+
+/* CCI */
+/* #undef NA_HAS_CCI */
+
+/* OFI */
+/* #undef NA_HAS_OFI */
+/* #undef NA_OFI_HAS_EXT_GNI_H */
+/* #undef NA_OFI_GNI_HAS_UDREG */
+
+/* NA SM */
+#define NA_HAS_SM
+/* #undef NA_SM_HAS_UUID */
+#define NA_SM_HAS_CMA
+#define NA_SM_SHM_PREFIX "na_sm"
+#define NA_SM_TMP_DIRECTORY "/tmp"
+
+/* UCX */
+/* #undef NA_HAS_UCX */
+/* #undef NA_UCX_HAS_LIB_QUERY */
+/* #undef NA_UCX_HAS_THREAD_MODE_NAMES */
+
+#endif /* NA_CONFIG_H */
diff --git a/src/mercury/include/na_sm.h b/src/mercury/include/na_sm.h
new file mode 100644
index 00000000000..3b1cd8d4af7
--- /dev/null
+++ b/src/mercury/include/na_sm.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy,
+ * UChicago Argonne, LLC and The HDF Group.
+ * All rights reserved.
+ *
+ * The full copyright notice, including terms governing use, modification,
+ * and redistribution, is contained in the COPYING file that can be
+ * found at the root of the source code distribution tree.
+ */
+
+#ifndef NA_SM_H
+#define NA_SM_H
+
+#include "na_types.h"
+
+/*************************************/
+/* Public Type and Struct Definition */
+/*************************************/
+
+#ifdef NA_SM_HAS_UUID
+typedef unsigned char na_sm_id_t[16];
+#else
+typedef long na_sm_id_t;
+#endif
+
+/*****************/
+/* Public Macros */
+/*****************/
+
+/* String length of Host ID */
+#ifdef NA_SM_HAS_UUID
+#define NA_SM_HOST_ID_LEN 36
+#else
+#define NA_SM_HOST_ID_LEN 11
+#endif
+
+/*********************/
+/* Public Prototypes */
+/*********************/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Get the current host ID (generate a new one if none exists).
+ *
+ * \param id [IN/OUT] pointer to SM host ID
+ *
+ * \return NA_SUCCESS or corresponding NA error code
+ */
+NA_PUBLIC na_return_t NA_SM_Host_id_get(na_sm_id_t *id);
+
+/**
+ * Convert host ID to string. String size must be NA_SM_HOST_ID_LEN + 1.
+ *
+ * \param id [IN] SM host ID
+ * \param string [IN/OUT] pointer to string
+ *
+ * \return NA_SUCCESS or corresponding NA error code
+ */
+NA_PUBLIC na_return_t NA_SM_Host_id_to_string(na_sm_id_t id, char *string);
+
+/**
+ * Convert string to host ID. String size must be NA_SM_HOST_ID_LEN + 1.
+ *
+ * \param string [IN] pointer to string
+ * \param id [IN/OUT] pointer to SM host ID
+ *
+ * \return NA_SUCCESS or corresponding NA error code
+ */
+NA_PUBLIC na_return_t NA_SM_String_to_host_id(const char *string, na_sm_id_t *id);
+
+/**
+ * Copy src host ID to dst.
+ *
+ * \param dst [IN/OUT] pointer to destination SM host ID
+ * \param src [IN] source SM host ID
+ */
+NA_PUBLIC void NA_SM_Host_id_copy(na_sm_id_t *dst, na_sm_id_t src);
+
+/**
+ * Compare two host IDs.
+ *
+ * \param id1 [IN] SM host ID
+ * \param id2 [IN] SM host ID
+ *
+ * \return NA_TRUE if equal or NA_FALSE otherwise
+ */
+NA_PUBLIC na_bool_t NA_SM_Host_id_cmp(na_sm_id_t id1, na_sm_id_t id2);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* NA_SM_H */
diff --git a/src/mercury/include/na_types.h b/src/mercury/include/na_types.h
new file mode 100644
index 00000000000..0062ebe8894
--- /dev/null
+++ b/src/mercury/include/na_types.h
@@ -0,0 +1,185 @@
+/*
+ * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy,
+ * UChicago Argonne, LLC and The HDF Group.
+ * All rights reserved.
+ *
+ * The full copyright notice, including terms governing use, modification,
+ * and redistribution, is contained in the COPYING file that can be
+ * found at the root of the source code distribution tree.
+ */
+
+#ifndef NA_TYPES_H
+#define NA_TYPES_H
+
+#include "na_config.h"
+
+#include <limits.h>
+
+/*************************************/
+/* Public Type and Struct Definition */
+/*************************************/
+
+typedef struct na_class na_class_t;     /* Opaque NA class */
+typedef struct na_context na_context_t; /* Opaque NA execution context */
+typedef struct na_addr * na_addr_t;     /* Abstract NA address */
+typedef na_uint64_t na_size_t;          /* Size */
+typedef na_uint32_t na_tag_t;           /* Tag */
+typedef struct na_op_id na_op_id_t;     /* Opaque operation id */
+
+typedef struct na_mem_handle *na_mem_handle_t; /* Abstract memory handle */
+typedef na_uint64_t na_offset_t;               /* Offset */
+
+/* Init info */
+struct na_init_info {
+    /* Preferred IP subnet to use. */
+    const char *ip_subnet;
+
+    /* Authorization key that can be used for communication. All processes
+     * should use the same key in order to communicate.
+     * NB. generation of keys is done through third-party libraries.
*/ + const char *auth_key; + + /* Max unexpected size hint that can be passed to control the size of + * unexpected messages. Note that the underlying plugin library may switch + * to different transfer protocols depending on the message size that is + * used. */ + na_size_t max_unexpected_size; + + /* Max expected size hint that can be passed to control the size of + * expected messages. Note that the underlying plugin library may switch + * to different transfer protocols depending on the message size that is + * used. */ + na_size_t max_expected_size; + + /* Progress mode flag. Setting NA_NO_BLOCK will force busy-spin on progress + * and remove any wait/notification calls. */ + na_uint32_t progress_mode; + + /* Maximum number of contexts that are expected to be created. */ + na_uint8_t max_contexts; + + /* Thread mode flags can be used to relax thread-safety when it is not + * needed. When setting NA_THREAD_MODE_SINGLE, only a single thread should + * access both NA classes and contexts at a time. */ + na_uint8_t thread_mode; +}; + +/* Segment */ +struct na_segment { + na_ptr_t base; /* Address of the segment */ + na_size_t len; /* Size of the segment in bytes */ +}; + +/* Return codes: + * Functions return 0 for success or corresponding return code */ +#define NA_RETURN_VALUES \ + X(NA_SUCCESS) /*!< operation succeeded */ \ + X(NA_PERMISSION) /*!< operation not permitted */ \ + X(NA_NOENTRY) /*!< no such file or directory */ \ + X(NA_INTERRUPT) /*!< operation interrupted */ \ + X(NA_AGAIN) /*!< operation must be retried */ \ + X(NA_NOMEM) /*!< out of memory */ \ + X(NA_ACCESS) /*!< permission denied */ \ + X(NA_FAULT) /*!< bad address */ \ + X(NA_BUSY) /*!< device or resource busy */ \ + X(NA_EXIST) /*!< entry already exists */ \ + X(NA_NODEV) /*!< no such device */ \ + X(NA_INVALID_ARG) /*!< invalid argument */ \ + X(NA_PROTOCOL_ERROR) /*!< protocol error */ \ + X(NA_OVERFLOW) /*!< value too large */ \ + X(NA_MSGSIZE) /*!< message size too long */ \ + X(NA_PROTONOSUPPORT) /*!< protocol not supported */ \ + X(NA_OPNOTSUPPORTED) /*!< operation not supported on endpoint */ \ + X(NA_ADDRINUSE) /*!< address already in use */ \ + X(NA_ADDRNOTAVAIL) /*!< cannot assign requested address */ \ + X(NA_HOSTUNREACH) /*!< cannot reach host during operation */ \ + X(NA_TIMEOUT) /*!< operation reached timeout */ \ + X(NA_CANCELED) /*!< operation canceled */ \ + X(NA_RETURN_MAX) + +#define X(a) a, +typedef enum na_return { NA_RETURN_VALUES } na_return_t; +#undef X + +/* Callback operation type */ +#define NA_CB_TYPES \ + X(NA_CB_SEND_UNEXPECTED) /*!< unexpected send callback */ \ + X(NA_CB_RECV_UNEXPECTED) /*!< unexpected recv callback */ \ + X(NA_CB_SEND_EXPECTED) /*!< expected send callback */ \ + X(NA_CB_RECV_EXPECTED) /*!< expected recv callback */ \ + X(NA_CB_PUT) /*!< put callback */ \ + X(NA_CB_GET) /*!< get callback */ \ + X(NA_CB_MAX) + +#define X(a) a, +typedef enum na_cb_type { NA_CB_TYPES } na_cb_type_t; +#undef X + +/* Callback info structs */ +struct na_cb_info_recv_unexpected { + na_size_t actual_buf_size; + na_addr_t source; + na_tag_t tag; +}; + +struct na_cb_info_recv_expected { + na_size_t actual_buf_size; +}; + +/* Callback info struct */ +struct na_cb_info { + union { /* Union of callback info structures */ + struct na_cb_info_recv_unexpected recv_unexpected; + struct na_cb_info_recv_expected recv_expected; + } info; + void * arg; /* User data */ + na_cb_type_t type; /* Callback type */ + na_return_t ret; /* Return value */ +}; + +/* Callback type */ +typedef int (*na_cb_t)(const 
struct na_cb_info *callback_info); + +/*****************/ +/* Public Macros */ +/*****************/ + +/* Constant values */ +#define NA_ADDR_NULL ((na_addr_t)0) +#define NA_MEM_HANDLE_NULL ((na_mem_handle_t)0) + +/* Max timeout */ +#define NA_MAX_IDLE_TIME (3600 * 1000) + +/* Context ID max value + * \remark This is not the user limit but only the limit imposed by the type */ +#define NA_CONTEXT_ID_MAX UINT8_MAX + +/* Tag max value + * \remark This is not the user limit but only the limit imposed by the type */ +#define NA_TAG_MAX UINT_MAX + +/* The memory attributes associated with the memory handle + * can be defined as read only, write only or read/write */ +#define NA_MEM_READ_ONLY 0x01 +#define NA_MEM_WRITE_ONLY 0x02 +#define NA_MEM_READWRITE 0x03 + +/* Progress modes */ +#define NA_NO_BLOCK 0x01 /*!< no blocking progress */ +#define NA_NO_RETRY 0x02 /*!< no retry of operations in progress */ + +/* Thread modes (default is thread-safe) */ +#define NA_THREAD_MODE_SINGLE_CLS (0x01) /*!< only one thread will access class */ +#define NA_THREAD_MODE_SINGLE_CTX (0x02) /*!< only one thread will access context */ +#define NA_THREAD_MODE_SINGLE (NA_THREAD_MODE_SINGLE_CLS | NA_THREAD_MODE_SINGLE_CTX) + +/* NA init info initializer */ +#define NA_INIT_INFO_INITIALIZER \ + (struct na_init_info) \ + { \ + .ip_subnet = NULL, .auth_key = NULL, .max_unexpected_size = 0, .max_expected_size = 0, \ + .progress_mode = 0, .max_contexts = 1, .thread_mode = 0 \ + } + +#endif /* NA_TYPES_H */ diff --git a/src/mercury/src/util/CMake/FindOPA.cmake b/src/mercury/src/util/CMake/FindOPA.cmake new file mode 100644 index 00000000000..c9e1aae51ce --- /dev/null +++ b/src/mercury/src/util/CMake/FindOPA.cmake @@ -0,0 +1,31 @@ +# - Try to find OPA +# Once done this will define +# OPA_FOUND - System has OpenPA +# OPA_INCLUDE_DIRS - The OPA include directories +# OPA_LIBRARIES - The libraries needed to use OPA + +find_package(PkgConfig) +pkg_check_modules(PC_OPA QUIET openpa) +# If openpa.pc cannot be found, try to look for mpich2-c.pc +if(NOT PC_OPA_INCLUDEDIRS) + pkg_check_modules(PC_MPICH2_C QUIET mpich2-c) + set(PC_OPA_INCLUDEDIR ${PC_MPICH2_C_INCLUDEDIR}) + set(PC_OPA_LIBDIR ${PC_MPICH2_C_LIBDIR}) +endif() + +find_path(OPA_INCLUDE_DIR opa_primitives.h + HINTS ${PC_OPA_INCLUDEDIR} ${PC_OPA_INCLUDE_DIRS}) + +find_library(OPA_LIBRARY NAMES opa libopa + HINTS ${PC_OPA_LIBDIR} ${PC_OPA_LIBRARY_DIRS}) + +set(OPA_LIBRARIES ${OPA_LIBRARY}) +set(OPA_INCLUDE_DIRS ${OPA_INCLUDE_DIR}) + +include(FindPackageHandleStandardArgs) +# handle the QUIETLY and REQUIRED arguments and set OPA_FOUND to TRUE +# if all listed variables are TRUE +find_package_handle_standard_args(OPA DEFAULT_MSG + OPA_LIBRARY OPA_INCLUDE_DIR) + +mark_as_advanced(OPA_INCLUDE_DIR OPA_LIBRARY) diff --git a/src/mercury/src/util/CMakeLists.txt b/src/mercury/src/util/CMakeLists.txt new file mode 100644 index 00000000000..cf3621f241d --- /dev/null +++ b/src/mercury/src/util/CMakeLists.txt @@ -0,0 +1,274 @@ +#------------------------------------------------------------------------------ +# Setup cmake module +#------------------------------------------------------------------------------ +set(MERCURY_UTIL_CMAKE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/CMake") +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${MERCURY_UTIL_CMAKE_DIR}) + +#------------------------------------------------------------------------------ +# Include source and build directories +#------------------------------------------------------------------------------ +set(MERCURY_UTIL_BUILD_INCLUDE_DEPENDENCIES + 
${CMAKE_CURRENT_SOURCE_DIR}
+  ${CMAKE_CURRENT_BINARY_DIR}
+)
+
+#------------------------------------------------------------------------------
+# External dependencies
+#------------------------------------------------------------------------------
+include(CheckCSourceCompiles)
+include(CheckIncludeFiles)
+include(CheckSymbolExists)
+include(CheckTypeSize)
+
+# Check for __attribute__((constructor))
+check_c_source_compiles(
+  "
+  static void test_constructor(void) __attribute__((constructor));
+  int main(void) {return 0;}
+  "
+  HG_UTIL_HAS_ATTR_CONSTRUCTOR
+)
+
+# Check for __attribute__((constructor(priority)))
+check_c_source_compiles(
+  "
+  static void test_constructor(void) __attribute__((constructor(101)));
+  int main(void) {return 0;}
+  "
+  HG_UTIL_HAS_ATTR_CONSTRUCTOR_PRIORITY
+)
+
+# Threads
+set(CMAKE_THREAD_PREFER_PTHREAD TRUE)
+set(THREADS_PREFER_PTHREAD_FLAG TRUE)
+find_package(Threads REQUIRED)
+
+set(MERCURY_UTIL_EXT_LIB_DEPENDENCIES
+  ${MERCURY_UTIL_EXT_LIB_DEPENDENCIES}
+  ${CMAKE_THREAD_LIBS_INIT}
+)
+if(CMAKE_USE_PTHREADS_INIT)
+  set(CMAKE_EXTRA_INCLUDE_FILES pthread.h)
+  set(CMAKE_REQUIRED_LIBRARIES ${CMAKE_THREAD_LIBS_INIT})
+
+  # Detect pthread_spinlock_t
+  check_type_size(pthread_spinlock_t HG_UTIL_HAS_PTHREAD_SPINLOCK_T)
+
+  # Use type size to check enum value
+  check_type_size(PTHREAD_MUTEX_ADAPTIVE_NP HG_UTIL_HAS_PTHREAD_MUTEX_ADAPTIVE_NP)
+
+  # Detect pthread_condattr_setclock
+  check_symbol_exists(pthread_condattr_setclock pthread.h
+    HG_UTIL_HAS_PTHREAD_CONDATTR_SETCLOCK)
+
+  unset(CMAKE_EXTRA_INCLUDE_FILES)
+  unset(CMAKE_REQUIRED_LIBRARIES)
+endif()
+
+# Rt
+if(NOT WIN32 AND NOT APPLE)
+  set(MERCURY_UTIL_EXT_LIB_DEPENDENCIES
+    ${MERCURY_UTIL_EXT_LIB_DEPENDENCIES}
+    -lrt
+  )
+endif()
+
+# Detect <time.h>
+check_include_files("time.h" HG_UTIL_HAS_TIME_H)
+if(HG_UTIL_HAS_TIME_H)
+  set(CMAKE_EXTRA_INCLUDE_FILES time.h)
+
+  # Detect clock_gettime
+  check_symbol_exists(clock_gettime time.h HG_UTIL_HAS_CLOCK_GETTIME)
+
+  # Use type size to check enum value
+  check_type_size(CLOCK_MONOTONIC_COARSE HG_UTIL_HAS_CLOCK_MONOTONIC_COARSE)
+
+  unset(CMAKE_EXTRA_INCLUDE_FILES)
+endif()
+
+# Debug
+if(MERCURY_ENABLE_DEBUG)
+  set(HG_UTIL_HAS_DEBUG 1)
+else()
+  set(HG_UTIL_HAS_DEBUG 0)
+endif()
+
+# Detect <sys/time.h>
+check_include_files("sys/time.h" HG_UTIL_HAS_SYSTIME_H)
+
+# Detect <sys/epoll.h>
+check_include_files("sys/epoll.h" HG_UTIL_HAS_SYSEPOLL_H)
+
+# Detect <sys/eventfd.h>
+check_include_files("sys/eventfd.h" HG_UTIL_HAS_SYSEVENTFD_H)
+if(HG_UTIL_HAS_SYSEVENTFD_H)
+  set(CMAKE_EXTRA_INCLUDE_FILES "sys/eventfd.h")
+  check_type_size(eventfd_t HG_UTIL_HAS_EVENTFD_T)
+endif()
+
+# Detect <sys/event.h>
+check_include_files("sys/event.h" HG_UTIL_HAS_SYSEVENT_H)
+
+# Atomics
+if(NOT WIN32)
+  # Detect stdatomic
+  check_include_files("stdatomic.h" HG_UTIL_HAS_STDATOMIC_H)
+  # Detect size of atomic_long
+  set(CMAKE_EXTRA_INCLUDE_FILES stdatomic.h)
+  check_type_size(atomic_long HG_UTIL_ATOMIC_LONG_WIDTH)
+  unset(CMAKE_EXTRA_INCLUDE_FILES)
+  # OpenPA
+  option(MERCURY_USE_OPA "Use OpenPA for atomics." OFF)
+  # Force use of OPA if <stdatomic.h> is not found
+  if(NOT HG_UTIL_HAS_STDATOMIC_H)
+    set(MERCURY_USE_OPA "ON" CACHE BOOL "Use OpenPA for atomics."
FORCE) + endif() + mark_as_advanced(MERCURY_USE_OPA) + if(MERCURY_USE_OPA) + # Use OpenPA if stdatomic is not available + find_package(OPA REQUIRED) + message(STATUS "OPA include directory: ${OPA_INCLUDE_DIRS}") + set(HG_UTIL_HAS_OPA_PRIMITIVES_H 1) + set(MERCURY_UTIL_EXT_INCLUDE_DEPENDENCIES + ${MERCURY_UTIL_EXT_INCLUDE_DEPENDENCIES} + ${OPA_INCLUDE_DIRS} + ) + endif() +endif() + +# Colored output +option(MERCURY_ENABLE_LOG_COLOR "Use colored output for log." OFF) +if(MERCURY_ENABLE_LOG_COLOR) + set(HG_UTIL_HAS_LOG_COLOR 1) +endif() +mark_as_advanced(MERCURY_ENABLE_LOG_COLOR) + +#------------------------------------------------------------------------------ +# Configure module header files +#------------------------------------------------------------------------------ +# Set unique var used in the autogenerated config file (symbol import/export) +if(BUILD_SHARED_LIBS) + set(HG_UTIL_BUILD_SHARED_LIBS 1) + set(MERCURY_UTIL_LIBTYPE SHARED) +else() + set(HG_UTIL_BUILD_SHARED_LIBS 0) + set(MERCURY_UTIL_LIBTYPE STATIC) +endif() + +configure_file( + ${CMAKE_CURRENT_SOURCE_DIR}/mercury_util_config.h.in + ${CMAKE_CURRENT_BINARY_DIR}/mercury_util_config.h +) + +#------------------------------------------------------------------------------ +# Set sources +#------------------------------------------------------------------------------ +set(MERCURY_UTIL_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/mercury_atomic_queue.c + ${CMAKE_CURRENT_SOURCE_DIR}/mercury_dlog.c + ${CMAKE_CURRENT_SOURCE_DIR}/mercury_event.c + ${CMAKE_CURRENT_SOURCE_DIR}/mercury_hash_table.c + ${CMAKE_CURRENT_SOURCE_DIR}/mercury_log.c + ${CMAKE_CURRENT_SOURCE_DIR}/mercury_mem.c + ${CMAKE_CURRENT_SOURCE_DIR}/mercury_mem_pool.c + ${CMAKE_CURRENT_SOURCE_DIR}/mercury_poll.c + ${CMAKE_CURRENT_SOURCE_DIR}/mercury_request.c + ${CMAKE_CURRENT_SOURCE_DIR}/mercury_thread.c + ${CMAKE_CURRENT_SOURCE_DIR}/mercury_thread_condition.c + ${CMAKE_CURRENT_SOURCE_DIR}/mercury_thread_mutex.c + ${CMAKE_CURRENT_SOURCE_DIR}/mercury_thread_pool.c + ${CMAKE_CURRENT_SOURCE_DIR}/mercury_thread_rwlock.c + ${CMAKE_CURRENT_SOURCE_DIR}/mercury_thread_spin.c + ${CMAKE_CURRENT_SOURCE_DIR}/mercury_util.c +) + +#------------------------------------------------------------------------------ +# Specify project public header files to be installed +#------------------------------------------------------------------------------ +set(MERCURY_UTIL_PUBLIC_HEADERS + ${CMAKE_CURRENT_BINARY_DIR}/mercury_util_config.h + ${CMAKE_CURRENT_SOURCE_DIR}/mercury_atomic.h + ${CMAKE_CURRENT_SOURCE_DIR}/mercury_atomic_queue.h + ${CMAKE_CURRENT_SOURCE_DIR}/mercury_dlog.h + ${CMAKE_CURRENT_SOURCE_DIR}/mercury_event.h + ${CMAKE_CURRENT_SOURCE_DIR}/mercury_hash_string.h + ${CMAKE_CURRENT_SOURCE_DIR}/mercury_hash_table.h + ${CMAKE_CURRENT_SOURCE_DIR}/mercury_list.h + ${CMAKE_CURRENT_SOURCE_DIR}/mercury_log.h + ${CMAKE_CURRENT_SOURCE_DIR}/mercury_mem.h + ${CMAKE_CURRENT_SOURCE_DIR}/mercury_mem_pool.h + ${CMAKE_CURRENT_SOURCE_DIR}/mercury_poll.h + ${CMAKE_CURRENT_SOURCE_DIR}/mercury_queue.h + ${CMAKE_CURRENT_SOURCE_DIR}/mercury_request.h + ${CMAKE_CURRENT_SOURCE_DIR}/mercury_thread.h + ${CMAKE_CURRENT_SOURCE_DIR}/mercury_thread_annotation.h + ${CMAKE_CURRENT_SOURCE_DIR}/mercury_thread_condition.h + ${CMAKE_CURRENT_SOURCE_DIR}/mercury_thread_mutex.h + ${CMAKE_CURRENT_SOURCE_DIR}/mercury_thread_pool.h + ${CMAKE_CURRENT_SOURCE_DIR}/mercury_thread_rwlock.h + ${CMAKE_CURRENT_SOURCE_DIR}/mercury_thread_spin.h + ${CMAKE_CURRENT_SOURCE_DIR}/mercury_time.h + ${CMAKE_CURRENT_SOURCE_DIR}/mercury_util.h +) + 
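+
+# Note: mercury_util_config.h is taken from the build directory above because
+# it is generated by the configure_file() call earlier in this file, while all
+# other public headers come from the source tree.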
+
+#------------------------------------------------------------------------------
+# Specify project private header files
+#------------------------------------------------------------------------------
+set(MERCURY_UTIL_PRIVATE_HEADERS
+  ${CMAKE_CURRENT_SOURCE_DIR}/mercury_util_error.h
+)
+
+#----------------------------------------------------------------------------
+# Libraries
+#----------------------------------------------------------------------------
+
+# Clean up system include path first
+foreach(item ${MERCURY_SYSTEM_INCLUDE_PATH})
+  if(MERCURY_UTIL_EXT_INCLUDE_DEPENDENCIES)
+    list(REMOVE_ITEM MERCURY_UTIL_EXT_INCLUDE_DEPENDENCIES ${item})
+  endif()
+endforeach()
+
+# UTIL
+add_library(mercury_util ${MERCURY_UTIL_SRCS}
+  ${MERCURY_UTIL_PRIVATE_HEADERS} ${MERCURY_UTIL_PUBLIC_HEADERS}
+)
+if(THREADS_HAVE_PTHREAD_ARG)
+  target_compile_options(mercury_util PUBLIC "${CMAKE_THREAD_LIBS_INIT}")
+endif()
+target_include_directories(mercury_util
+  PUBLIC  "$<BUILD_INTERFACE:${MERCURY_UTIL_BUILD_INCLUDE_DEPENDENCIES}>"
+          $<INSTALL_INTERFACE:${MERCURY_INSTALL_INCLUDE_INTERFACE}>
+)
+target_include_directories(mercury_util
+  SYSTEM PUBLIC ${MERCURY_UTIL_EXT_INCLUDE_DEPENDENCIES}
+)
+target_link_libraries(mercury_util ${MERCURY_UTIL_EXT_LIB_DEPENDENCIES})
+mercury_set_lib_options(mercury_util "mercury_util" ${MERCURY_UTIL_LIBTYPE})
+if(MERCURY_ENABLE_COVERAGE)
+  set_coverage_flags(mercury_util)
+endif()
+set_target_properties(mercury_util PROPERTIES
+  PUBLIC_HEADER "${MERCURY_UTIL_PUBLIC_HEADERS}"
+)
+
+#---------------------------------------------------------------------------
+# Add Target(s) to CMake Install
+#---------------------------------------------------------------------------
+install(
+  TARGETS
+    mercury_util
+  EXPORT
+    ${MERCURY_EXPORTED_TARGETS}
+  LIBRARY DESTINATION ${MERCURY_INSTALL_LIB_DIR}
+  ARCHIVE DESTINATION ${MERCURY_INSTALL_LIB_DIR}
+  PUBLIC_HEADER DESTINATION ${MERCURY_INSTALL_INCLUDE_DIR}
+  RUNTIME DESTINATION ${MERCURY_INSTALL_BIN_DIR}
+)
+
+#------------------------------------------------------------------------------
+# Set variables for parent scope
+#------------------------------------------------------------------------------
+set(MERCURY_UTIL_EXT_INCLUDE_DEPENDENCIES ${MERCURY_UTIL_EXT_INCLUDE_DEPENDENCIES} PARENT_SCOPE)
+set(MERCURY_UTIL_EXT_LIB_DEPENDENCIES ${MERCURY_UTIL_EXT_LIB_DEPENDENCIES} PARENT_SCOPE)
diff --git a/src/mercury/src/util/mercury_atomic.h b/src/mercury/src/util/mercury_atomic.h
new file mode 100644
index 00000000000..d5a14171b28
--- /dev/null
+++ b/src/mercury/src/util/mercury_atomic.h
@@ -0,0 +1,625 @@
+/*
+ * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy,
+ * UChicago Argonne, LLC and The HDF Group.
+ * All rights reserved.
+ *
+ * The full copyright notice, including terms governing use, modification,
+ * and redistribution, is contained in the COPYING file that can be
+ * found at the root of the source code distribution tree.
+ */
+
+#ifndef MERCURY_ATOMIC_H
+#define MERCURY_ATOMIC_H
+
+#include "mercury_util_config.h"
+
+#if defined(_WIN32)
+#include <windows.h>
+typedef struct {
+    volatile LONG value;
+} hg_atomic_int32_t;
+typedef struct {
+    volatile LONGLONG value;
+} hg_atomic_int64_t;
+#define HG_ATOMIC_VAR_INIT(x) \
+    { \
+        (x) \
+    }
+#elif defined(HG_UTIL_HAS_OPA_PRIMITIVES_H)
+#include <opa_primitives.h>
+typedef OPA_int_t hg_atomic_int32_t;
+typedef OPA_ptr_t hg_atomic_int64_t; /* OPA has only limited 64-bit support */
+#define HG_ATOMIC_VAR_INIT(x) OPA_PTR_T_INITIALIZER(x)
+#elif defined(HG_UTIL_HAS_STDATOMIC_H)
+#ifndef __cplusplus
+#include <stdatomic.h>
+typedef atomic_int hg_atomic_int32_t;
+#if (HG_UTIL_ATOMIC_LONG_WIDTH == 8) && !defined(__APPLE__)
+typedef atomic_long hg_atomic_int64_t;
+#else
+typedef atomic_llong hg_atomic_int64_t;
+#endif
+#else
+#include <atomic>
+typedef std::atomic_int hg_atomic_int32_t;
+#if (HG_UTIL_ATOMIC_LONG_WIDTH == 8) && !defined(__APPLE__)
+typedef std::atomic_long hg_atomic_int64_t;
+#else
+typedef std::atomic_llong hg_atomic_int64_t;
+#endif
+using std::atomic_fetch_add_explicit;
+using std::atomic_thread_fence;
+using std::memory_order_acq_rel;
+using std::memory_order_acquire;
+using std::memory_order_release;
+#endif
+#define HG_ATOMIC_VAR_INIT(x) ATOMIC_VAR_INIT(x)
+#elif defined(__APPLE__)
+#include <libkern/OSAtomic.h>
+typedef struct {
+    volatile hg_util_int32_t value;
+} hg_atomic_int32_t;
+typedef struct {
+    volatile hg_util_int64_t value;
+} hg_atomic_int64_t;
+#define HG_ATOMIC_VAR_INIT(x) \
+    { \
+        (x) \
+    }
+#else
+#error "Not supported on this platform."
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Init atomic value (32-bit integer).
+ *
+ * \param ptr [OUT] pointer to an atomic32 integer
+ * \param value [IN] value
+ */
+static HG_UTIL_INLINE void hg_atomic_init32(hg_atomic_int32_t *ptr, hg_util_int32_t value);
+
+/**
+ * Set atomic value (32-bit integer).
+ *
+ * \param ptr [OUT] pointer to an atomic32 integer
+ * \param value [IN] value
+ */
+static HG_UTIL_INLINE void hg_atomic_set32(hg_atomic_int32_t *ptr, hg_util_int32_t value);
+
+/**
+ * Get atomic value (32-bit integer).
+ *
+ * \param ptr [OUT] pointer to an atomic32 integer
+ *
+ * \return Value of the atomic integer
+ */
+static HG_UTIL_INLINE hg_util_int32_t hg_atomic_get32(hg_atomic_int32_t *ptr);
+
+/**
+ * Increment atomic value (32-bit integer).
+ *
+ * \param ptr [IN/OUT] pointer to an atomic32 integer
+ *
+ * \return Incremented value
+ */
+static HG_UTIL_INLINE hg_util_int32_t hg_atomic_incr32(hg_atomic_int32_t *ptr);
+
+/**
+ * Decrement atomic value (32-bit integer).
+ *
+ * \param ptr [IN/OUT] pointer to an atomic32 integer
+ *
+ * \return Decremented value
+ */
+static HG_UTIL_INLINE hg_util_int32_t hg_atomic_decr32(hg_atomic_int32_t *ptr);
+
+/**
+ * OR atomic value (32-bit integer).
+ *
+ * \param ptr [IN/OUT] pointer to an atomic32 integer
+ * \param value [IN] value to OR with
+ *
+ * \return Original value
+ */
+static HG_UTIL_INLINE hg_util_int32_t hg_atomic_or32(hg_atomic_int32_t *ptr, hg_util_int32_t value);
+
+/**
+ * XOR atomic value (32-bit integer).
+ *
+ * \param ptr [IN/OUT] pointer to an atomic32 integer
+ * \param value [IN] value to XOR with
+ *
+ * \return Original value
+ */
+static HG_UTIL_INLINE hg_util_int32_t hg_atomic_xor32(hg_atomic_int32_t *ptr, hg_util_int32_t value);
+
+/**
+ * AND atomic value (32-bit integer).
+ *
+ * \param ptr [IN/OUT] pointer to an atomic32 integer
+ * \param value [IN] value to AND with
+ *
+ * \return Original value
+ */
+static HG_UTIL_INLINE hg_util_int32_t hg_atomic_and32(hg_atomic_int32_t *ptr, hg_util_int32_t value);
+
+/**
+ * Compare and swap values (32-bit integer).
+ *
+ * \param ptr [IN/OUT] pointer to an atomic32 integer
+ * \param compare_value [IN] value to compare to
+ * \param swap_value [IN] value to swap with if ptr value is equal to
+ * compare value
+ *
+ * \return HG_UTIL_TRUE if swapped or HG_UTIL_FALSE
+ */
+static HG_UTIL_INLINE hg_util_bool_t hg_atomic_cas32(hg_atomic_int32_t *ptr, hg_util_int32_t compare_value,
+                                                     hg_util_int32_t swap_value);
+
+/**
+ * Init atomic value (64-bit integer).
+ *
+ * \param ptr [OUT] pointer to an atomic64 integer
+ * \param value [IN] value
+ */
+static HG_UTIL_INLINE void hg_atomic_init64(hg_atomic_int64_t *ptr, hg_util_int64_t value);
+
+/**
+ * Set atomic value (64-bit integer).
+ *
+ * \param ptr [OUT] pointer to an atomic64 integer
+ * \param value [IN] value
+ */
+static HG_UTIL_INLINE void hg_atomic_set64(hg_atomic_int64_t *ptr, hg_util_int64_t value);
+
+/**
+ * Get atomic value (64-bit integer).
+ *
+ * \param ptr [OUT] pointer to an atomic64 integer
+ *
+ * \return Value of the atomic integer
+ */
+static HG_UTIL_INLINE hg_util_int64_t hg_atomic_get64(hg_atomic_int64_t *ptr);
+
+/**
+ * Increment atomic value (64-bit integer).
+ *
+ * \param ptr [IN/OUT] pointer to an atomic64 integer
+ *
+ * \return Incremented value
+ */
+static HG_UTIL_INLINE hg_util_int64_t hg_atomic_incr64(hg_atomic_int64_t *ptr);
+
+/**
+ * Decrement atomic value (64-bit integer).
+ *
+ * \param ptr [IN/OUT] pointer to an atomic64 integer
+ *
+ * \return Decremented value
+ */
+static HG_UTIL_INLINE hg_util_int64_t hg_atomic_decr64(hg_atomic_int64_t *ptr);
+
+/**
+ * OR atomic value (64-bit integer).
+ *
+ * \param ptr [IN/OUT] pointer to an atomic64 integer
+ * \param value [IN] value to OR with
+ *
+ * \return Original value
+ */
+static HG_UTIL_INLINE hg_util_int64_t hg_atomic_or64(hg_atomic_int64_t *ptr, hg_util_int64_t value);
+
+/**
+ * XOR atomic value (64-bit integer).
+ *
+ * \param ptr [IN/OUT] pointer to an atomic64 integer
+ * \param value [IN] value to XOR with
+ *
+ * \return Original value
+ */
+static HG_UTIL_INLINE hg_util_int64_t hg_atomic_xor64(hg_atomic_int64_t *ptr, hg_util_int64_t value);
+
+/**
+ * AND atomic value (64-bit integer).
+ *
+ * \param ptr [IN/OUT] pointer to an atomic64 integer
+ * \param value [IN] value to AND with
+ *
+ * \return Original value
+ */
+static HG_UTIL_INLINE hg_util_int64_t hg_atomic_and64(hg_atomic_int64_t *ptr, hg_util_int64_t value);
+
+/**
+ * Compare and swap values (64-bit integer).
+ *
+ * \param ptr [IN/OUT] pointer to an atomic64 integer
+ * \param compare_value [IN] value to compare to
+ * \param swap_value [IN] value to swap with if ptr value is equal to
+ * compare value
+ *
+ * \return HG_UTIL_TRUE if swapped or HG_UTIL_FALSE
+ */
+static HG_UTIL_INLINE hg_util_bool_t hg_atomic_cas64(hg_atomic_int64_t *ptr, hg_util_int64_t compare_value,
+                                                     hg_util_int64_t swap_value);
+
+/**
+ * Memory barrier.
+ * + */ +static HG_UTIL_INLINE void hg_atomic_fence(void); + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE void +hg_atomic_init32(hg_atomic_int32_t *ptr, hg_util_int32_t value) +{ +#if defined(HG_UTIL_HAS_STDATOMIC_H) && !defined(HG_UTIL_HAS_OPA_PRIMITIVES_H) + atomic_init(ptr, value); +#else + hg_atomic_set32(ptr, value); +#endif +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE void +hg_atomic_set32(hg_atomic_int32_t *ptr, hg_util_int32_t value) +{ +#if defined(_WIN32) + ptr->value = value; +#elif defined(HG_UTIL_HAS_OPA_PRIMITIVES_H) + OPA_store_int(ptr, value); +#elif defined(HG_UTIL_HAS_STDATOMIC_H) + atomic_store_explicit(ptr, value, memory_order_release); +#elif defined(__APPLE__) + ptr->value = value; +#else +#error "Not supported on this platform." +#endif +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE hg_util_int32_t +hg_atomic_get32(hg_atomic_int32_t *ptr) +{ + hg_util_int32_t ret; + +#if defined(_WIN32) + ret = ptr->value; +#elif defined(HG_UTIL_HAS_OPA_PRIMITIVES_H) + ret = OPA_load_int(ptr); +#elif defined(HG_UTIL_HAS_STDATOMIC_H) + ret = atomic_load_explicit(ptr, memory_order_acquire); +#elif defined(__APPLE__) + ret = ptr->value; +#else +#error "Not supported on this platform." +#endif + + return ret; +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE hg_util_int32_t +hg_atomic_incr32(hg_atomic_int32_t *ptr) +{ + hg_util_int32_t ret; + +#if defined(_WIN32) + ret = InterlockedIncrementNoFence(&ptr->value); +#elif defined(HG_UTIL_HAS_OPA_PRIMITIVES_H) + ret = OPA_fetch_and_incr_int(ptr) + 1; +#elif defined(HG_UTIL_HAS_STDATOMIC_H) + ret = atomic_fetch_add_explicit(ptr, 1, memory_order_acq_rel) + 1; +#elif defined(__APPLE__) + ret = OSAtomicIncrement32(&ptr->value); +#else +#error "Not supported on this platform." +#endif + + return ret; +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE hg_util_int32_t +hg_atomic_decr32(hg_atomic_int32_t *ptr) +{ + hg_util_int32_t ret; + +#if defined(_WIN32) + ret = InterlockedDecrementNoFence(&ptr->value); +#elif defined(HG_UTIL_HAS_OPA_PRIMITIVES_H) + ret = OPA_fetch_and_decr_int(ptr) - 1; +#elif defined(HG_UTIL_HAS_STDATOMIC_H) + ret = atomic_fetch_sub_explicit(ptr, 1, memory_order_acq_rel) - 1; +#elif defined(__APPLE__) + ret = OSAtomicDecrement32(&ptr->value); +#else +#error "Not supported on this platform." 
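+/* Note: unlike the underlying fetch-and-add primitives, hg_atomic_incr32()
+ * and hg_atomic_decr32() return the *updated* value, which is why the
+ * fetch-based branches above adjust the fetched value by one before
+ * returning it. */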
+#endif + + return ret; +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE hg_util_int32_t +hg_atomic_or32(hg_atomic_int32_t *ptr, hg_util_int32_t value) +{ + hg_util_int32_t ret; + +#if defined(_WIN32) + ret = InterlockedOrNoFence(&ptr->value, value); +#elif defined(HG_UTIL_HAS_STDATOMIC_H) && !defined(HG_UTIL_HAS_OPA_PRIMITIVES_H) + ret = atomic_fetch_or_explicit(ptr, value, memory_order_acq_rel); +#elif defined(__APPLE__) + ret = OSAtomicOr32Orig((uint32_t)value, (volatile uint32_t *)&ptr->value); +#else + do { + ret = hg_atomic_get32(ptr); + } while (!hg_atomic_cas32(ptr, ret, (ret | value))); +#endif + + return ret; +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE hg_util_int32_t +hg_atomic_xor32(hg_atomic_int32_t *ptr, hg_util_int32_t value) +{ + hg_util_int32_t ret; + +#if defined(_WIN32) + ret = InterlockedXorNoFence(&ptr->value, value); +#elif defined(HG_UTIL_HAS_STDATOMIC_H) && !defined(HG_UTIL_HAS_OPA_PRIMITIVES_H) + ret = atomic_fetch_xor_explicit(ptr, value, memory_order_acq_rel); +#elif defined(__APPLE__) + ret = OSAtomicXor32Orig((uint32_t)value, (volatile uint32_t *)&ptr->value); +#else + do { + ret = hg_atomic_get32(ptr); + } while (!hg_atomic_cas32(ptr, ret, (ret ^ value))); +#endif + + return ret; +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE hg_util_int32_t +hg_atomic_and32(hg_atomic_int32_t *ptr, hg_util_int32_t value) +{ + hg_util_int32_t ret; + +#if defined(_WIN32) + ret = InterlockedAndNoFence(&ptr->value, value); +#elif defined(HG_UTIL_HAS_STDATOMIC_H) && !defined(HG_UTIL_HAS_OPA_PRIMITIVES_H) + ret = atomic_fetch_and_explicit(ptr, value, memory_order_acq_rel); +#elif defined(__APPLE__) + ret = OSAtomicAnd32Orig((uint32_t)value, (volatile uint32_t *)&ptr->value); +#else + do { + ret = hg_atomic_get32(ptr); + } while (!hg_atomic_cas32(ptr, ret, (ret & value))); +#endif + + return ret; +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE hg_util_bool_t +hg_atomic_cas32(hg_atomic_int32_t *ptr, hg_util_int32_t compare_value, hg_util_int32_t swap_value) +{ + hg_util_bool_t ret; + +#if defined(_WIN32) + ret = (compare_value == InterlockedCompareExchangeNoFence(&ptr->value, swap_value, compare_value)); +#elif defined(HG_UTIL_HAS_OPA_PRIMITIVES_H) + ret = (hg_util_bool_t)(compare_value == OPA_cas_int(ptr, compare_value, swap_value)); +#elif defined(HG_UTIL_HAS_STDATOMIC_H) + ret = atomic_compare_exchange_strong_explicit(ptr, &compare_value, swap_value, memory_order_acq_rel, + memory_order_acquire); +#elif defined(__APPLE__) + ret = OSAtomicCompareAndSwap32(compare_value, swap_value, &ptr->value); +#else +#error "Not supported on this platform." 
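+/* Note: on targets without a native atomic OR/XOR/AND, the bitwise
+ * operations above fall back to a compare-and-swap retry loop; the
+ * pattern (illustrative) is:
+ *
+ *     do {
+ *         old = hg_atomic_get32(ptr);
+ *     } while (!hg_atomic_cas32(ptr, old, old | value));
+ *
+ * i.e. re-read the current value and retry until no other thread has
+ * raced the read-modify-write. */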
+#endif
+
+    return ret;
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE void
+hg_atomic_init64(hg_atomic_int64_t *ptr, hg_util_int64_t value)
+{
+#if defined(HG_UTIL_HAS_STDATOMIC_H) && !defined(HG_UTIL_HAS_OPA_PRIMITIVES_H)
+    atomic_init(ptr, value);
+#else
+    hg_atomic_set64(ptr, value);
+#endif
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE void
+hg_atomic_set64(hg_atomic_int64_t *ptr, hg_util_int64_t value)
+{
+#if defined(_WIN32)
+    ptr->value = value;
+#elif defined(HG_UTIL_HAS_OPA_PRIMITIVES_H)
+    OPA_store_ptr(ptr, (void *)value);
+#elif defined(HG_UTIL_HAS_STDATOMIC_H)
+    atomic_store_explicit(ptr, value, memory_order_release);
+#elif defined(__APPLE__)
+    ptr->value = value;
+#else
+#error "Not supported on this platform."
+#endif
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE hg_util_int64_t
+hg_atomic_get64(hg_atomic_int64_t *ptr)
+{
+    hg_util_int64_t ret;
+
+#if defined(_WIN32)
+    ret = ptr->value;
+#elif defined(HG_UTIL_HAS_OPA_PRIMITIVES_H)
+    ret = (hg_util_int64_t)OPA_load_ptr(ptr);
+#elif defined(HG_UTIL_HAS_STDATOMIC_H)
+    ret = atomic_load_explicit(ptr, memory_order_acquire);
+#elif defined(__APPLE__)
+    ret = ptr->value;
+#else
+#error "Not supported on this platform."
+#endif
+
+    return ret;
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE hg_util_int64_t
+hg_atomic_incr64(hg_atomic_int64_t *ptr)
+{
+    hg_util_int64_t ret;
+
+#if defined(_WIN32)
+    ret = InterlockedIncrementNoFence64(&ptr->value);
+#elif defined(HG_UTIL_HAS_STDATOMIC_H) && !defined(HG_UTIL_HAS_OPA_PRIMITIVES_H)
+    ret = atomic_fetch_add_explicit(ptr, 1L, memory_order_acq_rel) + 1;
+#elif defined(__APPLE__)
+    ret = OSAtomicIncrement64(&ptr->value);
+#else
+    do {
+        ret = hg_atomic_get64(ptr);
+    } while (!hg_atomic_cas64(ptr, ret, ret + 1));
+    ret++;
+#endif
+
+    return ret;
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE hg_util_int64_t
+hg_atomic_decr64(hg_atomic_int64_t *ptr)
+{
+    hg_util_int64_t ret;
+
+#if defined(_WIN32)
+    ret = InterlockedDecrementNoFence64(&ptr->value);
+#elif defined(HG_UTIL_HAS_STDATOMIC_H) && !defined(HG_UTIL_HAS_OPA_PRIMITIVES_H)
+    ret = atomic_fetch_sub_explicit(ptr, 1L, memory_order_acq_rel) - 1;
+#elif defined(__APPLE__)
+    ret = OSAtomicDecrement64(&ptr->value);
+#else
+    do {
+        ret = hg_atomic_get64(ptr);
+    } while (!hg_atomic_cas64(ptr, ret, ret - 1));
+    ret--;
+#endif
+
+    return ret;
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE hg_util_int64_t
+hg_atomic_or64(hg_atomic_int64_t *ptr, hg_util_int64_t value)
+{
+    hg_util_int64_t ret;
+
+#if defined(_WIN32)
+    ret = InterlockedOr64NoFence(&ptr->value, value);
+#elif defined(HG_UTIL_HAS_STDATOMIC_H) && !defined(HG_UTIL_HAS_OPA_PRIMITIVES_H)
+    ret = atomic_fetch_or_explicit(ptr, value, memory_order_acq_rel);
+#else
+    do {
+        ret = hg_atomic_get64(ptr);
+    } while (!hg_atomic_cas64(ptr, ret, (ret | value)));
+#endif
+
+    return ret;
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE hg_util_int64_t
+hg_atomic_xor64(hg_atomic_int64_t *ptr, hg_util_int64_t value)
+{
+    hg_util_int64_t ret;
+
+#if defined(_WIN32)
+    ret = InterlockedXor64NoFence(&ptr->value, value);
+#elif defined(HG_UTIL_HAS_STDATOMIC_H) && !defined(HG_UTIL_HAS_OPA_PRIMITIVES_H)
+    ret = atomic_fetch_xor_explicit(ptr, value, memory_order_acq_rel);
+#else
+    do {
+        ret = hg_atomic_get64(ptr);
+    } while (!hg_atomic_cas64(ptr, ret, (ret ^ value)));
+#endif
+
+    return ret;
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE hg_util_int64_t
+hg_atomic_and64(hg_atomic_int64_t *ptr, hg_util_int64_t value)
+{
+    hg_util_int64_t ret;
+
+#if defined(_WIN32)
+    ret = InterlockedAnd64NoFence(&ptr->value, value);
+#elif defined(HG_UTIL_HAS_STDATOMIC_H) && !defined(HG_UTIL_HAS_OPA_PRIMITIVES_H)
+    ret = atomic_fetch_and_explicit(ptr, value, memory_order_acq_rel);
+#else
+    do {
+        ret = hg_atomic_get64(ptr);
+    } while (!hg_atomic_cas64(ptr, ret, (ret & value)));
+#endif
+
+    return ret;
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE hg_util_bool_t
+hg_atomic_cas64(hg_atomic_int64_t *ptr, hg_util_int64_t compare_value, hg_util_int64_t swap_value)
+{
+    hg_util_bool_t ret;
+
+#if defined(_WIN32)
+    ret = (compare_value == InterlockedCompareExchangeNoFence64(&ptr->value, swap_value, compare_value));
+#elif defined(HG_UTIL_HAS_OPA_PRIMITIVES_H)
+    ret = (hg_util_bool_t)(compare_value ==
+                           (hg_util_int64_t)OPA_cas_ptr(ptr, (void *)compare_value, (void *)swap_value));
+#elif defined(HG_UTIL_HAS_STDATOMIC_H)
+    ret = atomic_compare_exchange_strong_explicit(ptr, &compare_value, swap_value, memory_order_acq_rel,
+                                                  memory_order_acquire);
+#elif defined(__APPLE__)
+    ret = OSAtomicCompareAndSwap64(compare_value, swap_value, &ptr->value);
+#else
+#error "Not supported on this platform."
+#endif
+
+    return ret;
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE void
+hg_atomic_fence(void)
+{
+#if defined(_WIN32)
+    MemoryBarrier();
+#elif defined(HG_UTIL_HAS_OPA_PRIMITIVES_H)
+    OPA_read_write_barrier();
+#elif defined(HG_UTIL_HAS_STDATOMIC_H)
+    atomic_thread_fence(memory_order_acq_rel);
+#elif defined(__APPLE__)
+    OSMemoryBarrier();
+#else
+#error "Not supported on this platform."
+#endif
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* MERCURY_ATOMIC_H */
diff --git a/src/mercury/src/util/mercury_atomic_queue.c b/src/mercury/src/util/mercury_atomic_queue.c
new file mode 100644
index 00000000000..f76177b05ad
--- /dev/null
+++ b/src/mercury/src/util/mercury_atomic_queue.c
@@ -0,0 +1,81 @@
+/*
+ * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy,
+ * UChicago Argonne, LLC and The HDF Group.
+ * All rights reserved.
+ *
+ * The full copyright notice, including terms governing use, modification,
+ * and redistribution, is contained in the COPYING file that can be
+ * found at the root of the source code distribution tree.
+ */
+
+/* Implementation derived from:
+ * https://github.com/freebsd/freebsd/blob/master/sys/sys/buf_ring.h
+ *
+ * -
+ * Copyright (c) 2007-2009 Kip Macy
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include "mercury_atomic_queue.h"
+#include "mercury_util_error.h"
+
+#include <stdlib.h>
+
+/****************/
+/* Local Macros */
+/****************/
+
+/* From <sys/param.h> */
+#define powerof2(x) ((((x)-1) & (x)) == 0)
+
+/*---------------------------------------------------------------------------*/
+struct hg_atomic_queue *
+hg_atomic_queue_alloc(unsigned int count)
+{
+    struct hg_atomic_queue *hg_atomic_queue = NULL;
+
+    HG_UTIL_CHECK_ERROR_NORET(!powerof2(count), done, "atomic queue size must be power of 2");
+
+    hg_atomic_queue = hg_mem_aligned_alloc(HG_MEM_CACHE_LINE_SIZE,
+                                           sizeof(struct hg_atomic_queue) + count * sizeof(hg_atomic_int64_t));
+    HG_UTIL_CHECK_ERROR_NORET(hg_atomic_queue == NULL, done, "Could not allocate atomic queue");
+
+    hg_atomic_queue->prod_size = hg_atomic_queue->cons_size = count;
+    hg_atomic_queue->prod_mask = hg_atomic_queue->cons_mask = count - 1;
+    hg_atomic_init32(&hg_atomic_queue->prod_head, 0);
+    hg_atomic_init32(&hg_atomic_queue->cons_head, 0);
+    hg_atomic_init32(&hg_atomic_queue->prod_tail, 0);
+    hg_atomic_init32(&hg_atomic_queue->cons_tail, 0);
+
+done:
+    return hg_atomic_queue;
+}
+
+/*---------------------------------------------------------------------------*/
+void
+hg_atomic_queue_free(struct hg_atomic_queue *hg_atomic_queue)
+{
+    hg_mem_aligned_free(hg_atomic_queue);
+}
diff --git a/src/mercury/src/util/mercury_atomic_queue.h b/src/mercury/src/util/mercury_atomic_queue.h
new file mode 100644
index 00000000000..61b5128df1c
--- /dev/null
+++ b/src/mercury/src/util/mercury_atomic_queue.h
@@ -0,0 +1,266 @@
+/*
+ * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy,
+ * UChicago Argonne, LLC and The HDF Group.
+ * All rights reserved.
+ *
+ * The full copyright notice, including terms governing use, modification,
+ * and redistribution, is contained in the COPYING file that can be
+ * found at the root of the source code distribution tree.
+ */
+
+/* Implementation derived from:
+ * https://github.com/freebsd/freebsd/blob/master/sys/sys/buf_ring.h
+ *
+ * -
+ * Copyright (c) 2007-2009 Kip Macy
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#ifndef MERCURY_ATOMIC_QUEUE_H
+#define MERCURY_ATOMIC_QUEUE_H
+
+#include "mercury_atomic.h"
+#include "mercury_mem.h"
+
+/* For busy loop spinning */
+#ifndef cpu_spinwait
+#if defined(_WIN32)
+#define cpu_spinwait YieldProcessor
+#elif defined(__x86_64__) || defined(__i386__)
+#include <immintrin.h>
+#define cpu_spinwait _mm_pause
+#elif defined(__arm__)
+#define cpu_spinwait() __asm__ __volatile__("yield")
+#else
+#warning "Processor yield is not supported on this architecture."
+#define cpu_spinwait()
+#endif
+#endif
+
+/*************************************/
+/* Public Type and Struct Definition */
+/*************************************/
+
+struct hg_atomic_queue {
+    hg_atomic_int32_t prod_head;
+    hg_atomic_int32_t prod_tail;
+    unsigned int prod_size;
+    unsigned int prod_mask;
+    hg_util_uint64_t drops;
+    hg_atomic_int32_t cons_head __attribute__((aligned(HG_MEM_CACHE_LINE_SIZE)));
+    hg_atomic_int32_t cons_tail;
+    unsigned int cons_size;
+    unsigned int cons_mask;
+    hg_atomic_int64_t ring[] __attribute__((aligned(HG_MEM_CACHE_LINE_SIZE)));
+};
+
+/*****************/
+/* Public Macros */
+/*****************/
+
+/*********************/
+/* Public Prototypes */
+/*********************/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Allocate a new queue that can hold \count elements.
+ *
+ * \param count [IN] maximum number of elements
+ *
+ * \return pointer to allocated queue or NULL on failure
+ */
+HG_UTIL_PUBLIC struct hg_atomic_queue *hg_atomic_queue_alloc(unsigned int count);
+
+/**
+ * Free an existing queue.
+ *
+ * \param hg_atomic_queue [IN] pointer to queue
+ */
+HG_UTIL_PUBLIC void hg_atomic_queue_free(struct hg_atomic_queue *hg_atomic_queue);
+
+/**
+ * Push an entry to the queue.
+ *
+ * \param hg_atomic_queue [IN/OUT] pointer to queue
+ * \param entry [IN] pointer to object
+ *
+ * \return Non-negative on success or negative on failure
+ */
+static HG_UTIL_INLINE int hg_atomic_queue_push(struct hg_atomic_queue *hg_atomic_queue, void *entry);
+
+/**
+ * Pop an entry from the queue (multi-consumer).
+ *
+ * \param hg_atomic_queue [IN/OUT] pointer to queue
+ *
+ * \return Pointer to popped object or NULL if queue is empty
+ */
+static HG_UTIL_INLINE void *hg_atomic_queue_pop_mc(struct hg_atomic_queue *hg_atomic_queue);
+
+/**
+ * Pop an entry from the queue (single consumer).
+ *
+ * \param hg_atomic_queue [IN/OUT] pointer to queue
+ *
+ * \return Pointer to popped object or NULL if queue is empty
+ */
+static HG_UTIL_INLINE void *hg_atomic_queue_pop_sc(struct hg_atomic_queue *hg_atomic_queue);
+
+/**
+ * Determine whether queue is empty.
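+ * Note: with concurrent producers or consumers the result is only a
+ * point-in-time snapshot; the queue may gain or lose entries immediately
+ * afterwards.
+ *
+ * Illustrative single-producer/single-consumer use of this queue API
+ * (my_entry is any user pointer; error handling elided):
+ *
+ *   struct hg_atomic_queue *q = hg_atomic_queue_alloc(64); // power of 2
+ *   (void) hg_atomic_queue_push(q, my_entry);              // fails if full
+ *   void *e = hg_atomic_queue_pop_sc(q);                   // NULL if empty
+ *   hg_atomic_queue_free(q);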
+ * + * \param hg_atomic_queue [IN/OUT] pointer to queue + * + * \return HG_UTIL_TRUE if empty, HG_UTIL_FALSE if not + */ +static HG_UTIL_INLINE hg_util_bool_t hg_atomic_queue_is_empty(struct hg_atomic_queue *hg_atomic_queue); + +/** + * Determine number of entries in a queue. + * + * \param hg_atomic_queue [IN/OUT] pointer to queue + * + * \return Number of entries queued or 0 if none + */ +static HG_UTIL_INLINE unsigned int hg_atomic_queue_count(struct hg_atomic_queue *hg_atomic_queue); + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE int +hg_atomic_queue_push(struct hg_atomic_queue *hg_atomic_queue, void *entry) +{ + hg_util_int32_t prod_head, prod_next, cons_tail; + + do { + prod_head = hg_atomic_get32(&hg_atomic_queue->prod_head); + prod_next = (prod_head + 1) & (int)hg_atomic_queue->prod_mask; + cons_tail = hg_atomic_get32(&hg_atomic_queue->cons_tail); + + if (prod_next == cons_tail) { + hg_atomic_fence(); + if (prod_head == hg_atomic_get32(&hg_atomic_queue->prod_head) && + cons_tail == hg_atomic_get32(&hg_atomic_queue->cons_tail)) { + hg_atomic_queue->drops++; + /* Full */ + return HG_UTIL_FAIL; + } + continue; + } + } while (!hg_atomic_cas32(&hg_atomic_queue->prod_head, prod_head, prod_next)); + + hg_atomic_set64(&hg_atomic_queue->ring[prod_head], (hg_util_int64_t)entry); + + /* + * If there are other enqueues in progress + * that preceded us, we need to wait for them + * to complete + */ + while (hg_atomic_get32(&hg_atomic_queue->prod_tail) != prod_head) + cpu_spinwait(); + + hg_atomic_set32(&hg_atomic_queue->prod_tail, prod_next); + + return HG_UTIL_SUCCESS; +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE void * +hg_atomic_queue_pop_mc(struct hg_atomic_queue *hg_atomic_queue) +{ + hg_util_int32_t cons_head, cons_next; + void * entry = NULL; + + do { + cons_head = hg_atomic_get32(&hg_atomic_queue->cons_head); + cons_next = (cons_head + 1) & (int)hg_atomic_queue->cons_mask; + + if (cons_head == hg_atomic_get32(&hg_atomic_queue->prod_tail)) + return NULL; + } while (!hg_atomic_cas32(&hg_atomic_queue->cons_head, cons_head, cons_next)); + + entry = (void *)hg_atomic_get64(&hg_atomic_queue->ring[cons_head]); + + /* + * If there are other dequeues in progress + * that preceded us, we need to wait for them + * to complete + */ + while (hg_atomic_get32(&hg_atomic_queue->cons_tail) != cons_head) + cpu_spinwait(); + + hg_atomic_set32(&hg_atomic_queue->cons_tail, cons_next); + + return entry; +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE void * +hg_atomic_queue_pop_sc(struct hg_atomic_queue *hg_atomic_queue) +{ + hg_util_int32_t cons_head, cons_next; + hg_util_int32_t prod_tail; + void * entry = NULL; + + cons_head = hg_atomic_get32(&hg_atomic_queue->cons_head); + prod_tail = hg_atomic_get32(&hg_atomic_queue->prod_tail); + cons_next = (cons_head + 1) & (int)hg_atomic_queue->cons_mask; + + if (cons_head == prod_tail) + /* Empty */ + return NULL; + + hg_atomic_set32(&hg_atomic_queue->cons_head, cons_next); + + entry = (void *)hg_atomic_get64(&hg_atomic_queue->ring[cons_head]); + + hg_atomic_set32(&hg_atomic_queue->cons_tail, cons_next); + + return entry; +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE hg_util_bool_t +hg_atomic_queue_is_empty(struct hg_atomic_queue *hg_atomic_queue) +{ + return (hg_atomic_get32(&hg_atomic_queue->cons_head) == 
hg_atomic_get32(&hg_atomic_queue->prod_tail));
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE unsigned int
+hg_atomic_queue_count(struct hg_atomic_queue *hg_atomic_queue)
+{
+    return ((hg_atomic_queue->prod_size + (unsigned int)hg_atomic_get32(&hg_atomic_queue->prod_tail) -
+             (unsigned int)hg_atomic_get32(&hg_atomic_queue->cons_tail)) &
+            hg_atomic_queue->prod_mask);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* MERCURY_ATOMIC_QUEUE_H */
diff --git a/src/mercury/src/util/mercury_dlog.c b/src/mercury/src/util/mercury_dlog.c
new file mode 100644
index 00000000000..8146691d85c
--- /dev/null
+++ b/src/mercury/src/util/mercury_dlog.c
@@ -0,0 +1,262 @@
+/*
+ * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy,
+ * UChicago Argonne, LLC and The HDF Group.
+ * All rights reserved.
+ *
+ * The full copyright notice, including terms governing use, modification,
+ * and redistribution, is contained in the COPYING file that can be
+ * found at the root of the source code distribution tree.
+ */
+
+#include "mercury_dlog.h"
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+/****************/
+/* Local Macros */
+/****************/
+
+/************************************/
+/* Local Type and Struct Definition */
+/************************************/
+
+/********************/
+/* Local Prototypes */
+/********************/
+
+/*******************/
+/* Local Variables */
+/*******************/
+
+/*---------------------------------------------------------------------------*/
+struct hg_dlog *
+hg_dlog_alloc(char *name, unsigned int lesize, int leloop)
+{
+    struct hg_dlog_entry *le;
+    struct hg_dlog *d;
+
+    le = malloc(sizeof(*le) * lesize);
+    if (!le)
+        return NULL;
+
+    d = malloc(sizeof(*d));
+    if (!d) {
+        free(le);
+        return NULL;
+    }
+
+    memset(d, 0, sizeof(*d));
+    snprintf(d->dlog_magic, sizeof(d->dlog_magic), "%s%s", HG_DLOG_STDMAGIC, name);
+    hg_thread_mutex_init(&d->dlock);
+    HG_LIST_INIT(&d->cnts32);
+    HG_LIST_INIT(&d->cnts64);
+    d->le = le;
+    d->lesize = lesize;
+    d->leloop = leloop;
+    d->mallocd = 1;
+
+    return d;
+}
+
+/*---------------------------------------------------------------------------*/
+void
+hg_dlog_free(struct hg_dlog *d)
+{
+    struct hg_dlog_dcount32 *cp32 = HG_LIST_FIRST(&d->cnts32);
+    struct hg_dlog_dcount64 *cp64 = HG_LIST_FIRST(&d->cnts64);
+
+    while (cp32) {
+        struct hg_dlog_dcount32 *cp = cp32;
+        cp32 = HG_LIST_NEXT(cp, l);
+        free(cp);
+    }
+    HG_LIST_INIT(&d->cnts32);
+
+    while (cp64) {
+        struct hg_dlog_dcount64 *cp = cp64;
+        cp64 = HG_LIST_NEXT(cp, l);
+        free(cp);
+    }
+    HG_LIST_INIT(&d->cnts64);
+
+    if (d->mallocd) {
+        free(d->le);
+        free(d);
+    }
+}
+
+/*---------------------------------------------------------------------------*/
+void
+hg_dlog_mkcount32(struct hg_dlog *d, hg_atomic_int32_t **cptr, const char *name, const char *descr)
+{
+    struct hg_dlog_dcount32 *dcnt;
+
+    hg_thread_mutex_lock(&d->dlock);
+    if (*cptr == NULL) {
+        dcnt = malloc(sizeof(*dcnt));
+        if (!dcnt) {
+            fprintf(stderr, "hg_dlog_mkcount32: malloc of %s failed!", name);
+            abort();
+        }
+        dcnt->name = name;
+        dcnt->descr = descr;
+        hg_atomic_init32(&dcnt->c, 0);
+        HG_LIST_INSERT_HEAD(&d->cnts32, dcnt, l);
+        *cptr = &dcnt->c; /* set it in caller's variable */
+    }
+    hg_thread_mutex_unlock(&d->dlock);
+}
+
+/*---------------------------------------------------------------------------*/
+void
+hg_dlog_mkcount64(struct hg_dlog *d, hg_atomic_int64_t **cptr, const char *name, const char *descr)
+{
+    struct hg_dlog_dcount64 *dcnt;
+
+    hg_thread_mutex_lock(&d->dlock);
+    if (*cptr == NULL) {
+        dcnt = malloc(sizeof(*dcnt));
+        if (!dcnt) {
+            fprintf(stderr, "hg_dlog_mkcount64: malloc of %s failed!", name);
+            abort();
+        }
+        dcnt->name = name;
+        dcnt->descr = descr;
+        hg_atomic_init64(&dcnt->c, 0);
+        HG_LIST_INSERT_HEAD(&d->cnts64, dcnt, l);
+        *cptr = &dcnt->c; /* set it in caller's variable */
+    }
+    hg_thread_mutex_unlock(&d->dlock);
+}
+
+/*---------------------------------------------------------------------------*/
+void
+hg_dlog_setlogstop(struct hg_dlog *d, int stop)
+{
+    d->lestop = stop; /* no need to lock */
+}
+
+/*---------------------------------------------------------------------------*/
+void
+hg_dlog_resetlog(struct hg_dlog *d)
+{
+    hg_thread_mutex_lock(&d->dlock);
+    d->lefree = 0;
+    d->leadds = 0;
+    hg_thread_mutex_unlock(&d->dlock);
+}
+
+/*---------------------------------------------------------------------------*/
+void
+hg_dlog_dump(struct hg_dlog *d, int (*log_func)(FILE *, const char *, ...), FILE *stream, int trylock)
+{
+    unsigned int left, idx;
+    struct hg_dlog_dcount32 *dc32;
+    struct hg_dlog_dcount64 *dc64;
+
+    if (trylock) {
+        int try_ret = hg_thread_mutex_try_lock(&d->dlock);
+        if (try_ret != HG_UTIL_SUCCESS) { /* warn them and give up */
+            fprintf(stderr, "hg_dlog_dump: WARN - lock failed\n");
+            return;
+        }
+    }
+    else
+        hg_thread_mutex_lock(&d->dlock);
+
+    if (d->leadds > 0) {
+        log_func(stream,
+                 "### ----------------------\n"
+                 "### (%s) debug log summary\n"
+                 "### ----------------------\n",
+                 (d->dlog_magic + strlen(HG_DLOG_STDMAGIC)));
+        if (!HG_LIST_IS_EMPTY(&d->cnts32) && !HG_LIST_IS_EMPTY(&d->cnts64)) {
+            log_func(stream, "# Counters\n");
+            HG_LIST_FOREACH(dc32, &d->cnts32, l)
+            {
+                log_func(stream, "# %s: %" PRId32 " [%s]\n", dc32->name, hg_atomic_get32(&dc32->c),
+                         dc32->descr);
+            }
+            HG_LIST_FOREACH(dc64, &d->cnts64, l)
+            {
+                log_func(stream, "# %s: %" PRId64 " [%s]\n", dc64->name, hg_atomic_get64(&dc64->c),
+                         dc64->descr);
+            }
+            log_func(stream, "# -\n");
+        }
+
+        log_func(stream, "# Number of log entries: %d\n", d->leadds);
+
+        idx = (d->lefree < d->leadds) ? d->lesize + d->lefree - d->leadds : d->lefree - d->leadds;
+        left = d->leadds;
+        while (left--) {
+            log_func(stream, "# [%lf] %s:%d\n## %s()\n", hg_time_to_double(d->le[idx].time), d->le[idx].file,
+                     d->le[idx].line, d->le[idx].func);
+            idx = (idx + 1) % d->lesize;
+        }
+    }
+
+    hg_thread_mutex_unlock(&d->dlock);
+}
+
+/*---------------------------------------------------------------------------*/
+void
+hg_dlog_dump_file(struct hg_dlog *d, const char *base, int addpid, int trylock)
+{
+    char buf[BUFSIZ];
+    int pid = getpid();
+    FILE *fp = NULL;
+    unsigned int left, idx;
+    struct hg_dlog_dcount32 *dc32;
+    struct hg_dlog_dcount64 *dc64;
+
+    if (addpid)
+        snprintf(buf, sizeof(buf), "%s-%d.log", base, pid);
+    else
+        snprintf(buf, sizeof(buf), "%s.log", base);
+
+    fp = fopen(buf, "w");
+    if (!fp) {
+        perror("fopen");
+        return;
+    }
+
+    if (trylock) {
+        int try_ret = hg_thread_mutex_try_lock(&d->dlock);
+        if (try_ret != HG_UTIL_SUCCESS) { /* warn them and give up */
+            fprintf(stderr, "hg_dlog_dump_file: WARN - lock failed\n");
+            fclose(fp);
+            return;
+        }
+    }
+    else
+        hg_thread_mutex_lock(&d->dlock);
+
+    fprintf(fp, "# START COUNTERS\n");
+    HG_LIST_FOREACH(dc32, &d->cnts32, l)
+    {
+        fprintf(fp, "%s %d %" PRId32 " # %s\n", dc32->name, pid, hg_atomic_get32(&dc32->c), dc32->descr);
+    }
+    HG_LIST_FOREACH(dc64, &d->cnts64, l)
+    {
+        fprintf(fp, "%s %d %" PRId64 " # %s\n", dc64->name, pid, hg_atomic_get64(&dc64->c), dc64->descr);
+    }
+    fprintf(fp, "# END COUNTERS\n\n");
+
+    fprintf(fp, "# NLOGS %d FOR %d\n", d->leadds, pid);
+
+    idx = (d->lefree < d->leadds) ? d->lesize + d->lefree - d->leadds : d->lefree - d->leadds;
+    left = d->leadds;
+    while (left--) {
+        fprintf(fp, "%lf %d %s %u %s %s %p\n", hg_time_to_double(d->le[idx].time), pid, d->le[idx].file,
+                d->le[idx].line, d->le[idx].func, d->le[idx].msg, d->le[idx].data);
+        idx = (idx + 1) % d->lesize;
+    }
+
+    hg_thread_mutex_unlock(&d->dlock);
+    fclose(fp);
+}
diff --git a/src/mercury/src/util/mercury_dlog.h b/src/mercury/src/util/mercury_dlog.h
new file mode 100644
index 00000000000..557b7451797
--- /dev/null
+++ b/src/mercury/src/util/mercury_dlog.h
@@ -0,0 +1,273 @@
+/*
+ * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy,
+ * UChicago Argonne, LLC and The HDF Group.
+ * All rights reserved.
+ *
+ * The full copyright notice, including terms governing use, modification,
+ * and redistribution, is contained in the COPYING file that can be
+ * found at the root of the source code distribution tree.
+ */
+
+#ifndef MERCURY_DLOG_H
+#define MERCURY_DLOG_H
+
+#include "mercury_util_config.h"
+
+#include "mercury_atomic.h"
+#include "mercury_list.h"
+#include "mercury_thread_mutex.h"
+#include "mercury_time.h"
+
+#include <stdio.h>
+
+/*****************/
+/* Public Macros */
+/*****************/
+
+/*
+ * putting a magic number at the front of the dlog allows us to search
+ * for a dlog in a coredump file after a crash and examine its contents.
+ */
+#define HG_DLOG_MAGICLEN 16         /* bytes to reserve for magic# */
+#define HG_DLOG_STDMAGIC ">D.LO.G<" /* standard for first 8 bytes */
+
+/*
+ * HG_DLOG_INITIALIZER: initializer for a dlog in a global variable.
+ * LESIZE is the number of entries in the LE array.
use it like this: + * + * #define FOO_NENTS 128 + * struct hg_dlog_entry foo_le[FOO_NENTS]; + * struct hg_dlog foo_dlog = HG_DLOG_INITIALIZER("foo", foo_le, FOO_NENTS, 0); + */ +#define HG_DLOG_INITIALIZER(NAME, LE, LESIZE, LELOOP) \ + { \ + HG_DLOG_STDMAGIC NAME, HG_THREAD_MUTEX_INITIALIZER, HG_LIST_HEAD_INITIALIZER(cnts32), \ + HG_LIST_HEAD_INITIALIZER(cnts64), LE, LESIZE, LELOOP, 0, 0, 0, 0 \ + } + +/*************************************/ +/* Public Type and Struct Definition */ +/*************************************/ + +/* + * hg_dlog_entry: an entry in the dlog + */ +struct hg_dlog_entry { + const char * file; /* file name */ + unsigned int line; /* line number */ + const char * func; /* function name */ + const char * msg; /* entry message (optional) */ + const void * data; /* user data (optional) */ + hg_time_t time; /* time added to log */ +}; + +/* + * hg_dlog_dcount32: 32-bit debug counter in the dlog + */ +struct hg_dlog_dcount32 { + const char * name; /* counter name (short) */ + const char * descr; /* description of counter */ + hg_atomic_int32_t c; /* the counter itself */ + HG_LIST_ENTRY(hg_dlog_dcount32) l; /* linkage */ +}; + +/* + * hg_dlog_dcount64: 64-bit debug counter in the dlog + */ +struct hg_dlog_dcount64 { + const char * name; /* counter name (short) */ + const char * descr; /* description of counter */ + hg_atomic_int64_t c; /* the counter itself */ + HG_LIST_ENTRY(hg_dlog_dcount64) l; /* linkage */ +}; + +/* + * hg_dlog: main structure + */ +struct hg_dlog { + char dlog_magic[HG_DLOG_MAGICLEN]; /* magic number + name */ + hg_thread_mutex_t dlock; /* lock for this data struct */ + + /* counter lists */ + HG_LIST_HEAD(hg_dlog_dcount32) cnts32; /* counter list */ + HG_LIST_HEAD(hg_dlog_dcount64) cnts64; /* counter list */ + + /* log */ + struct hg_dlog_entry *le; /* array of log entries */ + unsigned int lesize; /* size of le[] array */ + int leloop; /* circular buffer? */ + unsigned int lefree; /* next free entry in le[] */ + unsigned int leadds; /* #adds done if < lesize */ + int lestop; /* stop taking new logs */ + + int mallocd; /* allocated with malloc? */ +}; + +/*********************/ +/* Public Prototypes */ +/*********************/ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * malloc and return a new dlog + * + * \param name [IN] name of dlog (truncated past 8 bytes) + * \param lesize [IN] number of entries to allocate for log buffer + * \param leloop [IN] set to make log circular (can overwrite old + * entries) + * + * \return the new dlog or NULL on malloc error + */ +HG_UTIL_PUBLIC struct hg_dlog *hg_dlog_alloc(char *name, unsigned int lesize, int leloop); + +/** + * free anything we malloc'd on a dlog. assumes we have the final + * active reference to dlog and it won't be used anymore after this + * call (so no need to lock it). + * + * \param d [IN] the dlog to finalize + */ +HG_UTIL_PUBLIC void hg_dlog_free(struct hg_dlog *d); + +/** + * make a named atomic32 counter in a dlog and return a pointer to + * it. we use the dlock to ensure a counter under a given name only + * gets created once (makes it easy to share a counter across files). + * aborts if unable to alloc counter. 
use it like this:
+ *
+ *   hg_atomic_int32_t *foo_count;
+ *   static int init = 0;
+ *   if (init == 0) {
+ *     hg_dlog_mkcount32(dlog, &foo_count, "foocount", "counts of foo");
+ *     init = 1;
+ *   }
+ *
+ * \param d [IN] dlog to create the counter in
+ * \param cptr [IN/OUT] pointer to use for counter (set to NULL to
+ * start)
+ * \param name [IN] short one word name for counter
+ * \param descr [IN] short description of counter
+ */
+HG_UTIL_PUBLIC void hg_dlog_mkcount32(struct hg_dlog *d, hg_atomic_int32_t **cptr, const char *name,
+                                      const char *descr);
+
+/**
+ * make a named atomic64 counter in a dlog and return a pointer to
+ * it. we use the dlock to ensure a counter under a given name only
+ * gets created once (makes it easy to share a counter across files).
+ * aborts if unable to alloc counter. use it like this:
+ *
+ *   hg_atomic_int64_t *foo_count;
+ *   static int init = 0;
+ *   if (init == 0) {
+ *     hg_dlog_mkcount64(dlog, &foo_count, "foocount", "counts of foo");
+ *     init = 1;
+ *   }
+ *
+ * \param d [IN] dlog to create the counter in
+ * \param cptr [IN/OUT] pointer to use for counter (set to NULL to
+ * start)
+ * \param name [IN] short one word name for counter
+ * \param descr [IN] short description of counter
+ */
+HG_UTIL_PUBLIC void hg_dlog_mkcount64(struct hg_dlog *d, hg_atomic_int64_t **cptr, const char *name,
+                                      const char *descr);
+
+/**
+ * attempt to add a log record to a dlog. the file, func, and msg arguments
+ * should point to static strings that are valid throughout the life of the
+ * program (not something that is on the stack).
+ *
+ * \param d [IN] the dlog to add the log record to
+ * \param file [IN] file entry
+ * \param line [IN] line entry
+ * \param func [IN] func entry
+ * \param msg [IN] log entry message (optional, NULL ok)
+ * \param data [IN] user data pointer for record (optional, NULL ok)
+ *
+ * \return 1 if added, 0 otherwise
+ */
+static HG_UTIL_INLINE unsigned int hg_dlog_addlog(struct hg_dlog *d, const char *file, unsigned int line,
+                                                  const char *func, const char *msg, const void *data);
+
+/**
+ * set the value of stop for a dlog (to enable/disable logging)
+ *
+ * \param d [IN] dlog to set stop in
+ * \param stop [IN] value of stop to use (1=stop, 0=go)
+ */
+HG_UTIL_PUBLIC void hg_dlog_setlogstop(struct hg_dlog *d, int stop);
+
+/**
+ * reset the log. this does not change the counters (since users
+ * have direct access to the hg_atomic_int64_t's, we don't need
+ * an API to change them here).
+ *
+ * \param d [IN] dlog to reset
+ */
+HG_UTIL_PUBLIC void hg_dlog_resetlog(struct hg_dlog *d);
+
+/**
+ * dump dlog info to a stream. set trylock if you want to dump even
+ * if it is locked (e.g. you are crashing and you don't care about
+ * locking).
+ *
+ * \param d [IN] dlog to dump
+ * \param log_func [IN] log function to use (default printf)
+ * \param stream [IN] stream to use
+ * \param trylock [IN] just try to lock (warn if it fails)
+ */
+HG_UTIL_PUBLIC void hg_dlog_dump(struct hg_dlog *d, int (*log_func)(FILE *, const char *, ...), FILE *stream,
+                                 int trylock);
+
+/**
+ * dump dlog info to a file. set trylock if you want to dump even
+ * if it is locked (e.g. you are crashing and you don't care about
+ * locking). the output file is "base.log" or "base-pid.log" depending
+ * on the value of addpid.
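+ * each counter line in the output has the form "name pid value # descr",
+ * and each log entry line records time, pid, file, line, function, message
+ * and data pointer, one entry per line.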
+ * + * \param d [IN] dlog to dump + * \param base [IN] output file basename + * \param addpid [IN] add pid to output filename + * \param trylock [IN] just try to lock (warn if it fails) + */ +HG_UTIL_PUBLIC void hg_dlog_dump_file(struct hg_dlog *d, const char *base, int addpid, int trylock); + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE unsigned int +hg_dlog_addlog(struct hg_dlog *d, const char *file, unsigned int line, const char *func, const char *msg, + const void *data) +{ + unsigned int rv = 0; + unsigned int idx; + + hg_thread_mutex_lock(&d->dlock); + if (d->lestop) + goto done; + if (d->leloop == 0 && d->leadds >= d->lesize) + goto done; + idx = d->lefree; + d->lefree = (d->lefree + 1) % d->lesize; + if (d->leadds < d->lesize) + d->leadds++; + d->le[idx].file = file; + d->le[idx].line = line; + d->le[idx].func = func; + d->le[idx].msg = msg; + d->le[idx].data = data; + hg_time_get_current(&d->le[idx].time); + rv = 1; + +done: + hg_thread_mutex_unlock(&d->dlock); + return rv; +} + +#ifdef __cplusplus +} +#endif + +#endif /* MERCURY_DLOG_H */ diff --git a/src/mercury/src/util/mercury_event.c b/src/mercury/src/util/mercury_event.c new file mode 100644 index 00000000000..f7d5bb9de81 --- /dev/null +++ b/src/mercury/src/util/mercury_event.c @@ -0,0 +1,71 @@ +/* + * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy, + * UChicago Argonne, LLC and The HDF Group. + * All rights reserved. + * + * The full copyright notice, including terms governing use, modification, + * and redistribution, is contained in the COPYING file that can be + * found at the root of the source code distribution tree. + */ + +#include "mercury_event.h" + +#include "mercury_util_error.h" + +/*---------------------------------------------------------------------------*/ +int +hg_event_create(void) +{ + int fd = -1; +#if defined(_WIN32) + +#elif defined(HG_UTIL_HAS_SYSEVENTFD_H) + /* Create local signal event on self address */ + fd = eventfd(0, EFD_NONBLOCK | EFD_SEMAPHORE); + HG_UTIL_CHECK_ERROR_NORET(fd == -1, done, "eventfd() failed (%s)", strerror(errno)); +#elif defined(HG_UTIL_HAS_SYSEVENT_H) + struct kevent kev; + struct timespec timeout = {0, 0}; + int rc; + + /* Create kqueue */ + fd = kqueue(); + HG_UTIL_CHECK_ERROR_NORET(fd == -1, done, "kqueue() failed (%s)", strerror(errno)); + + EV_SET(&kev, HG_EVENT_IDENT, EVFILT_USER, EV_ADD | EV_CLEAR, 0, 0, NULL); + + /* Add user-defined event to kqueue */ + rc = kevent(fd, &kev, 1, NULL, 0, &timeout); + HG_UTIL_CHECK_ERROR_NORET(rc == -1, error, "kevent() failed (%s)", strerror(errno)); +#else + +#endif + HG_UTIL_LOG_DEBUG("Created event fd=%d", fd); + +done: + return fd; + +#if defined(HG_UTIL_HAS_SYSEVENT_H) +error: + hg_event_destroy(fd); + + return -1; +#endif +} + +/*---------------------------------------------------------------------------*/ +int +hg_event_destroy(int fd) +{ + int ret = HG_UTIL_SUCCESS, rc; +#if defined(_WIN32) + +#else + rc = close(fd); + HG_UTIL_CHECK_ERROR(rc == -1, done, ret, HG_UTIL_FAIL, "close() failed (%s)", strerror(errno)); +#endif + HG_UTIL_LOG_DEBUG("Destroyed event fd=%d", fd); + +done: + return ret; +} diff --git a/src/mercury/src/util/mercury_event.h b/src/mercury/src/util/mercury_event.h new file mode 100644 index 00000000000..8be18a5c992 --- /dev/null +++ b/src/mercury/src/util/mercury_event.h @@ -0,0 +1,179 @@ +/* + * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy, + * UChicago Argonne, LLC and The HDF Group. 
+ * All rights reserved.
+ *
+ * The full copyright notice, including terms governing use, modification,
+ * and redistribution, is contained in the COPYING file that can be
+ * found at the root of the source code distribution tree.
+ */
+
+#ifndef MERCURY_EVENT_H
+#define MERCURY_EVENT_H
+
+#include "mercury_util_config.h"
+
+#ifdef _WIN32
+
+#else
+#include <errno.h>
+#include <string.h>
+#include <unistd.h>
+#if defined(HG_UTIL_HAS_SYSEVENTFD_H)
+#include <sys/eventfd.h>
+#ifndef HG_UTIL_HAS_EVENTFD_T
+typedef uint64_t eventfd_t;
+#endif
+#elif defined(HG_UTIL_HAS_SYSEVENT_H)
+#include <sys/event.h>
+#define HG_EVENT_IDENT 42 /* User-defined ident */
+#endif
+#endif
+
+/**
+ * Purpose: define an event object that can be used as an event
+ * wait/notify mechanism.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Create a new event object.
+ *
+ * \return file descriptor on success or negative on failure
+ */
+HG_UTIL_PUBLIC int hg_event_create(void);
+
+/**
+ * Destroy an event object.
+ *
+ * \param fd [IN] event file descriptor
+ *
+ * \return Non-negative on success or negative on failure
+ */
+HG_UTIL_PUBLIC int hg_event_destroy(int fd);
+
+/**
+ * Notify for event.
+ *
+ * \param fd [IN] event file descriptor
+ *
+ * \return Non-negative on success or negative on failure
+ */
+static HG_UTIL_INLINE int hg_event_set(int fd);
+
+/**
+ * Get event notification.
+ *
+ * \param fd [IN]       event file descriptor
+ * \param notified [IN] boolean set to HG_UTIL_TRUE if event received
+ *
+ * \return Non-negative on success or negative on failure
+ */
+static HG_UTIL_INLINE int hg_event_get(int fd, hg_util_bool_t *notified);
+
+/*---------------------------------------------------------------------------*/
+#if defined(_WIN32)
+/* TODO */
+#elif defined(HG_UTIL_HAS_SYSEVENTFD_H)
+#ifdef HG_UTIL_HAS_EVENTFD_T
+static HG_UTIL_INLINE int
+hg_event_set(int fd)
+{
+    return (eventfd_write(fd, 1) == 0) ? HG_UTIL_SUCCESS : HG_UTIL_FAIL;
+}
+#else
+static HG_UTIL_INLINE int
+hg_event_set(int fd)
+{
+    eventfd_t count = 1;
+    ssize_t   s = write(fd, &count, sizeof(eventfd_t));
+
+    return (s == sizeof(eventfd_t)) ? HG_UTIL_SUCCESS : HG_UTIL_FAIL;
+}
+#endif
+#elif defined(HG_UTIL_HAS_SYSEVENT_H)
+static HG_UTIL_INLINE int
+hg_event_set(int fd)
+{
+    struct kevent   kev;
+    struct timespec timeout = {0, 0};
+    int             rc;
+
+    EV_SET(&kev, HG_EVENT_IDENT, EVFILT_USER, 0, NOTE_TRIGGER, 0, NULL);
+
+    /* Trigger user-defined event */
+    rc = kevent(fd, &kev, 1, NULL, 0, &timeout);
+
+    return (rc == -1) ? HG_UTIL_FAIL : HG_UTIL_SUCCESS;
+}
+#else
+#error "Not supported on this platform."
+#endif + +/*---------------------------------------------------------------------------*/ +#if defined(_WIN32) +#elif defined(HG_UTIL_HAS_SYSEVENTFD_H) +#ifdef HG_UTIL_HAS_EVENTFD_T +static HG_UTIL_INLINE int +hg_event_get(int fd, hg_util_bool_t *signaled) +{ + eventfd_t count = 0; + + if ((eventfd_read(fd, &count) == 0) && count) + *signaled = HG_UTIL_TRUE; + else { + if (errno == EAGAIN) + *signaled = HG_UTIL_FALSE; + else + return HG_UTIL_FAIL; + } + + return HG_UTIL_SUCCESS; +} +#else +static HG_UTIL_INLINE int +hg_event_get(int fd, hg_util_bool_t *signaled) +{ + eventfd_t count = 0; + ssize_t s = read(fd, &count, sizeof(eventfd_t)); + if ((s == sizeof(eventfd_t)) && count) + *signaled = HG_UTIL_TRUE; + else { + if (errno == EAGAIN) + *signaled = HG_UTIL_FALSE; + else + return HG_UTIL_FAIL; + } + + return HG_UTIL_SUCCESS; +} +#endif +#elif defined(HG_UTIL_HAS_SYSEVENT_H) +static HG_UTIL_INLINE int +hg_event_get(int fd, hg_util_bool_t *signaled) +{ + struct kevent kev; + int nfds; + struct timespec timeout = {0, 0}; + + /* Check user-defined event */ + nfds = kevent(fd, NULL, 0, &kev, 1, &timeout); + if (nfds == -1) + return HG_UTIL_FAIL; + + *signaled = ((nfds > 0) && (kev.ident == HG_EVENT_IDENT)) ? HG_UTIL_TRUE : HG_UTIL_FALSE; + + return HG_UTIL_SUCCESS; +} +#else +#error "Not supported on this platform." +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* MERCURY_EVENT_H */ diff --git a/src/mercury/src/util/mercury_hash_string.h b/src/mercury/src/util/mercury_hash_string.h new file mode 100644 index 00000000000..0b136ca8554 --- /dev/null +++ b/src/mercury/src/util/mercury_hash_string.h @@ -0,0 +1,48 @@ +/* + * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy, + * UChicago Argonne, LLC and The HDF Group. + * All rights reserved. + * + * The full copyright notice, including terms governing use, modification, + * and redistribution, is contained in the COPYING file that can be + * found at the root of the source code distribution tree. + */ + +#ifndef MERCURY_HASH_STRING_H +#define MERCURY_HASH_STRING_H + +#include "mercury_util_config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Hash function name for unique ID to register. + * + * \param string [IN] string name + * + * \return Non-negative ID that corresponds to string name + */ +static HG_UTIL_INLINE unsigned int +hg_hash_string(const char *string) +{ + /* This is the djb2 string hash function */ + + unsigned int result = 5381; + const unsigned char *p; + + p = (const unsigned char *)string; + + while (*p != '\0') { + result = (result << 5) + result + *p; + ++p; + } + return result; +} + +#ifdef __cplusplus +} +#endif + +#endif /* MERCURY_HASH_STRING_H */ diff --git a/src/mercury/src/util/mercury_hash_table.c b/src/mercury/src/util/mercury_hash_table.c new file mode 100644 index 00000000000..b6d29cf5021 --- /dev/null +++ b/src/mercury/src/util/mercury_hash_table.c @@ -0,0 +1,435 @@ +/* +Copyright (c) 2005-2008, Simon Howard + +Permission to use, copy, modify, and/or distribute this software +for any purpose with or without fee is hereby granted, provided +that the above copyright notice and this permission notice appear +in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL +WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE
+AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR
+CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
+LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
+NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/* Hash table implementation */
+
+#include "mercury_hash_table.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+struct hg_hash_table_entry {
+    hg_hash_table_key_t    key;
+    hg_hash_table_value_t  value;
+    hg_hash_table_entry_t *next;
+};
+
+struct hg_hash_table {
+    hg_hash_table_entry_t **        table;
+    unsigned int                    table_size;
+    hg_hash_table_hash_func_t       hash_func;
+    hg_hash_table_equal_func_t      equal_func;
+    hg_hash_table_key_free_func_t   key_free_func;
+    hg_hash_table_value_free_func_t value_free_func;
+    unsigned int                    entries;
+    unsigned int                    prime_index;
+};
+
+/* This is a set of good hash table prime numbers, from:
+ * http://planetmath.org/goodhashtableprimes
+ * Each prime is roughly double the previous value, and as far as
+ * possible from the nearest powers of two. */
+
+static const unsigned int hash_table_primes[] = {
+    193,      389,      769,      1543,      3079,      6151,      12289,     24593,
+    49157,    98317,    196613,   393241,    786433,    1572869,   3145739,   6291469,
+    12582917, 25165843, 50331653, 100663319, 201326611, 402653189, 805306457, 1610612741,
+};
+
+static const unsigned int hash_table_num_primes = sizeof(hash_table_primes) / sizeof(int);
+
+/* Internal function used to allocate the table on hash table creation
+ * and when enlarging the table */
+static int
+hash_table_allocate_table(hg_hash_table_t *hash_table)
+{
+    unsigned int new_table_size;
+
+    /* Determine the table size based on the current prime index.
+     * An attempt is made here to ensure sensible behavior if the
+     * maximum prime is exceeded, but in practice other things are
+     * likely to break long before that happens.
*/ + + if (hash_table->prime_index < hash_table_num_primes) + new_table_size = hash_table_primes[hash_table->prime_index]; + else + new_table_size = hash_table->entries * 10; + + hash_table->table_size = new_table_size; + + /* Allocate the table and initialise to NULL for all entries */ + hash_table->table = + (hg_hash_table_entry_t **)calloc(hash_table->table_size, sizeof(hg_hash_table_entry_t *)); + if (hash_table->table == NULL) + return 0; + + return 1; +} + +/* Free an entry, calling the free functions if there are any registered */ +static void +hash_table_free_entry(hg_hash_table_t *hash_table, hg_hash_table_entry_t *entry) +{ + /* If there is a function registered for freeing keys, use it to free + * the key */ + if (hash_table->key_free_func != NULL) + hash_table->key_free_func(entry->key); + + /* Likewise with the value */ + if (hash_table->value_free_func != NULL) + hash_table->value_free_func(entry->value); + + /* Free the data structure */ + free(entry); +} + +hg_hash_table_t * +hg_hash_table_new(hg_hash_table_hash_func_t hash_func, hg_hash_table_equal_func_t equal_func) +{ + hg_hash_table_t *hash_table; + + /* Allocate a new hash table structure */ + + hash_table = (hg_hash_table_t *)malloc(sizeof(hg_hash_table_t)); + + if (hash_table == NULL) + return NULL; + + hash_table->hash_func = hash_func; + hash_table->equal_func = equal_func; + hash_table->key_free_func = NULL; + hash_table->value_free_func = NULL; + hash_table->entries = 0; + hash_table->prime_index = 0; + + /* Allocate the table */ + if (!hash_table_allocate_table(hash_table)) { + free(hash_table); + + return NULL; + } + + return hash_table; +} + +void +hg_hash_table_free(hg_hash_table_t *hash_table) +{ + hg_hash_table_entry_t *rover; + hg_hash_table_entry_t *next; + unsigned int i; + + /* Free all entries in all chains */ + + for (i = 0; i < hash_table->table_size; ++i) { + rover = hash_table->table[i]; + while (rover != NULL) { + next = rover->next; + hash_table_free_entry(hash_table, rover); + rover = next; + } + } + + /* Free the table */ + free(hash_table->table); + + /* Free the hash table structure */ + free(hash_table); +} + +void +hg_hash_table_register_free_functions(hg_hash_table_t * hash_table, + hg_hash_table_key_free_func_t key_free_func, + hg_hash_table_value_free_func_t value_free_func) +{ + hash_table->key_free_func = key_free_func; + hash_table->value_free_func = value_free_func; +} + +static int +hash_table_enlarge(hg_hash_table_t *hash_table) +{ + hg_hash_table_entry_t **old_table; + unsigned int old_table_size; + unsigned int old_prime_index; + hg_hash_table_entry_t * rover; + hg_hash_table_entry_t * next; + unsigned int entry_index; + unsigned int i; + + /* Store a copy of the old table */ + old_table = hash_table->table; + old_table_size = hash_table->table_size; + old_prime_index = hash_table->prime_index; + + /* Allocate a new, larger table */ + ++hash_table->prime_index; + + if (!hash_table_allocate_table(hash_table)) { + /* Failed to allocate the new table */ + hash_table->table = old_table; + hash_table->table_size = old_table_size; + hash_table->prime_index = old_prime_index; + + return 0; + } + + /* Link all entries from all chains into the new table */ + + for (i = 0; i < old_table_size; ++i) { + rover = old_table[i]; + + while (rover != NULL) { + next = rover->next; + + /* Find the index into the new table */ + entry_index = hash_table->hash_func(rover->key) % hash_table->table_size; + + /* Link this entry into the chain */ + rover->next = hash_table->table[entry_index]; + 
hash_table->table[entry_index] = rover; + + /* Advance to next in the chain */ + rover = next; + } + } + + /* Free the old table */ + free(old_table); + + return 1; +} + +int +hg_hash_table_insert(hg_hash_table_t *hash_table, hg_hash_table_key_t key, hg_hash_table_value_t value) +{ + hg_hash_table_entry_t *rover; + hg_hash_table_entry_t *newentry; + unsigned int entry_index; + + /* If there are too many items in the table with respect to the table + * size, the number of hash collisions increases and performance + * decreases. Enlarge the table size to prevent this happening */ + + if ((hash_table->entries * 3) / hash_table->table_size > 0) { + + /* Table is more than 1/3 full */ + if (!hash_table_enlarge(hash_table)) { + + /* Failed to enlarge the table */ + + return 0; + } + } + + /* Generate the hash of the key and hence the index into the table */ + entry_index = hash_table->hash_func(key) % hash_table->table_size; + + /* Traverse the chain at this location and look for an existing + * entry with the same key */ + rover = hash_table->table[entry_index]; + + while (rover != NULL) { + if (hash_table->equal_func(rover->key, key) != 0) { + + /* Same key: overwrite this entry with new data */ + + /* If there is a value free function, free the old data + * before adding in the new data */ + if (hash_table->value_free_func != NULL) + hash_table->value_free_func(rover->value); + + /* Same with the key: use the new key value and free + * the old one */ + if (hash_table->key_free_func != NULL) + hash_table->key_free_func(rover->key); + + rover->key = key; + rover->value = value; + + /* Finished */ + return 1; + } + rover = rover->next; + } + + /* Not in the hash table yet. Create a new entry */ + newentry = (hg_hash_table_entry_t *)malloc(sizeof(hg_hash_table_entry_t)); + + if (newentry == NULL) + return 0; + + newentry->key = key; + newentry->value = value; + + /* Link into the list */ + newentry->next = hash_table->table[entry_index]; + hash_table->table[entry_index] = newentry; + + /* Maintain the count of the number of entries */ + ++hash_table->entries; + + /* Added successfully */ + return 1; +} + +hg_hash_table_value_t +hg_hash_table_lookup(hg_hash_table_t *hash_table, hg_hash_table_key_t key) +{ + hg_hash_table_entry_t *rover; + unsigned int entry_index; + + /* Generate the hash of the key and hence the index into the table */ + entry_index = hash_table->hash_func(key) % hash_table->table_size; + + /* Walk the chain at this index until the corresponding entry is + * found */ + rover = hash_table->table[entry_index]; + + while (rover != NULL) { + if (hash_table->equal_func(key, rover->key) != 0) { + /* Found the entry. Return the data. */ + return rover->value; + } + rover = rover->next; + } + + /* Not found */ + return HG_HASH_TABLE_NULL; +} + +int +hg_hash_table_remove(hg_hash_table_t *hash_table, hg_hash_table_key_t key) +{ + hg_hash_table_entry_t **rover; + hg_hash_table_entry_t * entry; + unsigned int entry_index; + int result; + + /* Generate the hash of the key and hence the index into the table */ + entry_index = hash_table->hash_func(key) % hash_table->table_size; + + /* Rover points at the pointer which points at the current entry + * in the chain being inspected. ie. the entry in the table, or + * the "next" pointer of the previous entry in the chain. This + * allows us to unlink the entry when we find it. 
*/ + result = 0; + rover = &hash_table->table[entry_index]; + + while (*rover != NULL) { + if (hash_table->equal_func(key, (*rover)->key) != 0) { + /* This is the entry to remove */ + entry = *rover; + + /* Unlink from the list */ + *rover = entry->next; + + /* Destroy the entry structure */ + hash_table_free_entry(hash_table, entry); + + /* Track count of entries */ + --hash_table->entries; + result = 1; + break; + } + + /* Advance to the next entry */ + rover = &((*rover)->next); + } + + return result; +} + +unsigned int +hg_hash_table_num_entries(hg_hash_table_t *hash_table) +{ + return hash_table->entries; +} + +void +hg_hash_table_iterate(hg_hash_table_t *hash_table, hg_hash_table_iter_t *iterator) +{ + unsigned int chain; + + iterator->hash_table = hash_table; + + /* Default value of next if no entries are found. */ + iterator->next_entry = NULL; + + /* Find the first entry */ + for (chain = 0; chain < hash_table->table_size; ++chain) { + if (hash_table->table[chain] != NULL) { + iterator->next_entry = hash_table->table[chain]; + iterator->next_chain = chain; + break; + } + } +} + +int +hg_hash_table_iter_has_more(hg_hash_table_iter_t *iterator) +{ + return iterator->next_entry != NULL; +} + +hg_hash_table_value_t +hg_hash_table_iter_next(hg_hash_table_iter_t *iterator) +{ + hg_hash_table_entry_t *current_entry; + hg_hash_table_t * hash_table; + hg_hash_table_value_t result; + unsigned int chain; + + hash_table = iterator->hash_table; + + /* No more entries? */ + if (iterator->next_entry == NULL) + return HG_HASH_TABLE_NULL; + + /* Result is immediately available */ + current_entry = iterator->next_entry; + result = current_entry->value; + + /* Find the next entry */ + if (current_entry->next != NULL) { + /* Next entry in current chain */ + iterator->next_entry = current_entry->next; + } + else { + /* None left in this chain, so advance to the next chain */ + chain = iterator->next_chain + 1; + + /* Default value if no next chain found */ + iterator->next_entry = NULL; + + while (chain < hash_table->table_size) { + /* Is there anything in this chain? */ + if (hash_table->table[chain] != NULL) { + iterator->next_entry = hash_table->table[chain]; + break; + } + + /* Try the next chain */ + ++chain; + } + + iterator->next_chain = chain; + } + + return result; +} diff --git a/src/mercury/src/util/mercury_hash_table.h b/src/mercury/src/util/mercury_hash_table.h new file mode 100644 index 00000000000..0063f020cdd --- /dev/null +++ b/src/mercury/src/util/mercury_hash_table.h @@ -0,0 +1,242 @@ +/* + +Copyright (c) 2005-2008, Simon Howard + +Permission to use, copy, modify, and/or distribute this software +for any purpose with or without fee is hereby granted, provided +that the above copyright notice and this permission notice appear +in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL +WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE +AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR +CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM +LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, +NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + */ + +/** + * \file mercury_hash_table.h + * + * \brief Hash table. + * + * A hash table stores a set of values which can be addressed by a + * key. Given the key, the corresponding value can be looked up + * quickly. 
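+ *
+ * For example, a minimal sketch (str_hash_cb and str_equal_cb stand for
+ * user-provided callbacks matching \ref hg_hash_table_hash_func_t and
+ * \ref hg_hash_table_equal_func_t):
+ *
+ *   hg_hash_table_t *table = hg_hash_table_new(str_hash_cb, str_equal_cb);
+ *   hg_hash_table_insert(table, (hg_hash_table_key_t) "answer", value);
+ *   value = hg_hash_table_lookup(table, (hg_hash_table_key_t) "answer");
+ *   hg_hash_table_free(table);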
+ *
+ * To create a hash table, use \ref hg_hash_table_new. To destroy a
+ * hash table, use \ref hg_hash_table_free.
+ *
+ * To insert a value into a hash table, use \ref hg_hash_table_insert.
+ *
+ * To remove a value from a hash table, use \ref hg_hash_table_remove.
+ *
+ * To look up a value by its key, use \ref hg_hash_table_lookup.
+ *
+ * To iterate over all values in a hash table, use
+ * \ref hg_hash_table_iterate to initialize a \ref hg_hash_table_iter
+ * structure. Each value can then be read in turn using
+ * \ref hg_hash_table_iter_next and \ref hg_hash_table_iter_has_more.
+ */
+
+#ifndef HG_HASH_TABLE_H
+#define HG_HASH_TABLE_H
+
+#include "mercury_util_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * A hash table structure.
+ */
+
+typedef struct hg_hash_table hg_hash_table_t;
+
+/**
+ * Structure used to iterate over a hash table.
+ */
+
+typedef struct hg_hash_table_iter hg_hash_table_iter_t;
+
+/**
+ * Internal structure representing an entry in a hash table.
+ */
+
+typedef struct hg_hash_table_entry hg_hash_table_entry_t;
+
+/**
+ * A key to look up a value in a \ref hg_hash_table_t.
+ */
+
+typedef void *hg_hash_table_key_t;
+
+/**
+ * A value stored in a \ref hg_hash_table_t.
+ */
+
+typedef void *hg_hash_table_value_t;
+
+/**
+ * Definition of a \ref hg_hash_table_iter.
+ */
+
+struct hg_hash_table_iter {
+    hg_hash_table_t *      hash_table;
+    hg_hash_table_entry_t *next_entry;
+    unsigned int           next_chain;
+};
+
+/**
+ * A null \ref hg_hash_table_value_t.
+ */
+
+#define HG_HASH_TABLE_NULL ((void *)0)
+
+/**
+ * Hash function used to generate hash values for keys used in a hash
+ * table.
+ *
+ * \param value The value to generate a hash value for.
+ * \return The hash value.
+ */
+
+typedef unsigned int (*hg_hash_table_hash_func_t)(hg_hash_table_key_t value);
+
+/**
+ * Function used to compare two keys for equality.
+ *
+ * \return Non-zero if the two keys are equal, zero if the keys are
+ * not equal.
+ */
+
+typedef int (*hg_hash_table_equal_func_t)(hg_hash_table_key_t value1, hg_hash_table_key_t value2);
+
+/**
+ * Type of function used to free keys when entries are removed from a
+ * hash table.
+ */
+
+typedef void (*hg_hash_table_key_free_func_t)(hg_hash_table_key_t value);
+
+/**
+ * Type of function used to free values when entries are removed from a
+ * hash table.
+ */
+
+typedef void (*hg_hash_table_value_free_func_t)(hg_hash_table_value_t value);
+
+/**
+ * Create a new hash table.
+ *
+ * \param hash_func            Function used to generate hash keys for the
+ *                             keys used in the table.
+ * \param equal_func           Function used to test keys used in the table
+ *                             for equality.
+ * \return                     A new hash table structure, or NULL if it
+ *                             was not possible to allocate the new hash
+ *                             table.
+ */
+HG_UTIL_PUBLIC hg_hash_table_t *hg_hash_table_new(hg_hash_table_hash_func_t  hash_func,
+                                                  hg_hash_table_equal_func_t equal_func);
+
+/**
+ * Destroy a hash table.
+ *
+ * \param hash_table           The hash table to destroy.
+ */
+HG_UTIL_PUBLIC void hg_hash_table_free(hg_hash_table_t *hash_table);
+
+/**
+ * Register functions used to free the key and value when an entry is
+ * removed from a hash table.
+ *
+ * \param hash_table           The hash table.
+ * \param key_free_func        Function used to free keys.
+ * \param value_free_func      Function used to free values.
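+ *
+ * For example, a table that owns strdup'ed keys and malloc'ed values can
+ * register free() for both, so that entries are released on removal,
+ * overwrite, and table destruction.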
+ */
+HG_UTIL_PUBLIC void hg_hash_table_register_free_functions(hg_hash_table_t *               hash_table,
+                                                          hg_hash_table_key_free_func_t   key_free_func,
+                                                          hg_hash_table_value_free_func_t value_free_func);
+
+/**
+ * Insert a value into a hash table, overwriting any existing entry
+ * using the same key.
+ *
+ * \param hash_table           The hash table.
+ * \param key                  The key for the new value.
+ * \param value                The value to insert.
+ * \return                     Non-zero if the value was added successfully,
+ *                             or zero if it was not possible to allocate
+ *                             memory for the new entry.
+ */
+HG_UTIL_PUBLIC int hg_hash_table_insert(hg_hash_table_t *hash_table, hg_hash_table_key_t key,
+                                        hg_hash_table_value_t value);
+
+/**
+ * Look up a value in a hash table by key.
+ *
+ * \param hash_table           The hash table.
+ * \param key                  The key of the value to look up.
+ * \return                     The value, or \ref HG_HASH_TABLE_NULL if there
+ *                             is no value with that key in the hash table.
+ */
+HG_UTIL_PUBLIC hg_hash_table_value_t hg_hash_table_lookup(hg_hash_table_t *   hash_table,
+                                                          hg_hash_table_key_t key);
+
+/**
+ * Remove a value from a hash table.
+ *
+ * \param hash_table           The hash table.
+ * \param key                  The key of the value to remove.
+ * \return                     Non-zero if a key was removed, or zero if the
+ *                             specified key was not found in the hash table.
+ */
+HG_UTIL_PUBLIC int hg_hash_table_remove(hg_hash_table_t *hash_table, hg_hash_table_key_t key);
+
+/**
+ * Retrieve the number of entries in a hash table.
+ *
+ * \param hash_table           The hash table.
+ * \return                     The number of entries in the hash table.
+ */
+HG_UTIL_PUBLIC unsigned int hg_hash_table_num_entries(hg_hash_table_t *hash_table);
+
+/**
+ * Initialise a \ref hg_hash_table_iter_t to iterate over a hash table.
+ *
+ * \param hash_table           The hash table.
+ * \param iter                 Pointer to an iterator structure to
+ *                             initialise.
+ */
+HG_UTIL_PUBLIC void hg_hash_table_iterate(hg_hash_table_t *hash_table, hg_hash_table_iter_t *iter);
+
+/**
+ * Determine if there are more values in the hash table to iterate over.
+ *
+ * \param iterator             The hash table iterator.
+ * \return                     Zero if there are no more values to iterate
+ *                             over, non-zero if there are more values to
+ *                             iterate over.
+ */
+HG_UTIL_PUBLIC int hg_hash_table_iter_has_more(hg_hash_table_iter_t *iterator);
+
+/**
+ * Using a hash table iterator, retrieve the next value.
+ *
+ * \param iterator             The hash table iterator.
+ * \return                     The next value from the hash table, or
+ *                             \ref HG_HASH_TABLE_NULL if there are no more
+ *                             values to iterate over.
+ */
+HG_UTIL_PUBLIC hg_hash_table_value_t hg_hash_table_iter_next(hg_hash_table_iter_t *iterator);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* HG_HASH_TABLE_H */
diff --git a/src/mercury/src/util/mercury_list.h b/src/mercury/src/util/mercury_list.h
new file mode 100644
index 00000000000..18ce93af8d3
--- /dev/null
+++ b/src/mercury/src/util/mercury_list.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy,
+ * UChicago Argonne, LLC and The HDF Group.
+ * All rights reserved.
+ *
+ * The full copyright notice, including terms governing use, modification,
+ * and redistribution, is contained in the COPYING file that can be
+ * found at the root of the source code distribution tree.
+ */
+
+/* Code below is derived from sys/queue.h which follows the below notice:
+ *
+ * Copyright (c) 1991, 1993
+ *      The Regents of the University of California.  All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)queue.h 8.5 (Berkeley) 8/20/94 + */ + +#ifndef MERCURY_LIST_H +#define MERCURY_LIST_H + +#define HG_LIST_HEAD_INITIALIZER(name) \ + { \ + NULL \ + } + +#define HG_LIST_HEAD_INIT(struct_head_name, var_name) \ + struct struct_head_name var_name = HG_LIST_HEAD_INITIALIZER(var_name) + +#define HG_LIST_HEAD_DECL(struct_head_name, struct_entry_name) \ + struct struct_head_name { \ + struct struct_entry_name *head; \ + } + +#define HG_LIST_HEAD(struct_entry_name) \ + struct { \ + struct struct_entry_name *head; \ + } + +#define HG_LIST_ENTRY(struct_entry_name) \ + struct { \ + struct struct_entry_name * next; \ + struct struct_entry_name **prev; \ + } + +#define HG_LIST_INIT(head_ptr) \ + do { \ + (head_ptr)->head = NULL; \ + } while (/*CONSTCOND*/ 0) + +#define HG_LIST_IS_EMPTY(head_ptr) ((head_ptr)->head == NULL) + +#define HG_LIST_FIRST(head_ptr) ((head_ptr)->head) + +#define HG_LIST_NEXT(entry_ptr, entry_field_name) ((entry_ptr)->entry_field_name.next) + +#define HG_LIST_INSERT_AFTER(list_entry_ptr, entry_ptr, entry_field_name) \ + do { \ + if (((entry_ptr)->entry_field_name.next = (list_entry_ptr)->entry_field_name.next) != NULL) \ + (list_entry_ptr)->entry_field_name.next->entry_field_name.prev = \ + &(entry_ptr)->entry_field_name.next; \ + (list_entry_ptr)->entry_field_name.next = (entry_ptr); \ + (entry_ptr)->entry_field_name.prev = &(list_entry_ptr)->entry_field_name.next; \ + } while (/*CONSTCOND*/ 0) + +#define HG_LIST_INSERT_BEFORE(list_entry_ptr, entry_ptr, entry_field_name) \ + do { \ + (entry_ptr)->entry_field_name.prev = (list_entry_ptr)->entry_field_name.prev; \ + (entry_ptr)->entry_field_name.next = (list_entry_ptr); \ + *(list_entry_ptr)->entry_field_name.prev = (entry_ptr); \ + (list_entry_ptr)->entry_field_name.prev = &(entry_ptr)->entry_field_name.next; \ + } while (/*CONSTCOND*/ 0) + +#define HG_LIST_INSERT_HEAD(head_ptr, entry_ptr, entry_field_name) \ + do { \ + if (((entry_ptr)->entry_field_name.next = (head_ptr)->head) != NULL) \ + (head_ptr)->head->entry_field_name.prev = 
&(entry_ptr)->entry_field_name.next;             \
+        (head_ptr)->head = (entry_ptr);                                                                   \
+        (entry_ptr)->entry_field_name.prev = &(head_ptr)->head;                                           \
+    } while (/*CONSTCOND*/ 0)
+
+/* TODO would be nice to not have any condition */
+#define HG_LIST_REMOVE(entry_ptr, entry_field_name)                                                        \
+    do {                                                                                                   \
+        if ((entry_ptr)->entry_field_name.next != NULL)                                                    \
+            (entry_ptr)->entry_field_name.next->entry_field_name.prev =                                   \
+                (entry_ptr)->entry_field_name.prev;                                                       \
+        *(entry_ptr)->entry_field_name.prev = (entry_ptr)->entry_field_name.next;                         \
+    } while (/*CONSTCOND*/ 0)
+
+#define HG_LIST_FOREACH(var, head_ptr, entry_field_name)                                                   \
+    for ((var) = ((head_ptr)->head); (var); (var) = ((var)->entry_field_name.next))
+
+#endif /* MERCURY_LIST_H */
diff --git a/src/mercury/src/util/mercury_log.c b/src/mercury/src/util/mercury_log.c
new file mode 100644
index 00000000000..52dc675320e
--- /dev/null
+++ b/src/mercury/src/util/mercury_log.c
@@ -0,0 +1,487 @@
+/*
+ * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy,
+ * UChicago Argonne, LLC and The HDF Group.
+ * All rights reserved.
+ *
+ * The full copyright notice, including terms governing use, modification,
+ * and redistribution, is contained in the COPYING file that can be
+ * found at the root of the source code distribution tree.
+ */
+
+#include "mercury_log.h"
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/****************/
+/* Local Macros */
+/****************/
+
+/* Make sure it executes first */
+#ifdef HG_UTIL_HAS_ATTR_CONSTRUCTOR_PRIORITY
+#define HG_UTIL_CONSTRUCTOR_1 __attribute__((constructor(101)))
+#else
+#define HG_UTIL_CONSTRUCTOR_1
+#endif
+
+/* Destructor (used to finalize log outlets) */
+#define HG_UTIL_DESTRUCTOR __attribute__((destructor))
+
+/* Max number of subsystems that can be tracked */
+#define HG_LOG_SUBSYS_MAX (16)
+
+/* Max length of subsystem name (without trailing \0) */
+#define HG_LOG_SUBSYS_NAME_MAX (16)
+
+/* Log buffer size */
+#define HG_LOG_BUF_MAX (256)
+
+#ifdef HG_UTIL_HAS_LOG_COLOR
+#define HG_LOG_ESC     "\033"
+#define HG_LOG_RESET   HG_LOG_ESC "[0m"
+#define HG_LOG_REG     HG_LOG_ESC "[0;"
+#define HG_LOG_BOLD    HG_LOG_ESC "[1;"
+#define HG_LOG_RED     "31m"
+#define HG_LOG_GREEN   "32m"
+#define HG_LOG_YELLOW  "33m"
+#define HG_LOG_BLUE    "34m"
+#define HG_LOG_MAGENTA "35m"
+#define HG_LOG_CYAN    "36m"
+#endif
+
+/********************/
+/* Local Prototypes */
+/********************/
+
+/* Init logs */
+static void hg_log_init(void) HG_UTIL_CONSTRUCTOR_1;
+
+/* Finalize logs */
+static void hg_log_finalize(void) HG_UTIL_DESTRUCTOR;
+
+/* Init log level */
+static void hg_log_init_level(void);
+
+/* Init log subsys */
+static void hg_log_init_subsys(void);
+
+/* Reset all log levels */
+static void hg_log_outlet_reset_all(void);
+
+/* Free all attached logs */
+static void hg_log_free_dlogs(void);
+
+/* Is log active */
+static int hg_log_outlet_active(const char *name);
+
+/* Update log level of outlet */
+static void hg_log_outlet_update_level(struct hg_log_outlet *hg_log_outlet);
+
+/* Update level of all outlets */
+static void hg_log_outlet_update_all(void);
+
+/*******************/
+/* Local Variables */
+/*******************/
+
+/* Default log outlet */
+HG_LOG_OUTLET_DECL(hg) = HG_LOG_OUTLET_INITIALIZER(hg, HG_LOG_OFF, NULL, NULL);
+
+/* List of all registered outlets */
+static HG_QUEUE_HEAD(hg_log_outlet) hg_log_outlets_g = HG_QUEUE_HEAD_INITIALIZER(hg_log_outlets_g);
+
+/* Default 'printf' log function */
+static int (*hg_log_func_g)(FILE *stream, const char *format, ...)
= fprintf; + +/* Default log level */ +static enum hg_log_level hg_log_level_g = HG_LOG_LEVEL_ERROR; + +/* Default log subsystems */ +static char hg_log_subsys_g[HG_LOG_SUBSYS_MAX][HG_LOG_SUBSYS_NAME_MAX + 1] = {{"\0"}}; + +/* Log level string table */ +#define X(a, b, c) b, +static const char *const hg_log_level_name_g[] = {HG_LOG_LEVELS}; +#undef X + +/* Standard log streams */ +#define X(a, b, c) c, +static FILE **const hg_log_std_streams_g[] = {HG_LOG_LEVELS}; +#undef X +static FILE *hg_log_streams_g[HG_LOG_LEVEL_MAX] = {NULL}; + +/* Log colors */ +#ifdef HG_UTIL_HAS_LOG_COLOR +static const char *const hg_log_colors_g[] = {"", HG_LOG_RED, HG_LOG_MAGENTA, HG_LOG_BLUE, HG_LOG_BLUE, ""}; +#endif + +/* Init */ +#ifndef HG_UTIL_HAS_ATTR_CONSTRUCTOR_PRIORITY +static hg_util_bool_t hg_log_init_g = HG_UTIL_FALSE; +#endif + +/*---------------------------------------------------------------------------*/ +static void +hg_log_init(void) +{ + hg_log_init_level(); + hg_log_init_subsys(); + + /* Register top outlet */ + hg_log_outlet_register(&HG_LOG_OUTLET(hg)); +} + +/*---------------------------------------------------------------------------*/ +static void +hg_log_finalize(void) +{ + hg_log_free_dlogs(); +} + +/*---------------------------------------------------------------------------*/ +static void +hg_log_init_level(void) +{ + const char *log_level = getenv("HG_LOG_LEVEL"); + + /* Override default log level */ + if (log_level == NULL) + return; + + hg_log_set_level(hg_log_name_to_level(log_level)); +} + +/*---------------------------------------------------------------------------*/ +static void +hg_log_init_subsys(void) +{ + const char *log_subsys = getenv("HG_LOG_SUBSYS"); + + if (log_subsys == NULL) + return; + + // fprintf(stderr, "subsys: %s\n", log_subsys); + hg_log_set_subsys(log_subsys); +} + +/*---------------------------------------------------------------------------*/ +static void +hg_log_outlet_reset_all(void) +{ + struct hg_log_outlet *outlet; + int i; + + /* Reset levels */ + HG_QUEUE_FOREACH(outlet, &hg_log_outlets_g, entry) + outlet->level = HG_LOG_LEVEL_NONE; + + /* Reset subsys */ + for (i = 0; i < HG_LOG_SUBSYS_MAX; i++) + strcpy(hg_log_subsys_g[i], "\0"); +} + +/*---------------------------------------------------------------------------*/ +static void +hg_log_free_dlogs(void) +{ + struct hg_log_outlet *outlet; + + /* Free logs if any was attached */ + HG_QUEUE_FOREACH(outlet, &hg_log_outlets_g, entry) + if (outlet->debug_log) + hg_dlog_free(outlet->debug_log); +} + +/*---------------------------------------------------------------------------*/ +static int +hg_log_outlet_active(const char *name) +{ + int i = 0; + + while (hg_log_subsys_g[i][0] != '\0' && i < HG_LOG_SUBSYS_MAX) { + /* Force a subsystem to be inactive */ + if ((hg_log_subsys_g[i][0] == '~') && (strcmp(&hg_log_subsys_g[i][1], name) == 0)) + return -1; + + if (strcmp(hg_log_subsys_g[i], name) == 0) { + return 1; + } + i++; + } + return 0; +} + +/*---------------------------------------------------------------------------*/ +static void +hg_log_outlet_update_level(struct hg_log_outlet *hg_log_outlet) +{ + int active = hg_log_outlet_active(hg_log_outlet->name); + + if (active > 0 || hg_log_outlet->state == HG_LOG_ON) + hg_log_outlet->level = hg_log_level_g; + else if (!(active < 0) && hg_log_outlet->state == HG_LOG_PASS && hg_log_outlet->parent) + hg_log_outlet->level = hg_log_outlet->parent->level; + else + hg_log_outlet->level = HG_LOG_LEVEL_NONE; +} + 
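+/* Usage sketch (application and subsystem names are assumed): the two
+ * environment variables read above select the log verbosity and the active
+ * subsystems at init time, e.g.
+ *
+ *   HG_LOG_LEVEL=debug HG_LOG_SUBSYS="na,~cls" ./my_app
+ *
+ * enables debug logging for the "na" outlet and forces "cls" off. */
+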
+/*---------------------------------------------------------------------------*/ +static void +hg_log_outlet_update_all(void) +{ + struct hg_log_outlet *hg_log_outlet; + + HG_QUEUE_FOREACH(hg_log_outlet, &hg_log_outlets_g, entry) + hg_log_outlet_update_level(hg_log_outlet); +} + +/*---------------------------------------------------------------------------*/ +void +hg_log_set_level(enum hg_log_level log_level) +{ + hg_log_level_g = log_level; + + hg_log_outlet_update_all(); +} + +/*---------------------------------------------------------------------------*/ +enum hg_log_level +hg_log_get_level(void) +{ + return hg_log_level_g; +} + +/*---------------------------------------------------------------------------*/ +void +hg_log_set_subsys(const char *log_subsys) +{ + char *subsys, *current, *next; + int i = 0; + + subsys = strdup(log_subsys); + if (!subsys) + return; + + current = subsys; + + /* Reset all */ + hg_log_outlet_reset_all(); + + /* Enable each of the subsys */ + while (strtok_r(current, ",", &next) && i < HG_LOG_SUBSYS_MAX) { + int j, exist = 0; + + /* Skip duplicates */ + for (j = 0; j < i; j++) { + if (strcmp(current, hg_log_subsys_g[j]) == 0) { + exist = 1; + break; + } + } + + if (!exist) { + strncpy(hg_log_subsys_g[i], current, HG_LOG_SUBSYS_NAME_MAX); + i++; + } + current = next; + } + + /* Update outlets */ + hg_log_outlet_update_all(); + + free(subsys); +} + +/*---------------------------------------------------------------------------*/ +const char * +hg_log_get_subsys(void) +{ + static char log_subsys[HG_LOG_SUBSYS_MAX * (HG_LOG_SUBSYS_NAME_MAX + 2)] = "\0"; + char * p = log_subsys; + int i = 0; + + while (hg_log_subsys_g[i][0] != '\0' && i < HG_LOG_SUBSYS_MAX) { + strcpy(p, hg_log_subsys_g[i]); + p += strlen(hg_log_subsys_g[i]); + *p = ','; + p++; + i++; + } + if (i > 0) + *(p - 1) = '\0'; + + return (const char *)log_subsys; +} + +/*---------------------------------------------------------------------------*/ +void +hg_log_set_subsys_level(const char *subsys, enum hg_log_level log_level) +{ + const char *log_subsys = hg_log_get_subsys(); + char * new_subsys = NULL; + const char *new_subsys_ptr; + + if (strcmp(log_subsys, "") != 0) { + new_subsys = malloc(strlen(log_subsys) + strlen(subsys) + 2); + if (!new_subsys) + return; + strcpy(new_subsys, log_subsys); + strcat(new_subsys, ","); + strcat(new_subsys, subsys); + new_subsys_ptr = new_subsys; + } + else + new_subsys_ptr = subsys; + + hg_log_set_level(log_level); + hg_log_set_subsys(new_subsys_ptr); + + free(new_subsys); +} + +/*---------------------------------------------------------------------------*/ +enum hg_log_level +hg_log_name_to_level(const char *log_level) +{ + enum hg_log_level l = 0; + + if (!log_level) + return HG_LOG_LEVEL_NONE; + + while (strcasecmp(hg_log_level_name_g[l], log_level) != 0 && l != HG_LOG_LEVEL_MAX) + l++; + + if (l == HG_LOG_LEVEL_MAX) { + fprintf(stderr, "Warning: invalid log level was passed, defaulting to none\n"); + return HG_LOG_LEVEL_NONE; + } + + return l; +} + +/*---------------------------------------------------------------------------*/ +void +hg_log_set_func(int (*log_func)(FILE *stream, const char *format, ...)) +{ + hg_log_func_g = log_func; +} + +/*---------------------------------------------------------------------------*/ +void +hg_log_set_stream_debug(FILE *stream) +{ + hg_log_streams_g[HG_LOG_LEVEL_DEBUG] = stream; +} + +/*---------------------------------------------------------------------------*/ +FILE * +hg_log_get_stream_debug(void) +{ + return 
hg_log_streams_g[HG_LOG_LEVEL_DEBUG] ? hg_log_streams_g[HG_LOG_LEVEL_DEBUG] + : *hg_log_std_streams_g[HG_LOG_LEVEL_DEBUG]; +} + +/*---------------------------------------------------------------------------*/ +void +hg_log_set_stream_warning(FILE *stream) +{ + hg_log_streams_g[HG_LOG_LEVEL_WARNING] = stream; +} + +/*---------------------------------------------------------------------------*/ +FILE * +hg_log_get_stream_warning(void) +{ + return hg_log_streams_g[HG_LOG_LEVEL_WARNING] ? hg_log_streams_g[HG_LOG_LEVEL_WARNING] + : *hg_log_std_streams_g[HG_LOG_LEVEL_WARNING]; +} + +/*---------------------------------------------------------------------------*/ +void +hg_log_set_stream_error(FILE *stream) +{ + hg_log_streams_g[HG_LOG_LEVEL_ERROR] = stream; +} + +/*---------------------------------------------------------------------------*/ +FILE * +hg_log_get_stream_error(void) +{ + return hg_log_streams_g[HG_LOG_LEVEL_ERROR] ? hg_log_streams_g[HG_LOG_LEVEL_ERROR] + : *hg_log_std_streams_g[HG_LOG_LEVEL_ERROR]; +} + +/*---------------------------------------------------------------------------*/ +void +hg_log_outlet_register(struct hg_log_outlet *hg_log_outlet) +{ +#ifndef HG_UTIL_HAS_ATTR_CONSTRUCTOR_PRIORITY + if (!hg_log_init_g) { + /* Set here to prevent infinite loop */ + hg_log_init_g = HG_UTIL_TRUE; + hg_log_init(); + } +#endif + + hg_log_outlet_update_level(hg_log_outlet); + + /* Inherit debug log if not set and parent has one */ + if (!hg_log_outlet->debug_log && hg_log_outlet->parent && hg_log_outlet->parent->debug_log) + hg_log_outlet->debug_log = hg_log_outlet->parent->debug_log; + + HG_QUEUE_PUSH_TAIL(&hg_log_outlets_g, hg_log_outlet, entry); +} + +/*---------------------------------------------------------------------------*/ +void +hg_log_write(struct hg_log_outlet *hg_log_outlet, enum hg_log_level log_level, const char *file, + unsigned int line, const char *func, const char *format, ...) +{ + char buf[HG_LOG_BUF_MAX]; + FILE * stream = NULL; + const char *level_name = NULL; +#ifdef HG_UTIL_HAS_LOG_COLOR + const char *color = hg_log_colors_g[log_level]; +#endif + hg_time_t tv; + va_list ap; + + if (!(log_level > HG_LOG_LEVEL_NONE && log_level < HG_LOG_LEVEL_MAX)) + return; + + hg_time_get_current(&tv); + level_name = hg_log_level_name_g[log_level]; + stream = hg_log_streams_g[log_level] ? hg_log_streams_g[log_level] : *hg_log_std_streams_g[log_level]; +#ifdef HG_UTIL_HAS_LOG_COLOR + color = hg_log_colors_g[log_level]; +#endif + + va_start(ap, format); + vsnprintf(buf, HG_LOG_BUF_MAX, format, ap); + va_end(ap); + +#ifdef HG_UTIL_HAS_LOG_COLOR + /* Print using logging function */ + hg_log_func_g(stream, + "# %s%s[%lf] %s%s%s->%s%s: %s%s[%s]%s%s %s:%d %s\n" + "## %s%s%s()%s: %s%s%s%s\n", + HG_LOG_REG, HG_LOG_GREEN, hg_time_to_double(tv), HG_LOG_REG, HG_LOG_YELLOW, "mercury", + hg_log_outlet->name, HG_LOG_RESET, HG_LOG_BOLD, color, level_name, HG_LOG_REG, color, file, + line, HG_LOG_RESET, HG_LOG_REG, HG_LOG_YELLOW, func, HG_LOG_RESET, HG_LOG_REG, + log_level != HG_LOG_LEVEL_DEBUG ? 
color : HG_LOG_RESET, buf, HG_LOG_RESET); +#else + /* Print using logging function */ + hg_log_func_g(stream, + "# [%lf] %s->%s: [%s] %s:%d\n" + " # %s(): %s\n", + hg_time_to_double(tv), "mercury", hg_log_outlet->name, level_name, file, line, func, buf); +#endif + + if (log_level == HG_LOG_LEVEL_ERROR && hg_log_outlet->debug_log && + hg_log_outlet->level >= HG_LOG_LEVEL_MIN_DEBUG) { + hg_dlog_dump(hg_log_outlet->debug_log, hg_log_func_g, stream, 0); + hg_dlog_resetlog(hg_log_outlet->debug_log); + } +} diff --git a/src/mercury/src/util/mercury_log.h b/src/mercury/src/util/mercury_log.h new file mode 100644 index 00000000000..bb1b52fc209 --- /dev/null +++ b/src/mercury/src/util/mercury_log.h @@ -0,0 +1,399 @@ +/* + * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy, + * UChicago Argonne, LLC and The HDF Group. + * All rights reserved. + * + * The full copyright notice, including terms governing use, modification, + * and redistribution, is contained in the COPYING file that can be + * found at the root of the source code distribution tree. + */ + +/* + * Copyright (c) 2004, 2005, 2006, 2007 David Young. All rights reserved. + * + * Copyright (c) 2004 Urbana-Champaign Independent Media Center. + * All rights reserved. + * + * + * Portions of hlog are Copyright (c) David Young. The applicable copyright + * notice and licensing terms are reproduced here: + * + * Copyright (c) 2004, 2005, 2006, 2007 David Young. All rights reserved. + * + * This file contains code contributed by David Young. + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY DAVID YOUNG ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL DAVID + * YOUNG BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY + * OF SUCH DAMAGE. + * + * ----------------------------------------------------------------------------- + * ----------------------------------------------------------------------------- + * + * Portions of hlog are Copyright (c) Urbana-Champaign Independent Media Center. + * The applicable copyright notice and licensing terms are reproduced here: + * + * Copyright (c) 2004 Urbana-Champaign Independent Media Center. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above
+ *    copyright notice, this list of conditions and the following
+ *    disclaimer in the documentation and/or other materials provided
+ *    with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE URBANA-CHAMPAIGN INDEPENDENT
+ * MEDIA CENTER ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE URBANA-CHAMPAIGN INDEPENDENT
+ * MEDIA CENTER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef MERCURY_LOG_H
+#define MERCURY_LOG_H
+
+#include "mercury_dlog.h"
+#include "mercury_queue.h"
+#include "mercury_util_config.h"
+
+#include <stdio.h>
+
+/*****************/
+/* Public Macros */
+/*****************/
+
+/* For compatibility */
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ < 199901L)
+#if defined(__GNUC__) && (__GNUC__ >= 2)
+#define __func__ __FUNCTION__
+#else
+#define __func__ ""
+#endif
+#elif defined(_WIN32)
+#define __func__ __FUNCTION__
+#endif
+
+/* Cat macro */
+#define HG_UTIL_CAT(x, y) x##y
+
+/* Stringify macro */
+#define HG_UTIL_STRINGIFY(x) #x
+
+/* Constructor (used to initialize log outlets) */
+#define HG_UTIL_CONSTRUCTOR __attribute__((constructor))
+
+/* Available log levels, additional log levels should be added to that list by
+ * order of verbosity. Format is:
+ * - enum type
+ * - level name
+ * - default output
+ *
+ * error:     print error level logs
+ * warning:   print warning level logs
+ * min_debug: store minimal debug information and defer printing until error
+ * debug:     print debug level logs
+ */
+#define HG_LOG_LEVELS                                                                                      \
+    X(HG_LOG_LEVEL_NONE, "", NULL)                  /*!< no log */                                         \
+    X(HG_LOG_LEVEL_ERROR, "error", &stderr)         /*!< error log type */                                 \
+    X(HG_LOG_LEVEL_WARNING, "warning", &stdout)     /*!< warning log type */                               \
+    X(HG_LOG_LEVEL_MIN_DEBUG, "min_debug", &stdout) /*!< debug log type */                                 \
+    X(HG_LOG_LEVEL_DEBUG, "debug", &stdout)         /*!< debug log type */                                 \
+    X(HG_LOG_LEVEL_MAX, "", NULL)
+
+/* HG_LOG_OUTLET: global variable name of log outlet. */
+#define HG_LOG_OUTLET(name) HG_UTIL_CAT(name, _log_outlet_g)
+
+/* HG_LOG_OUTLET_DECL: declare an outlet. */
+#define HG_LOG_OUTLET_DECL(name) struct hg_log_outlet HG_LOG_OUTLET(name)
+
+/*
+ * HG_LOG_OUTLET_INITIALIZER: initializer for a log in a global variable.
+ * (parent and debug_log are optional and can be set to NULL)
+ */
+#define HG_LOG_OUTLET_INITIALIZER(name, state, parent, debug_log)                                          \
+    {                                                                                                      \
+        HG_UTIL_STRINGIFY(name), state, HG_LOG_LEVEL_NONE, parent, debug_log,                              \
+        {                                                                                                  \
+            NULL                                                                                           \
+        }                                                                                                  \
+    }
+
+/* HG_LOG_OUTLET_SUBSYS_INITIALIZER: initializer for a sub-system log. */
+#define HG_LOG_OUTLET_SUBSYS_INITIALIZER(name, parent_name)                                                \
+    HG_LOG_OUTLET_INITIALIZER(name, HG_LOG_PASS, &HG_LOG_OUTLET(parent_name), NULL)
+
+/* HG_LOG_OUTLET_SUBSYS_STATE_INITIALIZER: initializer for a sub-system log with
+ * a defined state.
*/ +#define HG_LOG_OUTLET_SUBSYS_STATE_INITIALIZER(name, parent_name, state) \ + HG_LOG_OUTLET_INITIALIZER(name, state, &HG_LOG_OUTLET(parent_name), NULL) + +/* HG_LOG_SUBSYS_REGISTER: register a name */ +#define HG_LOG_SUBSYS_REGISTER(name) \ + static void HG_UTIL_CAT(hg_log_outlet_, name)(void) HG_UTIL_CONSTRUCTOR; \ + static void HG_UTIL_CAT(hg_log_outlet_, name)(void) \ + { \ + hg_log_outlet_register(&HG_LOG_OUTLET(name)); \ + } \ + /* Keep unused prototype to use semicolon at end of macro */ \ + void hg_log_outlet_##name##_unused(void) + +/* HG_LOG_SUBSYS_DECL_REGISTER: declare and register a log outlet. */ +#define HG_LOG_SUBSYS_DECL_REGISTER(name, parent_name) \ + struct hg_log_outlet HG_LOG_OUTLET(name) = HG_LOG_OUTLET_SUBSYS_INITIALIZER(name, parent_name); \ + HG_LOG_SUBSYS_REGISTER(name) + +/* HG_LOG_SUBSYS_DECL_STATE_REGISTER: declare and register a log outlet and + * enforce an init state. */ +#define HG_LOG_SUBSYS_DECL_STATE_REGISTER(name, parent_name, state) \ + struct hg_log_outlet HG_LOG_OUTLET(name) = \ + HG_LOG_OUTLET_SUBSYS_STATE_INITIALIZER(name, parent_name, state); \ + HG_LOG_SUBSYS_REGISTER(name) + +/* Log macro */ +#define HG_LOG_WRITE(name, log_level, ...) \ + do { \ + if (HG_LOG_OUTLET(name).level < log_level) \ + break; \ + hg_log_write(&HG_LOG_OUTLET(name), log_level, __FILE__, __LINE__, __func__, __VA_ARGS__); \ + } while (0) + +/* Log macro */ +#define HG_LOG_WRITE_DEBUG(name, debug_func, ...) \ + do { \ + if (HG_LOG_OUTLET(name).level < HG_LOG_LEVEL_MIN_DEBUG) \ + break; \ + if (HG_LOG_OUTLET(name).level >= HG_LOG_LEVEL_MIN_DEBUG && HG_LOG_OUTLET(name).debug_log) \ + hg_dlog_addlog(HG_LOG_OUTLET(name).debug_log, __FILE__, __LINE__, __func__, NULL, NULL); \ + if (HG_LOG_OUTLET(name).level == HG_LOG_LEVEL_DEBUG) { \ + hg_log_write(&HG_LOG_OUTLET(name), HG_LOG_LEVEL_DEBUG, __FILE__, __LINE__, __func__, \ + __VA_ARGS__); \ + debug_func; \ + } \ + } while (0) + +/** + * Additional macros for debug log support. + */ + +/* HG_LOG_DEBUG_DLOG: global variable name of debug log. */ +#define HG_LOG_DEBUG_DLOG(name) HG_UTIL_CAT(name, _dlog_g) + +/* HG_LOG_DEBUG_LE: global variable name of debug log entries. */ +#define HG_LOG_DEBUG_LE(name) HG_UTIL_CAT(name, _dlog_entries_g) + +/* HG_LOG_DEBUG_DECL_DLOG: declare new debug log. */ +#define HG_LOG_DEBUG_DECL_DLOG(name) struct hg_dlog HG_LOG_DEBUG_DLOG(name) + +/* HG_LOG_DEBUG_DECL_LE: declare array of debug log entries. */ +#define HG_LOG_DEBUG_DECL_LE(name, size) struct hg_dlog_entry HG_LOG_DEBUG_LE(name)[size] + +/* HG_LOG_DLOG_INITIALIZER: initializer for a debug log */ +#define HG_LOG_DLOG_INITIALIZER(name, size) \ + HG_DLOG_INITIALIZER(HG_UTIL_STRINGIFY(name), HG_LOG_DEBUG_LE(name), size, 1) + +/* HG_LOG_OUTLET_SUBSYS_DLOG_INITIALIZER: initializer for a sub-system with + * debug log. */ +#define HG_LOG_OUTLET_SUBSYS_DLOG_INITIALIZER(name, parent_name) \ + HG_LOG_OUTLET_INITIALIZER(name, HG_LOG_PASS, &HG_LOG_OUTLET(parent_name), &HG_LOG_DEBUG_DLOG(name)) + +/* HG_LOG_SUBSYS_DLOG_DECL_REGISTER: declare and register a log outlet with + * debug log. 
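+ * For example (a sketch; "rpc" is a hypothetical subsystem), a source file
+ * can set up an outlet with an attached 64-entry debug log via:
+ *
+ *   HG_LOG_DEBUG_DECL_LE(rpc, 64);
+ *   HG_LOG_DEBUG_DECL_DLOG(rpc) = HG_LOG_DLOG_INITIALIZER(rpc, 64);
+ *   HG_LOG_SUBSYS_DLOG_DECL_REGISTER(rpc, hg);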
+ */
+#define HG_LOG_SUBSYS_DLOG_DECL_REGISTER(name, parent_name)                                                \
+    struct hg_log_outlet HG_LOG_OUTLET(name) = HG_LOG_OUTLET_SUBSYS_DLOG_INITIALIZER(name, parent_name);  \
+    HG_LOG_SUBSYS_REGISTER(name)
+
+/* HG_LOG_ADD_COUNTER32: add 32-bit debug log counter */
+#define HG_LOG_ADD_COUNTER32(name, counter_ptr, counter_name, counter_desc)                                \
+    hg_dlog_mkcount32(HG_LOG_OUTLET(name).debug_log, counter_ptr, counter_name, counter_desc)
+
+/* HG_LOG_ADD_COUNTER64: add 64-bit debug log counter */
+#define HG_LOG_ADD_COUNTER64(name, counter_ptr, counter_name, counter_desc)                                \
+    hg_dlog_mkcount64(HG_LOG_OUTLET(name).debug_log, counter_ptr, counter_name, counter_desc)
+
+/*************************************/
+/* Public Type and Struct Definition */
+/*************************************/
+
+#define X(a, b, c) a,
+/* Log levels */
+enum hg_log_level { HG_LOG_LEVELS };
+#undef X
+
+/* Log states */
+enum hg_log_state { HG_LOG_PASS, HG_LOG_OFF, HG_LOG_ON };
+
+/* Log outlet */
+struct hg_log_outlet {
+    const char *          name;      /* Name of outlet */
+    enum hg_log_state     state;     /* Init state of outlet */
+    enum hg_log_level     level;     /* Level of outlet */
+    struct hg_log_outlet *parent;    /* Parent of outlet */
+    struct hg_dlog *      debug_log; /* Debug log to use */
+    HG_QUEUE_ENTRY(hg_log_outlet) entry; /* List entry */
+};
+
+/*********************/
+/* Public Prototypes */
+/*********************/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Set the global log level.
+ *
+ * \param log_level [IN] enum log level type
+ */
+HG_UTIL_PUBLIC void hg_log_set_level(enum hg_log_level log_level);
+
+/**
+ * Get the global log level.
+ *
+ * \return global log_level
+ */
+HG_UTIL_PUBLIC enum hg_log_level hg_log_get_level(void);
+
+/**
+ * Set the log subsystems from a string. Format is: subsys1,subsys2,...
+ * Subsys can also be forced to be disabled with "~", e.g., ~subsys1
+ *
+ * \param log_subsys [IN] null terminated string
+ */
+HG_UTIL_PUBLIC void hg_log_set_subsys(const char *log_subsys);
+
+/**
+ * Get the log subsystems as a string. Format is similar to hg_log_set_subsys().
+ * Buffer returned is static.
+ *
+ * \return string of enabled log subsystems
+ */
+HG_UTIL_PUBLIC const char *hg_log_get_subsys(void);
+
+/**
+ * Set a specific subsystem's log level.
+ */
+HG_UTIL_PUBLIC void hg_log_set_subsys_level(const char *subsys, enum hg_log_level log_level);
+
+/**
+ * Get the log level from a string.
+ *
+ * \param log_level [IN] null terminated string
+ *
+ * \return log type enum value
+ */
+HG_UTIL_PUBLIC enum hg_log_level hg_log_name_to_level(const char *log_level);
+
+/**
+ * Set the logging function.
+ *
+ * \param log_func [IN] pointer to function
+ */
+HG_UTIL_PUBLIC void hg_log_set_func(int (*log_func)(FILE *stream, const char *format, ...));
+
+/**
+ * Set the stream for error output.
+ *
+ * \param stream [IN/OUT] pointer to stream
+ */
+HG_UTIL_PUBLIC void hg_log_set_stream_error(FILE *stream);
+
+/**
+ * Get the stream for error output.
+ *
+ * \return pointer to stream
+ */
+HG_UTIL_PUBLIC FILE *hg_log_get_stream_error(void);
+
+/**
+ * Set the stream for warning output.
+ *
+ * \param stream [IN/OUT] pointer to stream
+ */
+HG_UTIL_PUBLIC void hg_log_set_stream_warning(FILE *stream);
+
+/**
+ * Get the stream for warning output.
+ *
+ * \return pointer to stream
+ */
+HG_UTIL_PUBLIC FILE *hg_log_get_stream_warning(void);
+
+/**
+ * Set the stream for debug output.
+ *
+ * \param stream [IN/OUT] pointer to stream
+ */
+HG_UTIL_PUBLIC void hg_log_set_stream_debug(FILE *stream);
+
+/**
+ * Get the stream for debug output.
+ *
+ * \return pointer to stream
+ */
+HG_UTIL_PUBLIC FILE *hg_log_get_stream_debug(void);
+
+/**
+ * Register log outlet.
+ *
+ * \param outlet [IN] log outlet
+ */
+HG_UTIL_PUBLIC void hg_log_outlet_register(struct hg_log_outlet *outlet);
+
+/**
+ * Write log.
+ *
+ * \param outlet [IN]    log outlet
+ * \param log_level [IN] log level
+ * \param file [IN]      file name
+ * \param line [IN]      line number
+ * \param func [IN]      function name
+ * \param format [IN]    string format
+ */
+HG_UTIL_PUBLIC void hg_log_write(struct hg_log_outlet *outlet, enum hg_log_level log_level, const char *file,
+                                 unsigned int line, const char *func, const char *format, ...)
+    HG_UTIL_PRINTF_LIKE(6, 7);
+
+/*********************/
+/* Public Variables */
+/*********************/
+
+/* Top error outlet */
+extern HG_UTIL_PUBLIC HG_LOG_OUTLET_DECL(hg);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* MERCURY_LOG_H */
diff --git a/src/mercury/src/util/mercury_mem.c b/src/mercury/src/util/mercury_mem.c
new file mode 100644
index 00000000000..ae57cdf64d8
--- /dev/null
+++ b/src/mercury/src/util/mercury_mem.c
@@ -0,0 +1,191 @@
+/*
+ * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy,
+ * UChicago Argonne, LLC and The HDF Group.
+ * All rights reserved.
+ *
+ * The full copyright notice, including terms governing use, modification,
+ * and redistribution, is contained in the COPYING file that can be
+ * found at the root of the source code distribution tree.
+ */
+
+#include "mercury_mem.h"
+
+#include "mercury_util_error.h"
+
+#ifdef _WIN32
+#include <windows.h>
+#else
+#include <errno.h>
+#include <fcntl.h> /* For O_* constants */
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/stat.h> /* For mode constants */
+#include <sys/types.h>
+#include <unistd.h>
+#endif
+#include <stdlib.h>
+
+/*---------------------------------------------------------------------------*/
+long
+hg_mem_get_page_size(void)
+{
+    static long page_size = 0;
+
+    if (page_size == 0) {
+#ifdef _WIN32
+        SYSTEM_INFO system_info;
+        GetSystemInfo(&system_info);
+        page_size = system_info.dwPageSize;
+#else
+        page_size = sysconf(_SC_PAGE_SIZE);
+#endif
+    }
+
+    return page_size;
+}
+
+/*---------------------------------------------------------------------------*/
+void *
+hg_mem_aligned_alloc(size_t alignment, size_t size)
+{
+    void *mem_ptr = NULL;
+
+#ifdef _WIN32
+    mem_ptr = _aligned_malloc(size, alignment);
+#else
+#ifdef _ISOC11_SOURCE
+    mem_ptr = aligned_alloc(alignment, size);
+#else
+    int rc = posix_memalign(&mem_ptr, alignment, size);
+    if (rc != 0)
+        return NULL;
+#endif
+#endif
+
+    return mem_ptr;
+}
+
+/*---------------------------------------------------------------------------*/
+void
+hg_mem_aligned_free(void *mem_ptr)
+{
+#ifdef _WIN32
+    _aligned_free(mem_ptr);
+#else
+    free(mem_ptr);
+#endif
+}
+
+/*---------------------------------------------------------------------------*/
+void *
+hg_mem_header_alloc(size_t header_size, size_t alignment, size_t size)
+{
+    const size_t pad =
+        (alignment == 0 || header_size % alignment == 0) ? 0 : alignment - header_size % alignment;
+
+    return (char *)malloc(header_size + pad + size) + header_size + pad;
+}
+
+/*---------------------------------------------------------------------------*/
+void
+hg_mem_header_free(size_t header_size, size_t alignment, void *mem_ptr)
+{
+    const size_t pad =
+        (alignment == 0 || header_size % alignment == 0) ?
0 : alignment - header_size % alignment; + + free((char *)mem_ptr - header_size - pad); +} + +/*---------------------------------------------------------------------------*/ +void * +hg_mem_shm_map(const char *name, size_t size, hg_util_bool_t create) +{ + void *mem_ptr = NULL; +#ifdef _WIN32 + HANDLE fd = INVALID_HANDLE_VALUE; + LARGE_INTEGER large = {.QuadPart = size}; + DWORD access = FILE_MAP_READ | FILE_MAP_WRITE; + BOOL rc; + + if (create) { + fd = CreateFileMappingA(INVALID_HANDLE_VALUE, 0, PAGE_READWRITE, large.HighPart, large.LowPart, name); + HG_UTIL_CHECK_ERROR_NORET(!fd, error, "CreateFileMappingA() failed"); + } + else { + fd = OpenFileMappingA(access, FALSE, name); + HG_UTIL_CHECK_ERROR_NORET(!fd, error, "OpenFileMappingA() failed"); + } + + mem_ptr = MapViewOfFile(fd, access, 0, 0, size); + HG_UTIL_CHECK_ERROR_NORET(!mem_ptr, error, "MapViewOfFile() failed"); + + /* The handle can be closed without affecting the memory mapping */ + rc = CloseHandle(fd); + HG_UTIL_CHECK_ERROR_NORET(!rc, error, "CloseHandle() failed"); +#else + int fd = 0; + int flags = O_RDWR | (create ? O_CREAT : 0); + struct stat shm_stat; + int rc; + + fd = shm_open(name, flags, S_IRUSR | S_IWUSR); + HG_UTIL_CHECK_ERROR_NORET(fd < 0, error, "shm_open() failed (%s)", strerror(errno)); + + rc = fstat(fd, &shm_stat); + HG_UTIL_CHECK_ERROR_NORET(rc != 0, error, "fstat() failed (%s)", strerror(errno)); + + if (shm_stat.st_size == 0) { + rc = ftruncate(fd, (off_t)size); + HG_UTIL_CHECK_ERROR_NORET(rc != 0, error, "ftruncate() failed (%s)", strerror(errno)); + } + else + HG_UTIL_CHECK_ERROR_NORET(shm_stat.st_size < (off_t)size, error, "shm file size too small"); + + mem_ptr = mmap(NULL, size, PROT_WRITE | PROT_READ, MAP_SHARED, fd, 0); + HG_UTIL_CHECK_ERROR_NORET(mem_ptr == MAP_FAILED, error, "mmap() failed (%s)", strerror(errno)); + + /* The file descriptor can be closed without affecting the memory mapping */ + rc = close(fd); + HG_UTIL_CHECK_ERROR_NORET(rc != 0, error, "close() failed (%s)", strerror(errno)); +#endif + + return mem_ptr; + +error: +#ifdef _WIN32 + if (fd) + CloseHandle(fd); +#else + if (fd > 0) + close(fd); +#endif + + return NULL; +} + +/*---------------------------------------------------------------------------*/ +int +hg_mem_shm_unmap(const char *name, void *mem_ptr, size_t size) +{ + int ret = HG_UTIL_SUCCESS; + +#ifdef _WIN32 + if (mem_ptr) { + BOOL rc = UnmapViewOfFile(mem_ptr); + HG_UTIL_CHECK_ERROR(!rc, done, ret, HG_UTIL_FAIL, "UnmapViewOfFile() failed"); + } +#else + if (mem_ptr && mem_ptr != MAP_FAILED) { + int rc = munmap(mem_ptr, size); + HG_UTIL_CHECK_ERROR(rc != 0, done, ret, HG_UTIL_FAIL, "munmap() failed (%s)", strerror(errno)); + } + + if (name) { + int rc = shm_unlink(name); + HG_UTIL_CHECK_ERROR(rc != 0, done, ret, HG_UTIL_FAIL, "shm_unlink() failed (%s)", strerror(errno)); + } +#endif + +done: + return ret; +} diff --git a/src/mercury/src/util/mercury_mem.h b/src/mercury/src/util/mercury_mem.h new file mode 100644 index 00000000000..3c15c01f90d --- /dev/null +++ b/src/mercury/src/util/mercury_mem.h @@ -0,0 +1,114 @@ +/* + * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy, + * UChicago Argonne, LLC and The HDF Group. + * All rights reserved. + * + * The full copyright notice, including terms governing use, modification, + * and redistribution, is contained in the COPYING file that can be + * found at the root of the source code distribution tree. 
+ */
+
+#ifndef MERCURY_MEM_H
+#define MERCURY_MEM_H
+
+#include "mercury_util_config.h"
+
+/*************************************/
+/* Public Type and Struct Definition */
+/*************************************/
+
+/*****************/
+/* Public Macros */
+/*****************/
+
+#define HG_MEM_CACHE_LINE_SIZE 64
+#define HG_MEM_PAGE_SIZE       4096
+
+/*********************/
+/* Public Prototypes */
+/*********************/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Get system default page size.
+ *
+ * \return page size on success or negative on failure
+ */
+HG_UTIL_PUBLIC long hg_mem_get_page_size(void);
+
+/**
+ * Allocate size bytes and return a pointer to the allocated memory.
+ * The memory address will be a multiple of alignment, which must be a power of
+ * two, and size should be a multiple of alignment.
+ *
+ * \param alignment [IN] alignment size
+ * \param size [IN] total requested size
+ *
+ * \return a pointer to the allocated memory, or NULL in case of failure
+ */
+HG_UTIL_PUBLIC void *hg_mem_aligned_alloc(size_t alignment, size_t size);
+
+/**
+ * Free memory allocated from hg_mem_aligned_alloc().
+ *
+ * \param mem_ptr [IN] pointer to allocated memory
+ */
+HG_UTIL_PUBLIC void hg_mem_aligned_free(void *mem_ptr);
+
+/**
+ * Allocate a buffer with a `size`-bytes, `alignment`-aligned payload
+ * preceded by a `header_size` header, padding the allocation with up
+ * to `alignment - 1` bytes to ensure that the payload is properly aligned.
+ *
+ * If `alignment` is 0, do not try to align the payload. It's ok if
+ * `size` is 0; however, behavior is undefined if both `header_size`
+ * and `size` are 0.
+ *
+ * \param header_size [IN] size of header
+ * \param alignment [IN] alignment size
+ * \param size [IN] requested payload size
+ *
+ * \return a pointer to the payload or NULL on failure
+ */
+HG_UTIL_PUBLIC void *hg_mem_header_alloc(size_t header_size, size_t alignment, size_t size);
+
+/**
+ * Free the memory that was returned previously by a call to
+ * `hg_mem_header_alloc()`.
+ *
+ * \param header_size [IN] size of header
+ * \param alignment [IN] alignment size
+ * \param mem_ptr [IN] memory pointer
+ */
+HG_UTIL_PUBLIC void hg_mem_header_free(size_t header_size, size_t alignment, void *mem_ptr);
+
+/**
+ * Create/open a shared-memory mapped file of size \size with name \name.
+ *
+ * \param name [IN] name of mapped file
+ * \param size [IN] total requested size
+ * \param create [IN] create file if not existing
+ *
+ * \return a pointer to the mapped memory region, or NULL in case of failure
+ */
+HG_UTIL_PUBLIC void *hg_mem_shm_map(const char *name, size_t size, hg_util_bool_t create);
+
+/**
+ * Unmap a previously mapped region and close the file.
+ *
+ * \param name [IN] name of mapped file
+ * \param mem_ptr [IN] pointer to mapped memory region
+ * \param size [IN] size range of the mapped region
+ *
+ * \return non-negative on success, or negative in case of failure
+ */
+HG_UTIL_PUBLIC int hg_mem_shm_unmap(const char *name, void *mem_ptr, size_t size);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* MERCURY_MEM_H */
diff --git a/src/mercury/src/util/mercury_mem_pool.c b/src/mercury/src/util/mercury_mem_pool.c
new file mode 100644
index 00000000000..d97b438be34
--- /dev/null
+++ b/src/mercury/src/util/mercury_mem_pool.c
@@ -0,0 +1,337 @@
+/*
+ * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy,
+ * UChicago Argonne, LLC and The HDF Group.
+ * All rights reserved.
+ *
+ * The full copyright notice, including terms governing use, modification,
+ * and redistribution, is contained in the COPYING file that can be
+ * found at the root of the source code distribution tree.
+ */
+
+#include "mercury_mem_pool.h"
+
+#include "mercury_mem.h"
+#include "mercury_queue.h"
+#include "mercury_thread_condition.h"
+#include "mercury_thread_mutex.h"
+#include "mercury_thread_spin.h"
+#include "mercury_util_error.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+/****************/
+/* Local Macros */
+/****************/
+
+/**
+ * container_of - cast a member of a structure out to the containing structure
+ * \ptr: the pointer to the member.
+ * \type: the type of the container struct this is embedded in.
+ * \member: the name of the member within the struct.
+ *
+ */
+#if !defined(container_of)
+#define container_of(ptr, type, member) ((type *)((char *)ptr - offsetof(type, member)))
+#endif
+
+/************************************/
+/* Local Type and Struct Definition */
+/************************************/
+
+/**
+ * Memory chunk (points to actual data).
+ */
+struct hg_mem_pool_chunk {
+    HG_QUEUE_ENTRY(hg_mem_pool_chunk) entry; /* Entry in chunk_list */
+    char *chunk;                             /* Must be last */
+};
+
+/**
+ * Memory block. Each block has a fixed chunk size, the underlying memory
+ * buffer is registered.
+ */
+struct hg_mem_pool_block {
+    HG_QUEUE_HEAD(hg_mem_pool_chunk) chunks; /* Chunk list */
+    HG_QUEUE_ENTRY(hg_mem_pool_block) entry; /* Entry in block list */
+    void *           mr_handle;              /* Pointer to MR handle */
+    hg_thread_spin_t chunk_lock;             /* Chunk list lock */
+};
+
+/**
+ * Memory pool. A pool is composed of multiple blocks.
+ */
+struct hg_mem_pool {
+    hg_thread_mutex_t extend_mutex;                /* Extend mutex */
+    hg_thread_cond_t  extend_cond;                 /* Extend cond */
+    HG_QUEUE_HEAD(hg_mem_pool_block) blocks;       /* Block list */
+    hg_mem_pool_register_func_t   register_func;   /* Register func */
+    hg_mem_pool_deregister_func_t deregister_func; /* Deregister func */
+    void *           arg;                          /* Func args */
+    size_t           chunk_size;                   /* Chunk size */
+    size_t           chunk_count;                  /* Chunk count */
+    int              extending;                    /* Extending pool */
+    hg_thread_spin_t block_lock;                   /* Block list lock */
+};
+
+/********************/
+/* Local Prototypes */
+/********************/
+
+/* Allocate new pool block */
+static struct hg_mem_pool_block *hg_mem_pool_block_alloc(size_t chunk_size, size_t chunk_count,
+                                                         hg_mem_pool_register_func_t register_func,
+                                                         void *arg);
+
+/* Free pool block */
+static void hg_mem_pool_block_free(struct hg_mem_pool_block *    hg_mem_pool_block,
+                                   hg_mem_pool_deregister_func_t deregister_func, void *arg);
+
+/*******************/
+/* Local Variables */
+/*******************/
+
+/*---------------------------------------------------------------------------*/
+
+struct hg_mem_pool *
+hg_mem_pool_create(size_t chunk_size, size_t chunk_count, size_t block_count,
+                   hg_mem_pool_register_func_t register_func, hg_mem_pool_deregister_func_t deregister_func,
+                   void *arg)
+{
+    struct hg_mem_pool *hg_mem_pool = NULL;
+    size_t              i;
+
+    hg_mem_pool = (struct hg_mem_pool *)malloc(sizeof(struct hg_mem_pool));
+    HG_UTIL_CHECK_ERROR_NORET(hg_mem_pool == NULL, done, "Could not allocate memory pool");
+    HG_QUEUE_INIT(&hg_mem_pool->blocks);
+    hg_mem_pool->register_func   = register_func;
+    hg_mem_pool->deregister_func = deregister_func;
+    hg_mem_pool->arg             = arg;
+    hg_mem_pool->chunk_size      = chunk_size;
+    hg_mem_pool->chunk_count     = chunk_count;
+    hg_thread_mutex_init(&hg_mem_pool->extend_mutex);
+    hg_thread_cond_init(&hg_mem_pool->extend_cond);
+    hg_thread_spin_init(&hg_mem_pool->block_lock);
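+    /* The extend mutex/cond pair serializes pool extension in
+     * hg_mem_pool_alloc(): a single thread grows the pool while other
+     * allocating threads wait, instead of each one registering a new block. */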
+    hg_mem_pool->extending = 0;
+
+    /* Allocate initial set of blocks */
+    for (i = 0; i < block_count; i++) {
+        struct hg_mem_pool_block *hg_mem_pool_block =
+            hg_mem_pool_block_alloc(chunk_size, chunk_count, register_func, arg);
+        HG_UTIL_CHECK_ERROR_NORET(hg_mem_pool_block == NULL, error, "Could not allocate block of %zu bytes",
+                                  chunk_size * chunk_count);
+        HG_QUEUE_PUSH_TAIL(&hg_mem_pool->blocks, hg_mem_pool_block, entry);
+    }
+
+done:
+    return hg_mem_pool;
+
+error:
+    hg_mem_pool_destroy(hg_mem_pool);
+    return NULL;
+}
+
+/*---------------------------------------------------------------------------*/
+void
+hg_mem_pool_destroy(struct hg_mem_pool *hg_mem_pool)
+{
+    if (!hg_mem_pool)
+        return;
+
+    while (!HG_QUEUE_IS_EMPTY(&hg_mem_pool->blocks)) {
+        struct hg_mem_pool_block *hg_mem_pool_block = HG_QUEUE_FIRST(&hg_mem_pool->blocks);
+        HG_QUEUE_POP_HEAD(&hg_mem_pool->blocks, entry);
+        hg_mem_pool_block_free(hg_mem_pool_block, hg_mem_pool->deregister_func, hg_mem_pool->arg);
+    }
+    hg_thread_mutex_destroy(&hg_mem_pool->extend_mutex);
+    hg_thread_cond_destroy(&hg_mem_pool->extend_cond);
+    hg_thread_spin_destroy(&hg_mem_pool->block_lock);
+    free(hg_mem_pool);
+}
+
+/*---------------------------------------------------------------------------*/
+static struct hg_mem_pool_block *
+hg_mem_pool_block_alloc(size_t chunk_size, size_t chunk_count, hg_mem_pool_register_func_t register_func,
+                        void *arg)
+{
+    struct hg_mem_pool_block *hg_mem_pool_block = NULL;
+    size_t                    page_size = (size_t)hg_mem_get_page_size();
+    void *                    mem_ptr = NULL, *mr_handle = NULL;
+    size_t                    block_size, i;
+    size_t                    block_header = sizeof(struct hg_mem_pool_block);
+    size_t                    chunk_header = offsetof(struct hg_mem_pool_chunk, chunk);
+
+    /* Size of block struct + number of chunks x (chunk_size + size of entry) */
+    block_size = block_header + chunk_count * (chunk_header + chunk_size);
+
+    /* Allocate backend buffer */
+    mem_ptr = hg_mem_aligned_alloc(page_size, block_size);
+    HG_UTIL_CHECK_ERROR_NORET(mem_ptr == NULL, done, "Could not allocate %zu bytes", block_size);
+    memset(mem_ptr, 0, block_size);
+
+    /* Register memory if registration function is provided */
+    if (register_func) {
+        int rc = register_func(mem_ptr, block_size, &mr_handle, arg);
+        if (unlikely(rc != HG_UTIL_SUCCESS)) {
+            hg_mem_aligned_free(mem_ptr);
+            HG_UTIL_GOTO_ERROR(done, mem_ptr, NULL, "register_func() failed");
+        }
+    }
+
+    /* Map allocated memory to block */
+    hg_mem_pool_block = (struct hg_mem_pool_block *)mem_ptr;
+
+    HG_QUEUE_INIT(&hg_mem_pool_block->chunks);
+    hg_thread_spin_init(&hg_mem_pool_block->chunk_lock);
+    hg_mem_pool_block->mr_handle = mr_handle;
+
+    /* Assign chunks and insert them to free list */
+    for (i = 0; i < chunk_count; i++) {
+        struct hg_mem_pool_chunk *hg_mem_pool_chunk =
+            (struct hg_mem_pool_chunk *)((char *)hg_mem_pool_block + block_header +
+                                         i * (chunk_header + chunk_size));
+        HG_QUEUE_PUSH_TAIL(&hg_mem_pool_block->chunks, hg_mem_pool_chunk, entry);
+    }
+
+done:
+    return hg_mem_pool_block;
+}
+
+/*---------------------------------------------------------------------------*/
+static void
+hg_mem_pool_block_free(struct hg_mem_pool_block *    hg_mem_pool_block,
+                       hg_mem_pool_deregister_func_t deregister_func, void *arg)
+{
+    if (!hg_mem_pool_block)
+        return;
+
+    /* Release MR handle if there was any */
+    if (hg_mem_pool_block->mr_handle && deregister_func) {
+        int rc = deregister_func(hg_mem_pool_block->mr_handle, arg);
+        HG_UTIL_CHECK_ERROR_NORET(rc != HG_UTIL_SUCCESS, done, "deregister_func() 
failed"); + } + +done: + hg_thread_spin_destroy(&hg_mem_pool_block->chunk_lock); + hg_mem_aligned_free((void *)hg_mem_pool_block); + return; +} + +/*---------------------------------------------------------------------------*/ +void * +hg_mem_pool_alloc(struct hg_mem_pool *hg_mem_pool, size_t size, void **mr_handle) +{ + struct hg_mem_pool_block *hg_mem_pool_block; + struct hg_mem_pool_chunk *hg_mem_pool_chunk = NULL; + void * mem_ptr = NULL; + + HG_UTIL_CHECK_ERROR(size > hg_mem_pool->chunk_size, done, mem_ptr, NULL, + "Chunk size is too small for requested size"); + HG_UTIL_CHECK_ERROR(!mr_handle && hg_mem_pool->register_func, done, mem_ptr, NULL, "MR handle is NULL"); + + do { + int found = 0; + + /* Check whether we can get a block from one of the pools */ + hg_thread_spin_lock(&hg_mem_pool->block_lock); + HG_QUEUE_FOREACH(hg_mem_pool_block, &hg_mem_pool->blocks, entry) + { + hg_thread_spin_lock(&hg_mem_pool_block->chunk_lock); + found = !HG_QUEUE_IS_EMPTY(&hg_mem_pool_block->chunks); + hg_thread_spin_unlock(&hg_mem_pool_block->chunk_lock); + if (found) + break; + } + hg_thread_spin_unlock(&hg_mem_pool->block_lock); + + /* If not, allocate and register a new pool */ + if (!found) { + /* Let other threads sleep while the pool is being extended */ + hg_thread_mutex_lock(&hg_mem_pool->extend_mutex); + if (hg_mem_pool->extending) { + hg_thread_cond_wait(&hg_mem_pool->extend_cond, &hg_mem_pool->extend_mutex); + hg_thread_mutex_unlock(&hg_mem_pool->extend_mutex); + continue; + } + hg_mem_pool->extending = 1; + hg_thread_mutex_unlock(&hg_mem_pool->extend_mutex); + + hg_mem_pool_block = hg_mem_pool_block_alloc(hg_mem_pool->chunk_size, hg_mem_pool->chunk_count, + hg_mem_pool->register_func, hg_mem_pool->arg); + HG_UTIL_CHECK_ERROR(hg_mem_pool_block == NULL, done, mem_ptr, NULL, + "Could not allocate block of %zu bytes", + hg_mem_pool->chunk_size * hg_mem_pool->chunk_count); + + hg_thread_spin_lock(&hg_mem_pool->block_lock); + HG_QUEUE_PUSH_TAIL(&hg_mem_pool->blocks, hg_mem_pool_block, entry); + hg_thread_spin_unlock(&hg_mem_pool->block_lock); + + hg_thread_mutex_lock(&hg_mem_pool->extend_mutex); + hg_mem_pool->extending = 0; + hg_thread_cond_broadcast(&hg_mem_pool->extend_cond); + hg_thread_mutex_unlock(&hg_mem_pool->extend_mutex); + } + + /* Try to pick a node from one of the available pools */ + hg_thread_spin_lock(&hg_mem_pool_block->chunk_lock); + if (!HG_QUEUE_IS_EMPTY(&hg_mem_pool_block->chunks)) { + hg_mem_pool_chunk = HG_QUEUE_FIRST(&hg_mem_pool_block->chunks); + HG_QUEUE_POP_HEAD(&hg_mem_pool_block->chunks, entry); + } + hg_thread_spin_unlock(&hg_mem_pool_block->chunk_lock); + } while (!hg_mem_pool_chunk); + + mem_ptr = &hg_mem_pool_chunk->chunk; + if (mr_handle && hg_mem_pool_block) + *mr_handle = hg_mem_pool_block->mr_handle; + +done: + return mem_ptr; +} + +/*---------------------------------------------------------------------------*/ +void +hg_mem_pool_free(struct hg_mem_pool *hg_mem_pool, void *mem_ptr, void *mr_handle) +{ + struct hg_mem_pool_block *hg_mem_pool_block; + int found = 0; + + if (!mem_ptr) + return; + + /* Put the node back to the pool */ + hg_thread_spin_lock(&hg_mem_pool->block_lock); + HG_QUEUE_FOREACH(hg_mem_pool_block, &hg_mem_pool->blocks, entry) + { + /* If MR handle is NULL, it does not really matter which pool we push + * the node back to. 
+         */
+        if (hg_mem_pool_block->mr_handle == mr_handle) {
+            struct hg_mem_pool_chunk *hg_mem_pool_chunk =
+                container_of(mem_ptr, struct hg_mem_pool_chunk, chunk);
+            hg_thread_spin_lock(&hg_mem_pool_block->chunk_lock);
+            HG_QUEUE_PUSH_TAIL(&hg_mem_pool_block->chunks, hg_mem_pool_chunk, entry);
+            hg_thread_spin_unlock(&hg_mem_pool_block->chunk_lock);
+            found = 1;
+            break;
+        }
+    }
+    hg_thread_spin_unlock(&hg_mem_pool->block_lock);
+
+    HG_UTIL_CHECK_WARNING(found != 1, "Memory block was not found");
+}
+
+/*---------------------------------------------------------------------------*/
+size_t
+hg_mem_pool_chunk_offset(struct hg_mem_pool *hg_mem_pool, void *mem_ptr, void *mr_handle)
+{
+    struct hg_mem_pool_block *hg_mem_pool_block;
+
+    hg_thread_spin_lock(&hg_mem_pool->block_lock);
+    HG_QUEUE_FOREACH(hg_mem_pool_block, &hg_mem_pool->blocks, entry)
+        if (hg_mem_pool_block->mr_handle == mr_handle)
+            break;
+    hg_thread_spin_unlock(&hg_mem_pool->block_lock);
+
+    return (size_t)((char *)mem_ptr - (char *)hg_mem_pool_block);
+}
diff --git a/src/mercury/src/util/mercury_mem_pool.h b/src/mercury/src/util/mercury_mem_pool.h
new file mode 100644
index 00000000000..d2acfdd6e7f
--- /dev/null
+++ b/src/mercury/src/util/mercury_mem_pool.h
@@ -0,0 +1,121 @@
+/*
+ * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy,
+ * UChicago Argonne, LLC and The HDF Group.
+ * All rights reserved.
+ *
+ * The full copyright notice, including terms governing use, modification,
+ * and redistribution, is contained in the COPYING file that can be
+ * found at the root of the source code distribution tree.
+ */
+
+#ifndef MERCURY_MEM_POOL_H
+#define MERCURY_MEM_POOL_H
+
+#include "mercury_util_config.h"
+
+/*************************************/
+/* Public Type and Struct Definition */
+/*************************************/
+
+/**
+ * Register memory block.
+ *
+ * \param buf [IN] pointer to buffer
+ * \param size [IN] buffer size
+ * \param handle [OUT] handle
+ * \param arg [IN/OUT] optional arguments
+ *
+ * \return HG_UTIL_SUCCESS if successful / error code otherwise
+ */
+typedef int (*hg_mem_pool_register_func_t)(const void *buf, size_t size, void **handle, void *arg);
+
+/**
+ * Deregister memory block.
+ *
+ * \param handle [IN/OUT] handle
+ * \param arg [IN/OUT] optional arguments
+ *
+ * \return HG_UTIL_SUCCESS if successful / error code otherwise
+ */
+typedef int (*hg_mem_pool_deregister_func_t)(void *handle, void *arg);
+
+/*****************/
+/* Public Macros */
+/*****************/
+
+/*********************/
+/* Public Prototypes */
+/*********************/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Create a memory pool with \block_count of size \chunk_count x \chunk_size
+ * bytes. Optionally register and deregister memory for each block using
+ * \register_func and \deregister_func respectively.
+ *
+ * \param chunk_size [IN] size of chunks
+ * \param chunk_count [IN] number of chunks
+ * \param block_count [IN] number of blocks
+ * \param register_func [IN] pointer to register function
+ * \param deregister_func [IN] pointer to deregister function
+ * \param arg [IN/OUT] optional arguments passed to register functions
+ *
+ * \return Pointer to memory pool or NULL in case of failure
+ */
+HG_UTIL_PUBLIC struct hg_mem_pool *hg_mem_pool_create(size_t chunk_size, size_t chunk_count,
+                                                      size_t block_count,
+                                                      hg_mem_pool_register_func_t   register_func,
+                                                      hg_mem_pool_deregister_func_t deregister_func,
+                                                      void * arg);
+
+/**
+ * Destroy a memory pool.
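+ * All blocks are freed and, if a deregister function was provided,
+ * deregistered.
+ *
+ * Illustrative lifecycle (reg_cb, dereg_cb and ctx are hypothetical
+ * caller-supplied registration callbacks and context, not part of this API):
+ *
+ *   struct hg_mem_pool *pool =
+ *       hg_mem_pool_create(4096, 64, 1, reg_cb, dereg_cb, ctx);
+ *   void *mr_handle = NULL;
+ *   void *buf = hg_mem_pool_alloc(pool, 4096, &mr_handle);
+ *   ... use buf ...
+ *   hg_mem_pool_free(pool, buf, mr_handle);
+ *   hg_mem_pool_destroy(pool);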
+ *
+ * \param hg_mem_pool [IN/OUT] pointer to memory pool
+ *
+ */
+HG_UTIL_PUBLIC void hg_mem_pool_destroy(struct hg_mem_pool *hg_mem_pool);
+
+/**
+ * Allocate \size bytes and optionally return a memory handle
+ * \mr_handle if registration functions were provided.
+ *
+ * \param hg_mem_pool [IN/OUT] pointer to memory pool
+ * \param size [IN] requested size
+ * \param mr_handle [OUT] pointer to memory handle
+ *
+ * \return pointer to memory block
+ */
+HG_UTIL_PUBLIC void *hg_mem_pool_alloc(struct hg_mem_pool *hg_mem_pool, size_t size, void **mr_handle);
+
+/**
+ * Release memory at address \mem_ptr.
+ *
+ * \param hg_mem_pool [IN/OUT] pointer to memory pool
+ * \param mem_ptr [IN] pointer to memory
+ * \param mr_handle [IN] pointer to memory handle
+ *
+ */
+HG_UTIL_PUBLIC void hg_mem_pool_free(struct hg_mem_pool *hg_mem_pool, void *mem_ptr, void *mr_handle);
+
+/**
+ * Retrieve chunk offset relative to the address used for registering
+ * the memory block it belongs to.
+ *
+ * \param hg_mem_pool [IN/OUT] pointer to memory pool
+ * \param mem_ptr [IN] pointer to memory
+ * \param mr_handle [IN] pointer to memory handle
+ *
+ * \return offset within registered block.
+ */
+HG_UTIL_PUBLIC size_t hg_mem_pool_chunk_offset(struct hg_mem_pool *hg_mem_pool, void *mem_ptr,
+                                               void *mr_handle);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* MERCURY_MEM_POOL_H */
diff --git a/src/mercury/src/util/mercury_poll.c b/src/mercury/src/util/mercury_poll.c
new file mode 100644
index 00000000000..eb54a825f25
--- /dev/null
+++ b/src/mercury/src/util/mercury_poll.c
@@ -0,0 +1,493 @@
+/*
+ * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy,
+ * UChicago Argonne, LLC and The HDF Group.
+ * All rights reserved.
+ *
+ * The full copyright notice, including terms governing use, modification,
+ * and redistribution, is contained in the COPYING file that can be
+ * found at the root of the source code distribution tree.
+ */
+
+#include "mercury_poll.h"
+#include "mercury_event.h"
+#include "mercury_thread_mutex.h"
+#include "mercury_util_error.h"
+
+#include <stdlib.h>
+
+#if defined(_WIN32)
+/* TODO */
+#else
+#include <errno.h>
+#include <string.h>
+#include <unistd.h>
+#if defined(HG_UTIL_HAS_SYSEPOLL_H)
+#include <sys/epoll.h>
+#elif defined(HG_UTIL_HAS_SYSEVENT_H)
+#include <sys/event.h>
+#include <sys/time.h>
+#else
+#include <poll.h>
+#endif
+#endif /* defined(_WIN32) */
+
+/****************/
+/* Local Macros */
+/****************/
+
+#define HG_POLL_INIT_NEVENTS 32
+#define HG_POLL_MAX_EVENTS   4096
+
+#ifndef MIN
+#define MIN(a, b) (((a) < (b)) ? (a) : (b))
+#endif
+
+/************************************/
+/* Local Type and Struct Definition */
+/************************************/
+
+struct hg_poll_set {
+    hg_thread_mutex_t lock;
+#if defined(HG_UTIL_HAS_SYSEPOLL_H)
+    struct epoll_event *events;
+#elif defined(HG_UTIL_HAS_SYSEVENT_H)
+    struct kevent *events;
+#else
+    struct pollfd * events;
+    hg_poll_data_t *event_data;
+#endif
+    unsigned int max_events;
+    unsigned int nfds;
+    int          fd;
+};
+
+/********************/
+/* Local Prototypes */
+/********************/
+
+/*******************/
+/* Local Variables */
+/*******************/
+
+/*---------------------------------------------------------------------------*/
+hg_poll_set_t *
+hg_poll_create(void)
+{
+    struct hg_poll_set *hg_poll_set = NULL;
+
+    hg_poll_set = malloc(sizeof(struct hg_poll_set));
+    HG_UTIL_CHECK_ERROR_NORET(hg_poll_set == NULL, error, "malloc() failed (%s)", strerror(errno));
+
+    hg_thread_mutex_init(&hg_poll_set->lock);
+    hg_poll_set->nfds = 0;
+    hg_poll_set->max_events = HG_POLL_INIT_NEVENTS;
+
+    /* Preallocate events, size will grow as needed */
+    hg_poll_set->events = malloc(sizeof(*hg_poll_set->events) * hg_poll_set->max_events);
+    HG_UTIL_CHECK_ERROR_NORET(!hg_poll_set->events, error, "malloc() failed (%s)", strerror(errno));
+
+#if defined(_WIN32)
+    /* TODO */
+#elif defined(HG_UTIL_HAS_SYSEPOLL_H)
+    hg_poll_set->fd = epoll_create1(0);
+    HG_UTIL_CHECK_ERROR_NORET(hg_poll_set->fd == -1, error, "epoll_create1() failed (%s)", strerror(errno));
+#elif defined(HG_UTIL_HAS_SYSEVENT_H)
+    hg_poll_set->fd = kqueue();
+    HG_UTIL_CHECK_ERROR_NORET(hg_poll_set->fd == -1, error, "kqueue() failed (%s)", strerror(errno));
+#else
+    hg_poll_set->fd = hg_event_create();
+    HG_UTIL_CHECK_ERROR_NORET(hg_poll_set->fd == -1, error, "hg_event_create() failed (%s)", strerror(errno));
+
+    /* Preallocate event_data, size will grow as needed */
+    hg_poll_set->event_data = malloc(sizeof(*hg_poll_set->event_data) * hg_poll_set->max_events);
+    HG_UTIL_CHECK_ERROR_NORET(!hg_poll_set->event_data, error, "malloc() failed (%s)", strerror(errno));
+#endif
+    HG_UTIL_LOG_DEBUG("Created new poll set, fd=%d", hg_poll_set->fd);
+
+    return hg_poll_set;
+
+error:
+    if (hg_poll_set) {
+        free(hg_poll_set->events);
+        hg_thread_mutex_destroy(&hg_poll_set->lock);
+        free(hg_poll_set);
+    }
+    return NULL;
+}
+
+/*---------------------------------------------------------------------------*/
+int
+hg_poll_destroy(hg_poll_set_t *poll_set)
+{
+    int ret = HG_UTIL_SUCCESS;
+    int rc;
+
+    if (!poll_set)
+        goto done;
+
+    HG_UTIL_CHECK_ERROR(poll_set->nfds > 0, done, ret, HG_UTIL_FAIL, "Poll set non empty");
+
+    HG_UTIL_LOG_DEBUG("Destroying poll set, fd=%d", poll_set->fd);
+
+#if defined(_WIN32)
+    /* TODO */
+#elif defined(HG_UTIL_HAS_SYSEPOLL_H) || defined(HG_UTIL_HAS_SYSEVENT_H)
+    /* Close poll descriptor */
+    rc = close(poll_set->fd);
+    HG_UTIL_CHECK_ERROR(rc == -1, done, ret, HG_UTIL_FAIL, "close() failed (%s)", strerror(errno));
+#else
+    rc = hg_event_destroy(poll_set->fd);
+    HG_UTIL_CHECK_ERROR(rc == HG_UTIL_FAIL, done, ret, HG_UTIL_FAIL, "hg_event_destroy() failed (%s)",
+                        strerror(errno));
+#endif
+
+    hg_thread_mutex_destroy(&poll_set->lock);
+#if !defined(_WIN32) && !defined(HG_UTIL_HAS_SYSEPOLL_H) && !defined(HG_UTIL_HAS_SYSEVENT_H)
+    free(poll_set->event_data);
+#endif
+    free(poll_set->events);
+    free(poll_set);
+
+done:
+    return ret;
+}
+
+/*---------------------------------------------------------------------------*/
+int
+hg_poll_get_fd(hg_poll_set_t *poll_set)
+{
+#if defined(_WIN32)
+    /* TODO */
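+    /* A Windows implementation would need to expose a waitable object
+     * comparable to the epoll/kqueue/event fd returned below. */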
return -1; +#else + return poll_set->fd; +#endif +} + +/*---------------------------------------------------------------------------*/ +int +hg_poll_add(hg_poll_set_t *poll_set, int fd, struct hg_poll_event *event) +{ +#if defined(_WIN32) + /* TODO */ +#elif defined(HG_UTIL_HAS_SYSEPOLL_H) + struct epoll_event ev; + uint32_t poll_flags = 0; + int rc; +#elif defined(HG_UTIL_HAS_SYSEVENT_H) + struct kevent ev; + struct timespec timeout = {0, 0}; + int16_t poll_flags = 0; + int rc; +#else + struct pollfd ev; + short int poll_flags = 0; +#endif + int ret = HG_UTIL_SUCCESS; + + HG_UTIL_LOG_DEBUG("Adding fd=%d to poll set (fd=%d)", fd, poll_set->fd); + +#if defined(_WIN32) + /* TODO */ +#elif defined(HG_UTIL_HAS_SYSEPOLL_H) + /* Translate flags */ + if (event->events & HG_POLLIN) + poll_flags |= EPOLLIN; + if (event->events & HG_POLLOUT) + poll_flags |= EPOLLOUT; + + ev.events = poll_flags; + ev.data.u64 = (uint64_t)event->data.u64; + + rc = epoll_ctl(poll_set->fd, EPOLL_CTL_ADD, fd, &ev); + HG_UTIL_CHECK_ERROR(rc != 0, done, ret, HG_UTIL_FAIL, "epoll_ctl() failed (%s)", strerror(errno)); +#elif defined(HG_UTIL_HAS_SYSEVENT_H) + /* Translate flags */ + if (event->events & HG_POLLIN) + poll_flags |= EVFILT_READ; + if (event->events & HG_POLLOUT) + poll_flags |= EVFILT_WRITE; + + EV_SET(&ev, (uintptr_t)fd, poll_flags, EV_ADD, 0, 0, event->data.ptr); + + rc = kevent(poll_set->fd, &ev, 1, NULL, 0, &timeout); + HG_UTIL_CHECK_ERROR(rc == -1, done, ret, HG_UTIL_FAIL, "kevent() failed (%s)", strerror(errno)); +#else + /* Translate flags */ + if (event->events & HG_POLLIN) + poll_flags |= POLLIN; + if (event->events & HG_POLLOUT) + poll_flags |= POLLOUT; + + ev.fd = fd; + ev.events = poll_flags; + ev.revents = 0; +#endif + + hg_thread_mutex_lock(&poll_set->lock); + +#if !defined(_WIN32) && !defined(HG_UTIL_HAS_SYSEPOLL_H) && !defined(HG_UTIL_HAS_SYSEVENT_H) + /* Grow array if reached max number */ + if (poll_set->nfds == poll_set->max_events) { + HG_UTIL_CHECK_ERROR(poll_set->max_events * 2 > HG_POLL_MAX_EVENTS, unlock, ret, HG_UTIL_FAIL, + "reached max number of events for this poll set (%d)", poll_set->max_events); + + poll_set->events = realloc(poll_set->events, sizeof(*poll_set->events) * poll_set->max_events * 2); + HG_UTIL_CHECK_ERROR(!poll_set->events, unlock, ret, HG_UTIL_FAIL, "realloc() failed (%s)", + strerror(errno)); + + poll_set->event_data = + realloc(poll_set->event_data, sizeof(*poll_set->event_data) * poll_set->max_events * 2); + HG_UTIL_CHECK_ERROR(!poll_set->event_data, unlock, ret, HG_UTIL_FAIL, "realloc() failed (%s)", + strerror(errno)); + + poll_set->max_events *= 2; + } + poll_set->events[poll_set->nfds] = ev; + poll_set->event_data[poll_set->nfds] = event->data; +#endif + poll_set->nfds++; + +#if !defined(_WIN32) && !defined(HG_UTIL_HAS_SYSEPOLL_H) && !defined(HG_UTIL_HAS_SYSEVENT_H) +unlock: +#endif + hg_thread_mutex_unlock(&poll_set->lock); + +done: + return ret; +} + +/*---------------------------------------------------------------------------*/ +int +hg_poll_remove(hg_poll_set_t *poll_set, int fd) +{ +#if defined(_WIN32) + /* TODO */ +#elif defined(HG_UTIL_HAS_SYSEPOLL_H) + int rc; +#elif defined(HG_UTIL_HAS_SYSEVENT_H) + struct kevent ev; + struct timespec timeout = {0, 0}; + int rc; +#else + int i, found = -1; +#endif + int ret = HG_UTIL_SUCCESS; + + HG_UTIL_LOG_DEBUG("Removing fd=%d from poll set (fd=%d)", fd, poll_set->fd); + +#if defined(_WIN32) + /* TODO */ +#elif defined(HG_UTIL_HAS_SYSEPOLL_H) + rc = epoll_ctl(poll_set->fd, EPOLL_CTL_DEL, fd, NULL); + 
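+    /* With epoll, EPOLL_CTL_DEL makes the kernel drop the fd from the set;
+     * only the locally tracked fd count still needs updating below. */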
HG_UTIL_CHECK_ERROR(rc != 0, done, ret, HG_UTIL_FAIL, "epoll_ctl() failed (%s)", strerror(errno)); + hg_thread_mutex_lock(&poll_set->lock); +#elif defined(HG_UTIL_HAS_SYSEVENT_H) + /* Events which are attached to file descriptors are automatically + * deleted on the last close of the descriptor. */ + EV_SET(&ev, (uintptr_t)fd, EVFILT_READ, EV_DELETE, 0, 0, NULL); + rc = kevent(poll_set->fd, &ev, 1, NULL, 0, &timeout); + HG_UTIL_CHECK_ERROR(rc == -1, done, ret, HG_UTIL_FAIL, "kevent() failed (%s)", strerror(errno)); + hg_thread_mutex_lock(&poll_set->lock); +#else + hg_thread_mutex_lock(&poll_set->lock); + for (i = 0; i < (int)poll_set->nfds; i++) { + if (poll_set->events[i].fd == fd) { + found = i; + break; + } + } + HG_UTIL_CHECK_ERROR(found < 0, error, ret, HG_UTIL_FAIL, "Could not find fd in poll_set"); + + for (i = found; i < (int)poll_set->nfds - 1; i++) { + poll_set->events[i] = poll_set->events[i + 1]; + poll_set->event_data[i] = poll_set->event_data[i + 1]; + } +#endif + poll_set->nfds--; + hg_thread_mutex_unlock(&poll_set->lock); + +done: + return ret; + +#if !defined(_WIN32) && !defined(HG_UTIL_HAS_SYSEPOLL_H) && !defined(HG_UTIL_HAS_SYSEVENT_H) +error: + hg_thread_mutex_unlock(&poll_set->lock); + + return ret; +#endif +} + +/*---------------------------------------------------------------------------*/ +int +hg_poll_wait(hg_poll_set_t *poll_set, unsigned int timeout, unsigned int max_events, + struct hg_poll_event *events, unsigned int *actual_events) +{ + int max_poll_events = (int)MIN(max_events, poll_set->max_events); + int nfds = 0, i; + int ret = HG_UTIL_SUCCESS; + +#if defined(_WIN32) + +#elif defined(HG_UTIL_HAS_SYSEPOLL_H) + nfds = epoll_wait(poll_set->fd, poll_set->events, max_poll_events, (int)timeout); + HG_UTIL_CHECK_ERROR(nfds == -1 && errno != EINTR, done, ret, HG_UTIL_FAIL, "epoll_wait() failed (%s)", + strerror(errno)); + + /* Handle signal interrupts */ + if (unlikely(errno == EINTR)) { + events[0].events |= HG_POLLINTR; + *actual_events = 1; + + /* Reset errno */ + errno = 0; + + return HG_UTIL_SUCCESS; + } + + for (i = 0; i < nfds; ++i) { + events[i].events = 0; + events[i].data.u64 = (hg_util_uint64_t)poll_set->events[i].data.u64; + + if (poll_set->events[i].events & EPOLLIN) + events[i].events |= HG_POLLIN; + + if (poll_set->events[i].events & EPOLLOUT) + events[i].events |= HG_POLLOUT; + + /* Don't change the if/else order */ + if (poll_set->events[i].events & EPOLLERR) + events[i].events |= HG_POLLERR; + else if (poll_set->events[i].events & EPOLLHUP) + events[i].events |= HG_POLLHUP; + else if (poll_set->events[i].events & EPOLLRDHUP) + events[i].events |= HG_POLLHUP; + } + + /* Grow array if reached max number */ + if ((nfds == (int)poll_set->max_events) && (poll_set->max_events * 2 <= HG_POLL_MAX_EVENTS)) { + poll_set->events = realloc(poll_set->events, sizeof(*poll_set->events) * poll_set->max_events * 2); + HG_UTIL_CHECK_ERROR(!poll_set->events, done, ret, HG_UTIL_FAIL, "realloc() failed (%s)", + strerror(errno)); + + poll_set->max_events *= 2; + } +#elif defined(HG_UTIL_HAS_SYSEVENT_H) + struct timespec timeout_spec; + ldiv_t ld; + + /* Get sec / nsec */ + ld = ldiv(timeout, 1000L); + timeout_spec.tv_sec = ld.quot; + timeout_spec.tv_nsec = ld.rem * 1000000L; + + nfds = kevent(poll_set->fd, NULL, 0, poll_set->events, max_poll_events, &timeout_spec); + HG_UTIL_CHECK_ERROR(nfds == -1 && errno != EINTR, done, ret, HG_UTIL_FAIL, "kevent() failed (%s)", + strerror(errno)); + + /* Handle signal interrupts */ + if (unlikely(errno == EINTR)) { + 
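+        /* Surface the interrupt as a single synthetic HG_POLLINTR event so
+         * that the caller can decide whether to retry the wait. */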
+        events[0].events |= HG_POLLINTR;
+        *actual_events = 1;
+
+        return HG_UTIL_SUCCESS;
+    }
+
+    for (i = 0; i < nfds; ++i) {
+        events[i].events = 0;
+        events[i].data.ptr = poll_set->events[i].udata;
+
+        if (poll_set->events[i].flags & EVFILT_READ)
+            events[i].events |= HG_POLLIN;
+
+        if (poll_set->events[i].flags & EVFILT_WRITE)
+            events[i].events |= HG_POLLOUT;
+    }
+
+    /* Grow array if reached max number */
+    if ((nfds == (int)poll_set->max_events) && (poll_set->max_events * 2 <= HG_POLL_MAX_EVENTS)) {
+        poll_set->events = realloc(poll_set->events, sizeof(*poll_set->events) * poll_set->max_events * 2);
+        HG_UTIL_CHECK_ERROR(!poll_set->events, done, ret, HG_UTIL_FAIL, "realloc() failed (%s)",
+                            strerror(errno));
+
+        poll_set->max_events *= 2;
+    }
+#else
+    int            nevent = 0, rc;
+    hg_util_bool_t signaled;
+
+    rc = hg_event_get(poll_set->fd, &signaled);
+    HG_UTIL_CHECK_ERROR(rc != HG_UTIL_SUCCESS, done, ret, HG_UTIL_FAIL, "hg_event_get() failed (%s)",
+                        strerror(errno));
+    if (signaled) {
+        /* Should we do anything in that case? */
+    }
+
+    hg_thread_mutex_lock(&poll_set->lock);
+
+    /* Reset revents */
+    for (i = 0; i < (int)poll_set->nfds; i++)
+        poll_set->events[i].revents = 0;
+
+    nfds = poll(poll_set->events, (nfds_t)poll_set->nfds, (int)timeout);
+    HG_UTIL_CHECK_ERROR(nfds == -1 && errno != EINTR, unlock, ret, HG_UTIL_FAIL, "poll() failed (%s)",
+                        strerror(errno));
+
+    /* Handle signal interrupts */
+    if (unlikely(errno == EINTR)) {
+        events[0].events |= HG_POLLINTR;
+        *actual_events = 1;
+        hg_thread_mutex_unlock(&poll_set->lock);
+
+        return HG_UTIL_SUCCESS;
+    }
+
+    nfds = (int)MIN(max_poll_events, nfds);
+
+    /* An event on one of the fds has occurred. */
+    for (i = 0; i < (int)poll_set->nfds && nevent < nfds; ++i) {
+        events[i].events = 0;
+        events[i].data.u64 = (hg_util_uint64_t)poll_set->event_data[i].u64;
+
+        if (poll_set->events[i].revents & POLLIN)
+            events[i].events |= HG_POLLIN;
+
+        if (poll_set->events[i].revents & POLLOUT)
+            events[i].events |= HG_POLLOUT;
+
+        /* Don't change the if/else order */
+        if (poll_set->events[i].revents & POLLERR)
+            events[i].events |= HG_POLLERR;
+        else if (poll_set->events[i].revents & POLLHUP)
+            events[i].events |= HG_POLLHUP;
+        else if (poll_set->events[i].revents & POLLNVAL)
+            events[i].events |= HG_POLLERR;
+
+        nevent++;
+    }
+
+    hg_thread_mutex_unlock(&poll_set->lock);
+
+    HG_UTIL_CHECK_ERROR(nevent != nfds, done, ret, HG_UTIL_FAIL, "found only %d events, expected %d", nevent,
+                        nfds);
+
+    if (nfds > 0) {
+        /* TODO should figure where to call hg_event_get() */
+        rc = hg_event_set(poll_set->fd);
+        HG_UTIL_CHECK_ERROR(rc != HG_UTIL_SUCCESS, done, ret, HG_UTIL_FAIL, "hg_event_set() failed (%s)",
+                            strerror(errno));
+    }
+#endif
+
+    *actual_events = (unsigned int)nfds;
+
+done:
+    return ret;
+
+#if !defined(_WIN32) && !defined(HG_UTIL_HAS_SYSEPOLL_H) && !defined(HG_UTIL_HAS_SYSEVENT_H)
+unlock:
+    hg_thread_mutex_unlock(&poll_set->lock);
+
+    return ret;
+#endif
+}
diff --git a/src/mercury/src/util/mercury_poll.h b/src/mercury/src/util/mercury_poll.h
new file mode 100644
index 00000000000..f4072a59041
--- /dev/null
+++ b/src/mercury/src/util/mercury_poll.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy,
+ * UChicago Argonne, LLC and The HDF Group.
+ * All rights reserved.
+ *
+ * The full copyright notice, including terms governing use, modification,
+ * and redistribution, is contained in the COPYING file that can be
+ * found at the root of the source code distribution tree.
+ */ + +#ifndef MERCURY_POLL_H +#define MERCURY_POLL_H + +#include "mercury_util_config.h" + +/*************************************/ +/* Public Type and Struct Definition */ +/*************************************/ + +typedef struct hg_poll_set hg_poll_set_t; + +typedef union hg_poll_data { + void * ptr; + int fd; + hg_util_uint32_t u32; + hg_util_uint64_t u64; +} hg_poll_data_t; + +struct hg_poll_event { + hg_util_uint32_t events; /* Poll events */ + hg_poll_data_t data; /* User data variable */ +}; + +/*****************/ +/* Public Macros */ +/*****************/ + +/** + * Polling events. + */ +#define HG_POLLIN (1 << 0) /* There is data to read. */ +#define HG_POLLOUT (1 << 1) /* Writing now will not block. */ +#define HG_POLLERR (1 << 2) /* Error condition. */ +#define HG_POLLHUP (1 << 3) /* Hung up. */ +#define HG_POLLINTR (1 << 4) /* Interrupted. */ + +/*********************/ +/* Public Prototypes */ +/*********************/ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Create a new poll set. + * + * \return Pointer to poll set or NULL in case of failure + */ +HG_UTIL_PUBLIC hg_poll_set_t *hg_poll_create(void); + +/** + * Destroy a poll set. + * + * \param poll_set [IN/OUT] pointer to poll set + * + * \return Non-negative on success or negative on failure + */ +HG_UTIL_PUBLIC int hg_poll_destroy(hg_poll_set_t *poll_set); + +/** + * Get a file descriptor from an existing poll set. + * + * \param poll_set [IN] pointer to poll set + * + * \return Non-negative on success or negative on failure + */ +HG_UTIL_PUBLIC int hg_poll_get_fd(hg_poll_set_t *poll_set); + +/** + * Add file descriptor to poll set. + * + * \param poll_set [IN] pointer to poll set + * \param fd [IN] file descriptor + * \param event [IN] pointer to event struct + * + * \return Non-negative on success or negative on failure + */ +HG_UTIL_PUBLIC int hg_poll_add(hg_poll_set_t *poll_set, int fd, struct hg_poll_event *event); + +/** + * Remove file descriptor from poll set. + * + * \param poll_set [IN] pointer to poll set + * \param fd [IN] file descriptor + * + * \return Non-negative on success or negative on failure + */ +HG_UTIL_PUBLIC int hg_poll_remove(hg_poll_set_t *poll_set, int fd); + +/** + * Wait on a poll set for timeout ms, and return at most max_events. + * + * \param poll_set [IN] pointer to poll set + * \param timeout [IN] timeout (in milliseconds) + * \param max_events [IN] max number of events + * \param events [IN/OUT] array of events to be returned + * \param actual_events [OUT] actual number of events returned + * + * \return Non-negative on success or negative on failure + */ +HG_UTIL_PUBLIC int hg_poll_wait(hg_poll_set_t *poll_set, unsigned int timeout, unsigned int max_events, + struct hg_poll_event events[], unsigned int *actual_events); + +#ifdef __cplusplus +} +#endif + +#endif /* MERCURY_POLL_H */ diff --git a/src/mercury/src/util/mercury_queue.h b/src/mercury/src/util/mercury_queue.h new file mode 100644 index 00000000000..116a209beaa --- /dev/null +++ b/src/mercury/src/util/mercury_queue.h @@ -0,0 +1,119 @@ +/* + * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy, + * UChicago Argonne, LLC and The HDF Group. + * All rights reserved. + * + * The full copyright notice, including terms governing use, modification, + * and redistribution, is contained in the COPYING file that can be + * found at the root of the source code distribution tree. 
+ */ + +/* Code below is derived from sys/queue.h which follows the below notice: + * + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)queue.h 8.5 (Berkeley) 8/20/94 + */ + +#ifndef MERCURY_QUEUE_H +#define MERCURY_QUEUE_H + +#define HG_QUEUE_HEAD_INITIALIZER(name) \ + { \ + NULL, &(name).head \ + } + +#define HG_QUEUE_HEAD_INIT(struct_head_name, var_name) \ + struct struct_head_name var_name = HG_QUEUE_HEAD_INITIALIZER(var_name) + +#define HG_QUEUE_HEAD_DECL(struct_head_name, struct_entry_name) \ + struct struct_head_name { \ + struct struct_entry_name * head; \ + struct struct_entry_name **tail; \ + } + +#define HG_QUEUE_HEAD(struct_entry_name) \ + struct { \ + struct struct_entry_name * head; \ + struct struct_entry_name **tail; \ + } + +#define HG_QUEUE_ENTRY(struct_entry_name) \ + struct { \ + struct struct_entry_name *next; \ + } + +#define HG_QUEUE_INIT(head_ptr) \ + do { \ + (head_ptr)->head = NULL; \ + (head_ptr)->tail = &(head_ptr)->head; \ + } while (/*CONSTCOND*/ 0) + +#define HG_QUEUE_IS_EMPTY(head_ptr) ((head_ptr)->head == NULL) + +#define HG_QUEUE_FIRST(head_ptr) ((head_ptr)->head) + +#define HG_QUEUE_NEXT(entry_ptr, entry_field_name) ((entry_ptr)->entry_field_name.next) + +#define HG_QUEUE_PUSH_TAIL(head_ptr, entry_ptr, entry_field_name) \ + do { \ + (entry_ptr)->entry_field_name.next = NULL; \ + *(head_ptr)->tail = (entry_ptr); \ + (head_ptr)->tail = &(entry_ptr)->entry_field_name.next; \ + } while (/*CONSTCOND*/ 0) + +/* TODO would be nice to not have any condition */ +#define HG_QUEUE_POP_HEAD(head_ptr, entry_field_name) \ + do { \ + if ((head_ptr)->head && ((head_ptr)->head = (head_ptr)->head->entry_field_name.next) == NULL) \ + (head_ptr)->tail = &(head_ptr)->head; \ + } while (/*CONSTCOND*/ 0) + +#define HG_QUEUE_FOREACH(var, head_ptr, entry_field_name) \ + for ((var) = ((head_ptr)->head); (var); (var) = ((var)->entry_field_name.next)) + +/** + * Avoid using those for performance reasons or use mercury_list.h instead + */ + +#define 
HG_QUEUE_REMOVE(head_ptr, entry_ptr, type, entry_field_name)                                          \
+    do {                                                                                                    \
+        if ((head_ptr)->head == (entry_ptr)) {                                                              \
+            HG_QUEUE_POP_HEAD((head_ptr), entry_field_name);                                                \
+        }                                                                                                   \
+        else {                                                                                              \
+            struct type *curelm = (head_ptr)->head;                                                         \
+            while (curelm->entry_field_name.next != (entry_ptr))                                            \
+                curelm = curelm->entry_field_name.next;                                                     \
+            if ((curelm->entry_field_name.next = curelm->entry_field_name.next->entry_field_name.next) ==   \
+                NULL)                                                                                       \
+                (head_ptr)->tail = &(curelm)->entry_field_name.next;                                        \
+        }                                                                                                   \
+    } while (/*CONSTCOND*/ 0)
+
+#endif /* MERCURY_QUEUE_H */
diff --git a/src/mercury/src/util/mercury_request.c b/src/mercury/src/util/mercury_request.c
new file mode 100644
index 00000000000..6951c61768f
--- /dev/null
+++ b/src/mercury/src/util/mercury_request.c
@@ -0,0 +1,159 @@
+/*
+ * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy,
+ * UChicago Argonne, LLC and The HDF Group.
+ * All rights reserved.
+ *
+ * The full copyright notice, including terms governing use, modification,
+ * and redistribution, is contained in the COPYING file that can be
+ * found at the root of the source code distribution tree.
+ */
+
+#include "mercury_request.h"
+#include "mercury_thread_condition.h"
+#include "mercury_thread_mutex.h"
+#include "mercury_time.h"
+#include "mercury_util_error.h"
+
+#include <stdlib.h>
+
+/****************/
+/* Local Macros */
+/****************/
+
+/************************************/
+/* Local Type and Struct Definition */
+/************************************/
+
+struct hg_request_class {
+    hg_request_progress_func_t progress_func;
+    hg_request_trigger_func_t  trigger_func;
+    void *                     arg;
+    hg_util_bool_t             progressing;
+    hg_thread_mutex_t          progress_mutex;
+    hg_thread_cond_t           progress_cond;
+};
+
+/********************/
+/* Local Prototypes */
+/********************/
+
+/*******************/
+/* Local Variables */
+/*******************/
+
+/*---------------------------------------------------------------------------*/
+hg_request_class_t *
+hg_request_init(hg_request_progress_func_t progress_func, hg_request_trigger_func_t trigger_func, void *arg)
+{
+    struct hg_request_class *hg_request_class = NULL;
+
+    hg_request_class = (struct hg_request_class *)malloc(sizeof(struct hg_request_class));
+    HG_UTIL_CHECK_ERROR_NORET(hg_request_class == NULL, done, "Could not allocate hg_request_class");
+
+    hg_request_class->progress_func = progress_func;
+    hg_request_class->trigger_func  = trigger_func;
+    hg_request_class->arg           = arg;
+    hg_request_class->progressing   = HG_UTIL_FALSE;
+    hg_thread_mutex_init(&hg_request_class->progress_mutex);
+    hg_thread_cond_init(&hg_request_class->progress_cond);
+
+done:
+    return hg_request_class;
+}
+
+/*---------------------------------------------------------------------------*/
+void
+hg_request_finalize(hg_request_class_t *request_class, void **arg)
+{
+    if (!request_class)
+        return;
+
+    if (arg)
+        *arg = request_class->arg;
+    hg_thread_mutex_destroy(&request_class->progress_mutex);
+    hg_thread_cond_destroy(&request_class->progress_cond);
+    free(request_class);
+}
+
+/*---------------------------------------------------------------------------*/
+hg_request_t *
+hg_request_create(hg_request_class_t *request_class)
+{
+    struct hg_request *hg_request = NULL;
+
+    hg_request = (struct hg_request *)malloc(sizeof(struct hg_request));
+    HG_UTIL_CHECK_ERROR_NORET(hg_request == NULL, done, "Could not allocate hg_request");
+
+    hg_request->request_class = request_class;
+    hg_request->data          = NULL;
+    hg_atomic_init32(&hg_request->completed, HG_UTIL_FALSE);
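+    /* A request starts out "not completed"; hg_request_complete(), typically
+     * called from a trigger callback, flips the flag that hg_request_wait()
+     * polls. */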
+ +done: + return hg_request; +} + +/*---------------------------------------------------------------------------*/ +void +hg_request_destroy(hg_request_t *request) +{ + free(request); +} + +/*---------------------------------------------------------------------------*/ +int +hg_request_wait(hg_request_t *request, unsigned int timeout_ms, unsigned int *flag) +{ + hg_time_t deadline, remaining = hg_time_from_ms(timeout_ms); + hg_time_t now = hg_time_from_ms(0); + hg_util_int32_t completed = HG_UTIL_FALSE; + int ret = HG_UTIL_SUCCESS; + + if (timeout_ms != 0) + hg_time_get_current_ms(&now); + deadline = hg_time_add(now, remaining); + + do { + unsigned int trigger_flag = 0; + int trigger_ret; + + do { + trigger_ret = request->request_class->trigger_func(0, &trigger_flag, request->request_class->arg); + } while ((trigger_ret == HG_UTIL_SUCCESS) && trigger_flag); + + if ((completed = hg_atomic_get32(&request->completed)) == HG_UTIL_TRUE) + break; + + hg_thread_mutex_lock(&request->request_class->progress_mutex); + if (request->request_class->progressing) { + if (hg_thread_cond_timedwait(&request->request_class->progress_cond, + &request->request_class->progress_mutex, + hg_time_to_ms(remaining)) != HG_UTIL_SUCCESS) { + /* Timeout occurred so leave */ + hg_thread_mutex_unlock(&request->request_class->progress_mutex); + break; + } + /* Continue as request may have completed in the meantime */ + hg_thread_mutex_unlock(&request->request_class->progress_mutex); + goto next; + } + request->request_class->progressing = HG_UTIL_TRUE; + hg_thread_mutex_unlock(&request->request_class->progress_mutex); + + request->request_class->progress_func(hg_time_to_ms(remaining), request->request_class->arg); + + hg_thread_mutex_lock(&request->request_class->progress_mutex); + request->request_class->progressing = HG_UTIL_FALSE; + hg_thread_cond_broadcast(&request->request_class->progress_cond); + hg_thread_mutex_unlock(&request->request_class->progress_mutex); + +next: + if (timeout_ms != 0) + hg_time_get_current_ms(&now); + remaining = hg_time_subtract(deadline, now); + } while (hg_time_less(now, deadline)); + + if (flag) + *flag = (unsigned int)completed; + + return ret; +} diff --git a/src/mercury/src/util/mercury_request.h b/src/mercury/src/util/mercury_request.h new file mode 100644 index 00000000000..4d7fdf8c551 --- /dev/null +++ b/src/mercury/src/util/mercury_request.h @@ -0,0 +1,204 @@ +/* + * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy, + * UChicago Argonne, LLC and The HDF Group. + * All rights reserved. + * + * The full copyright notice, including terms governing use, modification, + * and redistribution, is contained in the COPYING file that can be + * found at the root of the source code distribution tree. + */ + +#ifndef MERCURY_REQUEST_H +#define MERCURY_REQUEST_H + +#include "mercury_util_config.h" + +#include "mercury_atomic.h" + +/** + * Purpose: define a request emulation library on top of the callback model + * that uses progress/trigger functions. Note that this library can not be + * safely used within RPCs in most cases - calling hg_request_wait causes + * deadlock when the caller function was triggered by HG_Trigger + * (or HG_Bulk_trigger). 
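+ *
+ * Illustrative call sequence (progress_cb, trigger_cb and ctx are
+ * hypothetical caller-supplied callbacks and context, not part of this API):
+ *
+ *   hg_request_class_t *req_class =
+ *       hg_request_init(progress_cb, trigger_cb, ctx);
+ *   hg_request_t *req = hg_request_create(req_class);
+ *   ... post an operation whose completion calls hg_request_complete(req) ...
+ *   unsigned int completed = 0;
+ *   hg_request_wait(req, 1000, &completed);
+ *   hg_request_destroy(req);
+ *   hg_request_finalize(req_class, NULL);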
+ */ + +typedef struct hg_request_class hg_request_class_t; /* Opaque request class */ +typedef struct hg_request hg_request_t; /* Opaque request object */ + +struct hg_request { + hg_request_class_t *request_class; + void * data; + hg_atomic_int32_t completed; +}; + +/** + * Progress callback, arg can be used to pass extra parameters required by + * underlying API. + * + * \param timeout [IN] timeout (in milliseconds) + * \param arg [IN] pointer to data passed to callback + * + * \return HG_UTIL_SUCCESS if any completion has occurred / error code otherwise + */ +typedef int (*hg_request_progress_func_t)(unsigned int timeout, void *arg); + +/** + * Trigger callback, arg can be used to pass extra parameters required by + * underlying API. + * + * \param timeout [IN] timeout (in milliseconds) + * \param flag [OUT] 1 if callback has been triggered, 0 otherwise + * \param arg [IN] pointer to data passed to callback + * + * \return HG_UTIL_SUCCESS or corresponding error code + */ +typedef int (*hg_request_trigger_func_t)(unsigned int timeout, unsigned int *flag, void *arg); + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Initialize the request class with the specific progress/trigger functions + * that will be called on hg_request_wait(). + * arg can be used to pass extra parameters required by underlying API. + * + * \param progress [IN] progress function + * \param trigger [IN] trigger function + * \param arg [IN] pointer to data passed to callback + * + * \return Pointer to request class or NULL in case of failure + */ +HG_UTIL_PUBLIC hg_request_class_t *hg_request_init(hg_request_progress_func_t progress, + hg_request_trigger_func_t trigger, void *arg); + +/** + * Finalize the request class. User args that were passed through + * hg_request_init() can be retrieved through the \a arg parameter. + * + * \param request_class [IN] pointer to request class + * \param arg [IN/OUT] pointer to init args + */ +HG_UTIL_PUBLIC void hg_request_finalize(hg_request_class_t *request_class, void **arg); + +/** + * Create a new request from a specified request class. The progress function + * explicitly makes progress and may insert the completed operation into a + * completion queue. The operation gets triggered after a call to the trigger + * function. + * + * \param request_class [IN] pointer to request class + * + * \return Pointer to request or NULL in case of failure + */ +HG_UTIL_PUBLIC hg_request_t *hg_request_create(hg_request_class_t *request_class); + +/** + * Destroy the request, freeing the resources. + * + * \param request [IN/OUT] pointer to request + */ +HG_UTIL_PUBLIC void hg_request_destroy(hg_request_t *request); + +/** + * Reset an existing request so that it can be safely re-used. + * + * \param request [IN/OUT] pointer to request + */ +static HG_UTIL_INLINE void hg_request_reset(hg_request_t *request); + +/** + * Mark the request as completed. (most likely called by a callback triggered + * after a call to trigger) + * + * \param request [IN/OUT] pointer to request + */ +static HG_UTIL_INLINE void hg_request_complete(hg_request_t *request); + +/** + * Wait timeout ms for the specified request to complete. 
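+ * Progress is made cooperatively: a single caller runs the progress
+ * function while concurrent waiters block on a condition variable until
+ * it returns or the timeout expires.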
+ *
+ * \param request [IN/OUT] pointer to request
+ * \param timeout [IN] timeout (in milliseconds)
+ * \param flag [OUT] 1 if request has completed, 0 otherwise
+ *
+ * \return Non-negative on success or negative on failure
+ */
+HG_UTIL_PUBLIC int hg_request_wait(hg_request_t *request, unsigned int timeout, unsigned int *flag);
+
+/**
+ * Wait timeout ms for all the specified requests to complete.
+ *
+ * \param count [IN] number of requests
+ * \param request [IN/OUT] array of requests
+ * \param timeout [IN] timeout (in milliseconds)
+ * \param flag [OUT] 1 if all requests have completed, 0 otherwise
+ *
+ * \return Non-negative on success or negative on failure
+ */
+static HG_UTIL_INLINE int hg_request_waitall(int count, hg_request_t *request[], unsigned int timeout,
+                                             unsigned int *flag);
+
+/**
+ * Attach user data to a specified request.
+ *
+ * \param request [IN/OUT] pointer to request
+ * \param data [IN] pointer to data
+ */
+static HG_UTIL_INLINE void hg_request_set_data(hg_request_t *request, void *data);
+
+/**
+ * Get user data from a specified request.
+ *
+ * \param request [IN/OUT] pointer to request
+ *
+ * \return Pointer to data or NULL if nothing was attached by user
+ */
+static HG_UTIL_INLINE void *hg_request_get_data(hg_request_t *request);
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE void
+hg_request_reset(hg_request_t *request)
+{
+    hg_atomic_set32(&request->completed, HG_UTIL_FALSE);
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE void
+hg_request_complete(hg_request_t *request)
+{
+    hg_atomic_set32(&request->completed, HG_UTIL_TRUE);
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE int
+hg_request_waitall(int count, hg_request_t *request[], unsigned int timeout, unsigned int *flag)
+{
+    int i;
+
+    for (i = 0; i < count; i++)
+        hg_request_wait(request[i], timeout, flag);
+
+    return HG_UTIL_SUCCESS;
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE void
+hg_request_set_data(hg_request_t *request, void *data)
+{
+    request->data = data;
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE void *
+hg_request_get_data(hg_request_t *request)
+{
+    return request->data;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* MERCURY_REQUEST_H */
diff --git a/src/mercury/src/util/mercury_thread.c b/src/mercury/src/util/mercury_thread.c
new file mode 100644
index 00000000000..3b1f9a98533
--- /dev/null
+++ b/src/mercury/src/util/mercury_thread.c
@@ -0,0 +1,162 @@
+/*
+ * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy,
+ * UChicago Argonne, LLC and The HDF Group.
+ * All rights reserved.
+ *
+ * The full copyright notice, including terms governing use, modification,
+ * and redistribution, is contained in the COPYING file that can be
+ * found at the root of the source code distribution tree.
+ */ + +#include "mercury_thread.h" + +/*---------------------------------------------------------------------------*/ +void +hg_thread_init(hg_thread_t *thread) +{ +#ifdef _WIN32 + *thread = NULL; +#else + *thread = 0; +#endif +} + +/*---------------------------------------------------------------------------*/ +int +hg_thread_create(hg_thread_t *thread, hg_thread_func_t f, void *data) +{ +#ifdef _WIN32 + *thread = CreateThread(NULL, 0, f, data, 0, NULL); + if (*thread == NULL) + return HG_UTIL_FAIL; +#else + if (pthread_create(thread, NULL, f, data)) + return HG_UTIL_FAIL; +#endif + + return HG_UTIL_SUCCESS; +} + +/*---------------------------------------------------------------------------*/ +void +hg_thread_exit(hg_thread_ret_t ret) +{ +#ifdef _WIN32 + ExitThread(ret); +#else + pthread_exit(ret); +#endif +} + +/*---------------------------------------------------------------------------*/ +int +hg_thread_join(hg_thread_t thread) +{ +#ifdef _WIN32 + WaitForSingleObject(thread, INFINITE); + CloseHandle(thread); +#else + if (pthread_join(thread, NULL)) + return HG_UTIL_FAIL; +#endif + + return HG_UTIL_SUCCESS; +} + +/*---------------------------------------------------------------------------*/ +int +hg_thread_cancel(hg_thread_t thread) +{ +#ifdef _WIN32 + WaitForSingleObject(thread, 0); + CloseHandle(thread); +#else + if (pthread_cancel(thread)) + return HG_UTIL_FAIL; +#endif + + return HG_UTIL_SUCCESS; +} + +/*---------------------------------------------------------------------------*/ +int +hg_thread_yield(void) +{ +#ifdef _WIN32 + SwitchToThread(); +#elif defined(__APPLE__) + pthread_yield_np(); +#else + pthread_yield(); +#endif + + return HG_UTIL_SUCCESS; +} + +/*---------------------------------------------------------------------------*/ +int +hg_thread_key_create(hg_thread_key_t *key) +{ + if (!key) + return HG_UTIL_FAIL; + +#ifdef _WIN32 + if ((*key = TlsAlloc()) == TLS_OUT_OF_INDEXES) + return HG_UTIL_FAIL; +#else + if (pthread_key_create(key, NULL)) + return HG_UTIL_FAIL; +#endif + + return HG_UTIL_SUCCESS; +} + +/*---------------------------------------------------------------------------*/ +int +hg_thread_key_delete(hg_thread_key_t key) +{ +#ifdef _WIN32 + if (!TlsFree(key)) + return HG_UTIL_FAIL; +#else + if (pthread_key_delete(key)) + return HG_UTIL_FAIL; +#endif + + return HG_UTIL_SUCCESS; +} + +/*---------------------------------------------------------------------------*/ +int +hg_thread_getaffinity(hg_thread_t thread, hg_cpu_set_t *cpu_mask) +{ +#if defined(_WIN32) + return HG_UTIL_FAIL; +#elif defined(__APPLE__) + (void)thread; + (void)cpu_mask; + return HG_UTIL_FAIL; +#else + if (pthread_getaffinity_np(thread, sizeof(hg_cpu_set_t), cpu_mask)) + return HG_UTIL_FAIL; + return HG_UTIL_SUCCESS; +#endif +} + +/*---------------------------------------------------------------------------*/ +int +hg_thread_setaffinity(hg_thread_t thread, const hg_cpu_set_t *cpu_mask) +{ +#if defined(_WIN32) + if (!SetThreadAffinityMask(thread, *cpu_mask)) + return HG_UTIL_FAIL; +#elif defined(__APPLE__) + (void)thread; + (void)cpu_mask; + return HG_UTIL_FAIL; +#else + if (pthread_setaffinity_np(thread, sizeof(hg_cpu_set_t), cpu_mask)) + return HG_UTIL_FAIL; + return HG_UTIL_SUCCESS; +#endif +} diff --git a/src/mercury/src/util/mercury_thread.h b/src/mercury/src/util/mercury_thread.h new file mode 100644 index 00000000000..3317c41c287 --- /dev/null +++ b/src/mercury/src/util/mercury_thread.h @@ -0,0 +1,228 @@ +/* + * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy, + 
* UChicago Argonne, LLC and The HDF Group.
+ * All rights reserved.
+ *
+ * The full copyright notice, including terms governing use, modification,
+ * and redistribution, is contained in the COPYING file that can be
+ * found at the root of the source code distribution tree.
+ */
+
+#ifndef MERCURY_THREAD_H
+#define MERCURY_THREAD_H
+
+#if !defined(_WIN32) && !defined(_GNU_SOURCE)
+#define _GNU_SOURCE
+#endif
+#include "mercury_util_config.h"
+
+#ifdef _WIN32
+#include <windows.h>
+typedef HANDLE                 hg_thread_t;
+typedef LPTHREAD_START_ROUTINE hg_thread_func_t;
+typedef DWORD                  hg_thread_ret_t;
+#define HG_THREAD_RETURN_TYPE hg_thread_ret_t WINAPI
+typedef DWORD     hg_thread_key_t;
+typedef DWORD_PTR hg_cpu_set_t;
+#else
+#include <pthread.h>
+typedef pthread_t hg_thread_t;
+typedef void *(*hg_thread_func_t)(void *);
+typedef void *hg_thread_ret_t;
+#define HG_THREAD_RETURN_TYPE hg_thread_ret_t
+typedef pthread_key_t hg_thread_key_t;
+#ifdef __APPLE__
+/* Size definition for CPU sets. */
+#define HG_CPU_SETSIZE 1024
+#define HG_NCPUBITS    (8 * sizeof(hg_cpu_mask_t))
+/* Type for array elements in 'cpu_set_t'. */
+typedef hg_util_uint64_t hg_cpu_mask_t;
+typedef struct {
+    hg_cpu_mask_t bits[HG_CPU_SETSIZE / HG_NCPUBITS];
+} hg_cpu_set_t;
+#else
+typedef cpu_set_t hg_cpu_set_t;
+#endif
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Initialize the thread.
+ *
+ * \param thread [IN/OUT]     pointer to thread object
+ */
+HG_UTIL_PUBLIC void hg_thread_init(hg_thread_t *thread);
+
+/**
+ * Create a new thread for the given function.
+ *
+ * \param thread [IN/OUT]     pointer to thread object
+ * \param f [IN]              pointer to function
+ * \param data [IN]           pointer to data that will be passed to function f
+ *
+ * \return Non-negative on success or negative on failure
+ */
+HG_UTIL_PUBLIC int hg_thread_create(hg_thread_t *thread, hg_thread_func_t f, void *data);
+
+/**
+ * Ends the calling thread.
+ *
+ * \param ret [IN]            exit code for the thread
+ *
+ * \return Non-negative on success or negative on failure
+ */
+HG_UTIL_PUBLIC void hg_thread_exit(hg_thread_ret_t ret);
+
+/**
+ * Wait for thread completion.
+ *
+ * \param thread [IN]         thread object
+ *
+ * \return Non-negative on success or negative on failure
+ */
+HG_UTIL_PUBLIC int hg_thread_join(hg_thread_t thread);
+
+/**
+ * Terminate the thread.
+ *
+ * \param thread [IN]         thread object
+ *
+ * \return Non-negative on success or negative on failure
+ */
+HG_UTIL_PUBLIC int hg_thread_cancel(hg_thread_t thread);
+
+/**
+ * Yield the processor.
+ *
+ * \return Non-negative on success or negative on failure
+ */
+HG_UTIL_PUBLIC int hg_thread_yield(void);
+
+/**
+ * Obtain handle of the calling thread.
+ *
+ * \return Handle of the calling thread
+ */
+static HG_UTIL_INLINE hg_thread_t hg_thread_self(void);
+
+/**
+ * Compare thread IDs.
+ *
+ * \return Non-zero if equal, zero if not equal
+ */
+static HG_UTIL_INLINE int hg_thread_equal(hg_thread_t t1, hg_thread_t t2);
+
+/**
+ * Create a thread-specific data key visible to all threads in the process.
+ *
+ * \param key [OUT]           pointer to thread key object
+ *
+ * \return Non-negative on success or negative on failure
+ */
+HG_UTIL_PUBLIC int hg_thread_key_create(hg_thread_key_t *key);
+
+/**
+ * Delete a thread-specific data key previously returned by
+ * hg_thread_key_create().
+ *
+ * \param key [IN]            thread key object
+ *
+ * \return Non-negative on success or negative on failure
+ */
+HG_UTIL_PUBLIC int hg_thread_key_delete(hg_thread_key_t key);
+
+/**
+ * Get value from specified key.
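+ *
+ * Usage sketch for the key API; my_state is a hypothetical per-thread
+ * pointer, not part of this header:
+ *
+ *     hg_thread_key_t key;
+ *     hg_thread_key_create(&key);
+ *     hg_thread_setspecific(key, my_state);   (set in each thread)
+ *     void *state = hg_thread_getspecific(key);
+ *     hg_thread_key_delete(key);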
+ * + * \param key [IN] thread key object + * + * \return Pointer to data associated to the key + */ +static HG_UTIL_INLINE void *hg_thread_getspecific(hg_thread_key_t key); + +/** + * Set value to specified key. + * + * \param key [IN] thread key object + * \param value [IN] pointer to data that will be associated + * + * \return Non-negative on success or negative on failure + */ +static HG_UTIL_INLINE int hg_thread_setspecific(hg_thread_key_t key, const void *value); + +/** + * Get affinity mask. + * + * \param thread [IN] thread object + * \param cpu_mask [IN/OUT] cpu mask + * + * \return Non-negative on success or negative on failure + */ +HG_UTIL_PUBLIC int hg_thread_getaffinity(hg_thread_t thread, hg_cpu_set_t *cpu_mask); + +/** + * Set affinity mask. + * + * \param thread [IN] thread object + * \param cpu_mask [IN] cpu mask + * + * \return Non-negative on success or negative on failure + */ +HG_UTIL_PUBLIC int hg_thread_setaffinity(hg_thread_t thread, const hg_cpu_set_t *cpu_mask); + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE hg_thread_t +hg_thread_self(void) +{ +#ifdef _WIN32 + return GetCurrentThread(); +#else + return pthread_self(); +#endif +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE int +hg_thread_equal(hg_thread_t t1, hg_thread_t t2) +{ +#ifdef _WIN32 + return GetThreadId(t1) == GetThreadId(t2); +#else + return pthread_equal(t1, t2); +#endif +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE void * +hg_thread_getspecific(hg_thread_key_t key) +{ +#ifdef _WIN32 + return TlsGetValue(key); +#else + return pthread_getspecific(key); +#endif +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE int +hg_thread_setspecific(hg_thread_key_t key, const void *value) +{ +#ifdef _WIN32 + if (!TlsSetValue(key, (LPVOID)value)) + return HG_UTIL_FAIL; +#else + if (pthread_setspecific(key, value)) + return HG_UTIL_FAIL; +#endif + + return HG_UTIL_SUCCESS; +} + +#ifdef __cplusplus +} +#endif + +#endif /* MERCURY_THREAD_H */ diff --git a/src/mercury/src/util/mercury_thread_annotation.h b/src/mercury/src/util/mercury_thread_annotation.h new file mode 100644 index 00000000000..f8613a4d72b --- /dev/null +++ b/src/mercury/src/util/mercury_thread_annotation.h @@ -0,0 +1,39 @@ +/* + * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy, + * UChicago Argonne, LLC and The HDF Group. + * All rights reserved. + * + * The full copyright notice, including terms governing use, modification, + * and redistribution, is contained in the COPYING file that can be + * found at the root of the source code distribution tree. + */ + +#ifndef MERCURY_THREAD_ANNOTATION_H +#define MERCURY_THREAD_ANNOTATION_H + +/* Enable thread safety attributes only with clang. + * The attributes can be safely erased when compiling with other compilers. */ +#if defined(__clang__) && (__clang_major__ > 3) +#define HG_THREAD_ANNOTATION_ATTRIBUTE__(x) __attribute__((x)) +#else +#define HG_THREAD_ANNOTATION_ATTRIBUTE__(x) // no-op +#endif + +#define HG_LOCK_CAPABILITY(x) HG_THREAD_ANNOTATION_ATTRIBUTE__(capability(x)) + +#define HG_LOCK_ACQUIRE(...) HG_THREAD_ANNOTATION_ATTRIBUTE__(acquire_capability(__VA_ARGS__)) + +#define HG_LOCK_ACQUIRE_SHARED(...) HG_THREAD_ANNOTATION_ATTRIBUTE__(acquire_shared_capability(__VA_ARGS__)) + +#define HG_LOCK_RELEASE(...) 
HG_THREAD_ANNOTATION_ATTRIBUTE__(release_capability(__VA_ARGS__))
+
+#define HG_LOCK_RELEASE_SHARED(...) HG_THREAD_ANNOTATION_ATTRIBUTE__(release_shared_capability(__VA_ARGS__))
+
+#define HG_LOCK_TRY_ACQUIRE(...) HG_THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_capability(__VA_ARGS__))
+
+#define HG_LOCK_TRY_ACQUIRE_SHARED(...)                                                                      \
+    HG_THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_shared_capability(__VA_ARGS__))
+
+#define HG_LOCK_NO_THREAD_SAFETY_ANALYSIS HG_THREAD_ANNOTATION_ATTRIBUTE__(no_thread_safety_analysis)
+
+#endif /* MERCURY_THREAD_ANNOTATION_H */
diff --git a/src/mercury/src/util/mercury_thread_condition.c b/src/mercury/src/util/mercury_thread_condition.c
new file mode 100644
index 00000000000..35133eaddd1
--- /dev/null
+++ b/src/mercury/src/util/mercury_thread_condition.c
@@ -0,0 +1,46 @@
+/*
+ * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy,
+ * UChicago Argonne, LLC and The HDF Group.
+ * All rights reserved.
+ *
+ * The full copyright notice, including terms governing use, modification,
+ * and redistribution, is contained in the COPYING file that can be
+ * found at the root of the source code distribution tree.
+ */
+
+#include "mercury_thread_condition.h"
+
+/*---------------------------------------------------------------------------*/
+int
+hg_thread_cond_init(hg_thread_cond_t *cond)
+{
+#ifdef _WIN32
+    InitializeConditionVariable(cond);
+#else
+    pthread_condattr_t attr;
+
+    pthread_condattr_init(&attr);
+#if defined(HG_UTIL_HAS_PTHREAD_CONDATTR_SETCLOCK) && defined(HG_UTIL_HAS_CLOCK_MONOTONIC_COARSE)
+    /* Must set clock ID if using different clock
+     * (CLOCK_MONOTONIC_COARSE not supported here) */
+    pthread_condattr_setclock(&attr, CLOCK_MONOTONIC);
+#endif
+    if (pthread_cond_init(cond, &attr))
+        return HG_UTIL_FAIL;
+    pthread_condattr_destroy(&attr);
+#endif
+
+    return HG_UTIL_SUCCESS;
+}
+
+/*---------------------------------------------------------------------------*/
+int
+hg_thread_cond_destroy(hg_thread_cond_t *cond)
+{
+#ifndef _WIN32
+    if (pthread_cond_destroy(cond))
+        return HG_UTIL_FAIL;
+#endif
+
+    return HG_UTIL_SUCCESS;
+}
diff --git a/src/mercury/src/util/mercury_thread_condition.h b/src/mercury/src/util/mercury_thread_condition.h
new file mode 100644
index 00000000000..c1a3d61dc0b
--- /dev/null
+++ b/src/mercury/src/util/mercury_thread_condition.h
@@ -0,0 +1,176 @@
+/*
+ * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy,
+ * UChicago Argonne, LLC and The HDF Group.
+ * All rights reserved.
+ *
+ * The full copyright notice, including terms governing use, modification,
+ * and redistribution, is contained in the COPYING file that can be
+ * found at the root of the source code distribution tree.
+ */
+
+#ifndef MERCURY_THREAD_CONDITION_H
+#define MERCURY_THREAD_CONDITION_H
+
+#include "mercury_thread_mutex.h"
+
+#ifdef _WIN32
+typedef CONDITION_VARIABLE hg_thread_cond_t;
+#else
+#if defined(HG_UTIL_HAS_PTHREAD_CONDATTR_SETCLOCK) && defined(HG_UTIL_HAS_CLOCK_MONOTONIC_COARSE)
+#include <time.h>
+#elif defined(HG_UTIL_HAS_SYSTIME_H)
+#include <sys/time.h>
+#endif
+#include <pthread.h>
+typedef pthread_cond_t hg_thread_cond_t;
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Initialize the condition.
+ *
+ * \param cond [IN/OUT]       pointer to condition object
+ *
+ * \return Non-negative on success or negative on failure
+ */
+HG_UTIL_PUBLIC int hg_thread_cond_init(hg_thread_cond_t *cond);
+
+/**
+ * Destroy the condition.
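+ *
+ * Note: conditions follow the usual mutex-protected predicate pattern;
+ * a sketch with a hypothetical "ready" flag:
+ *
+ *     hg_thread_mutex_lock(&mutex);
+ *     while (!ready)
+ *         hg_thread_cond_wait(&cond, &mutex);
+ *     hg_thread_mutex_unlock(&mutex);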
+ * + * \param cond [IN/OUT] pointer to condition object + * + * \return Non-negative on success or negative on failure + */ +HG_UTIL_PUBLIC int hg_thread_cond_destroy(hg_thread_cond_t *cond); + +/** + * Wake one thread waiting for the condition to change. + * + * \param cond [IN/OUT] pointer to condition object + * + * \return Non-negative on success or negative on failure + */ +static HG_UTIL_INLINE int hg_thread_cond_signal(hg_thread_cond_t *cond); + +/** + * Wake all the threads waiting for the condition to change. + * + * \param cond [IN/OUT] pointer to condition object + * + * \return Non-negative on success or negative on failure + */ +static HG_UTIL_INLINE int hg_thread_cond_broadcast(hg_thread_cond_t *cond); + +/** + * Wait for the condition to change. + * + * \param cond [IN/OUT] pointer to condition object + * \param mutex [IN/OUT] pointer to mutex object + * + * \return Non-negative on success or negative on failure + */ +static HG_UTIL_INLINE int hg_thread_cond_wait(hg_thread_cond_t *cond, hg_thread_mutex_t *mutex); + +/** + * Wait timeout ms for the condition to change. + * + * \param cond [IN/OUT] pointer to condition object + * \param mutex [IN/OUT] pointer to mutex object + * \param timeout [IN] timeout (in milliseconds) + * + * \return Non-negative on success or negative on failure + */ +static HG_UTIL_INLINE int hg_thread_cond_timedwait(hg_thread_cond_t *cond, hg_thread_mutex_t *mutex, + unsigned int timeout); + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE int +hg_thread_cond_signal(hg_thread_cond_t *cond) +{ +#ifdef _WIN32 + WakeConditionVariable(cond); +#else + if (pthread_cond_signal(cond)) + return HG_UTIL_FAIL; +#endif + + return HG_UTIL_SUCCESS; +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE int +hg_thread_cond_broadcast(hg_thread_cond_t *cond) +{ +#ifdef _WIN32 + WakeAllConditionVariable(cond); +#else + if (pthread_cond_broadcast(cond)) + return HG_UTIL_FAIL; +#endif + + return HG_UTIL_SUCCESS; +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE int +hg_thread_cond_wait(hg_thread_cond_t *cond, hg_thread_mutex_t *mutex) +{ +#ifdef _WIN32 + if (!SleepConditionVariableCS(cond, mutex, INFINITE)) + return HG_UTIL_FAIL; +#else + if (pthread_cond_wait(cond, mutex)) + return HG_UTIL_FAIL; +#endif + + return HG_UTIL_SUCCESS; +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE int +hg_thread_cond_timedwait(hg_thread_cond_t *cond, hg_thread_mutex_t *mutex, unsigned int timeout) +{ +#ifdef _WIN32 + if (!SleepConditionVariableCS(cond, mutex, timeout)) + return HG_UTIL_FAIL; +#else +#if defined(HG_UTIL_HAS_PTHREAD_CONDATTR_SETCLOCK) && defined(HG_UTIL_HAS_CLOCK_MONOTONIC_COARSE) + struct timespec now; +#else + struct timeval now; +#endif + struct timespec abs_timeout; + ldiv_t ld; + + /* Need to convert timeout (ms) to absolute time */ +#if defined(HG_UTIL_HAS_PTHREAD_CONDATTR_SETCLOCK) && defined(HG_UTIL_HAS_CLOCK_MONOTONIC_COARSE) + clock_gettime(CLOCK_MONOTONIC_COARSE, &now); + + /* Get sec / nsec */ + ld = ldiv(now.tv_nsec + timeout * 1000000L, 1000000000L); + abs_timeout.tv_nsec = ld.rem; +#elif defined(HG_UTIL_HAS_SYSTIME_H) + gettimeofday(&now, NULL); + + /* Get sec / usec */ + ld = ldiv(now.tv_usec + timeout * 1000L, 1000000L); + abs_timeout.tv_nsec = ld.rem * 1000L; +#endif + abs_timeout.tv_sec = now.tv_sec + ld.quot; + + if 
(pthread_cond_timedwait(cond, mutex, &abs_timeout))
+        return HG_UTIL_FAIL;
+#endif
+
+    return HG_UTIL_SUCCESS;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* MERCURY_THREAD_CONDITION_H */
diff --git a/src/mercury/src/util/mercury_thread_mutex.c b/src/mercury/src/util/mercury_thread_mutex.c
new file mode 100644
index 00000000000..5a5d978b514
--- /dev/null
+++ b/src/mercury/src/util/mercury_thread_mutex.c
@@ -0,0 +1,94 @@
+/*
+ * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy,
+ * UChicago Argonne, LLC and The HDF Group.
+ * All rights reserved.
+ *
+ * The full copyright notice, including terms governing use, modification,
+ * and redistribution, is contained in the COPYING file that can be
+ * found at the root of the source code distribution tree.
+ */
+
+#include "mercury_thread_mutex.h"
+
+#include "mercury_util_error.h"
+
+#include <string.h>
+
+#ifndef _WIN32
+static int
+hg_thread_mutex_init_posix(hg_thread_mutex_t *mutex, int kind)
+{
+    pthread_mutexattr_t mutex_attr;
+    int                 ret = HG_UTIL_SUCCESS;
+    int                 rc;
+
+    rc = pthread_mutexattr_init(&mutex_attr);
+    HG_UTIL_CHECK_ERROR(rc != 0, done, ret, HG_UTIL_FAIL, "pthread_mutexattr_init() failed (%s)",
+                        strerror(rc));
+
+    /* Keep mutex mode as normal and do not expect error checking */
+    rc = pthread_mutexattr_settype(&mutex_attr, kind);
+    HG_UTIL_CHECK_ERROR(rc != 0, done, ret, HG_UTIL_FAIL, "pthread_mutexattr_settype() failed (%s)",
+                        strerror(rc));
+
+    rc = pthread_mutex_init(mutex, &mutex_attr);
+    HG_UTIL_CHECK_ERROR(rc != 0, done, ret, HG_UTIL_FAIL, "pthread_mutex_init() failed (%s)", strerror(rc));
+
+done:
+    rc = pthread_mutexattr_destroy(&mutex_attr);
+    HG_UTIL_CHECK_ERROR_DONE(rc != 0, "pthread_mutexattr_destroy() failed (%s)", strerror(rc));
+
+    return ret;
+}
+#endif
+
+/*---------------------------------------------------------------------------*/
+int
+hg_thread_mutex_init(hg_thread_mutex_t *mutex)
+{
+    int ret = HG_UTIL_SUCCESS;
+
+#ifdef _WIN32
+    InitializeCriticalSection(mutex);
+#else
+    ret = hg_thread_mutex_init_posix(mutex, PTHREAD_MUTEX_NORMAL);
+#endif
+
+    return ret;
+}
+
+/*---------------------------------------------------------------------------*/
+int
+hg_thread_mutex_init_fast(hg_thread_mutex_t *mutex)
+{
+    int ret = HG_UTIL_SUCCESS;
+
+#ifdef HG_UTIL_HAS_PTHREAD_MUTEX_ADAPTIVE_NP
+    /* Set type to PTHREAD_MUTEX_ADAPTIVE_NP to improve performance */
+    ret = hg_thread_mutex_init_posix(mutex, PTHREAD_MUTEX_ADAPTIVE_NP);
+#else
+    ret = hg_thread_mutex_init_posix(mutex, PTHREAD_MUTEX_NORMAL);
+#endif
+
+    return ret;
+}
+
+/*---------------------------------------------------------------------------*/
+int
+hg_thread_mutex_destroy(hg_thread_mutex_t *mutex)
+{
+    int ret = HG_UTIL_SUCCESS;
+
+#ifdef _WIN32
+    DeleteCriticalSection(mutex);
+#else
+    int rc;
+
+    rc = pthread_mutex_destroy(mutex);
+    HG_UTIL_CHECK_ERROR(rc != 0, done, ret, HG_UTIL_FAIL, "pthread_mutex_destroy() failed (%s)",
+                        strerror(rc));
+
+done:
+#endif
+    return ret;
+}
diff --git a/src/mercury/src/util/mercury_thread_mutex.h b/src/mercury/src/util/mercury_thread_mutex.h
new file mode 100644
index 00000000000..b400952c884
--- /dev/null
+++ b/src/mercury/src/util/mercury_thread_mutex.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy,
+ * UChicago Argonne, LLC and The HDF Group.
+ * All rights reserved.
+ *
+ * The full copyright notice, including terms governing use, modification,
+ * and redistribution, is contained in the COPYING file that can be
+ * found at the root of the source code distribution tree.
+ */
+
+#ifndef MERCURY_THREAD_MUTEX_H
+#define MERCURY_THREAD_MUTEX_H
+
+#include "mercury_util_config.h"
+
+#include "mercury_thread_annotation.h"
+
+#ifdef _WIN32
+#include <windows.h>
+#define HG_THREAD_MUTEX_INITIALIZER NULL
+typedef CRITICAL_SECTION hg_thread_mutex_t;
+#else
+#include <pthread.h>
+#define HG_THREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER
+typedef pthread_mutex_t HG_LOCK_CAPABILITY("mutex") hg_thread_mutex_t;
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Initialize the mutex.
+ *
+ * \param mutex [IN/OUT]      pointer to mutex object
+ *
+ * \return Non-negative on success or negative on failure
+ */
+HG_UTIL_PUBLIC int hg_thread_mutex_init(hg_thread_mutex_t *mutex);
+
+/**
+ * Initialize the mutex, asking for "fast" mutex.
+ *
+ * \param mutex [IN/OUT]      pointer to mutex object
+ *
+ * \return Non-negative on success or negative on failure
+ */
+HG_UTIL_PUBLIC int hg_thread_mutex_init_fast(hg_thread_mutex_t *mutex);
+
+/**
+ * Destroy the mutex.
+ *
+ * \param mutex [IN/OUT]      pointer to mutex object
+ *
+ * \return Non-negative on success or negative on failure
+ */
+HG_UTIL_PUBLIC int hg_thread_mutex_destroy(hg_thread_mutex_t *mutex);
+
+/**
+ * Lock the mutex.
+ *
+ * \param mutex [IN/OUT]      pointer to mutex object
+ */
+static HG_UTIL_INLINE void hg_thread_mutex_lock(hg_thread_mutex_t *mutex) HG_LOCK_ACQUIRE(*mutex);
+
+/**
+ * Try locking the mutex.
+ *
+ * \param mutex [IN/OUT]      pointer to mutex object
+ *
+ * \return Non-negative on success or negative on failure
+ */
+static HG_UTIL_INLINE int hg_thread_mutex_try_lock(hg_thread_mutex_t *mutex)
+    HG_LOCK_TRY_ACQUIRE(HG_UTIL_SUCCESS, *mutex);
+
+/**
+ * Unlock the mutex.
+ *
+ * \param mutex [IN/OUT]      pointer to mutex object
+ */
+static HG_UTIL_INLINE void hg_thread_mutex_unlock(hg_thread_mutex_t *mutex) HG_LOCK_RELEASE(*mutex);
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE void
+hg_thread_mutex_lock(hg_thread_mutex_t *mutex) HG_LOCK_NO_THREAD_SAFETY_ANALYSIS
+{
+#ifdef _WIN32
+    EnterCriticalSection(mutex);
+#else
+    (void)pthread_mutex_lock(mutex);
+#endif
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE int
+hg_thread_mutex_try_lock(hg_thread_mutex_t *mutex) HG_LOCK_NO_THREAD_SAFETY_ANALYSIS
+{
+#ifdef _WIN32
+    if (!TryEnterCriticalSection(mutex))
+        return HG_UTIL_FAIL;
+#else
+    if (pthread_mutex_trylock(mutex))
+        return HG_UTIL_FAIL;
+#endif
+
+    return HG_UTIL_SUCCESS;
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE void
+hg_thread_mutex_unlock(hg_thread_mutex_t *mutex) HG_LOCK_NO_THREAD_SAFETY_ANALYSIS
+{
+#ifdef _WIN32
+    LeaveCriticalSection(mutex);
+#else
+    (void)pthread_mutex_unlock(mutex);
+#endif
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* MERCURY_THREAD_MUTEX_H */
diff --git a/src/mercury/src/util/mercury_thread_pool.c b/src/mercury/src/util/mercury_thread_pool.c
new file mode 100644
index 00000000000..eb2d7da0cb0
--- /dev/null
+++ b/src/mercury/src/util/mercury_thread_pool.c
@@ -0,0 +1,179 @@
+/*
+ * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy,
+ * UChicago Argonne, LLC and The HDF Group.
+ * All rights reserved.
+ *
+ * The full copyright notice, including terms governing use, modification,
+ * and redistribution, is contained in the COPYING file that can be
+ * found at the root of the source code distribution tree.
+ */
+
+#include "mercury_thread_pool.h"
+
+#include "mercury_util_error.h"
+
+#include <stdlib.h>
+
+/****************/
+/* Local Macros */
+/****************/
+
+/************************************/
+/* Local Type and Struct Definition */
+/************************************/
+
+struct hg_thread_pool_private {
+    struct hg_thread_pool pool;
+    unsigned int          thread_count;
+    hg_thread_t *         threads;
+};
+
+/********************/
+/* Local Prototypes */
+/********************/
+
+/**
+ * Worker thread run by the thread pool
+ */
+static HG_THREAD_RETURN_TYPE hg_thread_pool_worker(void *args);
+
+/*******************/
+/* Local Variables */
+/*******************/
+
+/*---------------------------------------------------------------------------*/
+static HG_THREAD_RETURN_TYPE
+hg_thread_pool_worker(void *args)
+{
+    hg_thread_ret_t        ret  = 0;
+    hg_thread_pool_t *     pool = (hg_thread_pool_t *)args;
+    struct hg_thread_work *work;
+
+    while (1) {
+        hg_thread_mutex_lock(&pool->mutex);
+
+        /* If not shutting down and nothing to do, worker sleeps */
+        while (!pool->shutdown && HG_QUEUE_IS_EMPTY(&pool->queue)) {
+            int rc;
+
+            pool->sleeping_worker_count++;
+
+            rc = hg_thread_cond_wait(&pool->cond, &pool->mutex);
+            HG_UTIL_CHECK_ERROR_NORET(rc != HG_UTIL_SUCCESS, unlock,
+                                      "Thread cannot wait on condition variable");
+
+            pool->sleeping_worker_count--;
+        }
+
+        if (pool->shutdown && HG_QUEUE_IS_EMPTY(&pool->queue))
+            goto unlock;
+
+        /* Grab our task */
+        work = HG_QUEUE_FIRST(&pool->queue);
+        HG_QUEUE_POP_HEAD(&pool->queue, entry);
+
+        /* Unlock */
+        hg_thread_mutex_unlock(&pool->mutex);
+
+        /* Get to work */
+        (*work->func)(work->args);
+    }
+
+unlock:
+    hg_thread_mutex_unlock(&pool->mutex);
+
+    return ret;
+}
+
+/*---------------------------------------------------------------------------*/
+int
+hg_thread_pool_init(unsigned int thread_count, hg_thread_pool_t **pool_ptr)
+{
+    int                            ret       = HG_UTIL_SUCCESS, rc;
+    struct hg_thread_pool_private *priv_pool = NULL;
+    unsigned int                   i;
+
+    HG_UTIL_CHECK_ERROR(pool_ptr == NULL, error, ret, HG_UTIL_FAIL, "NULL pointer");
+
+    priv_pool = (struct hg_thread_pool_private *)malloc(sizeof(struct hg_thread_pool_private));
+    HG_UTIL_CHECK_ERROR(priv_pool == NULL, error, ret, HG_UTIL_FAIL, "Could not allocate thread pool");
+
+    priv_pool->pool.sleeping_worker_count = 0;
+    priv_pool->thread_count               = thread_count;
+    priv_pool->threads                    = NULL;
+    HG_QUEUE_INIT(&priv_pool->pool.queue);
+    priv_pool->pool.shutdown = 0;
+
+    rc = hg_thread_mutex_init(&priv_pool->pool.mutex);
+    HG_UTIL_CHECK_ERROR(rc != HG_UTIL_SUCCESS, error, ret, HG_UTIL_FAIL, "Could not initialize mutex");
+
+    rc = hg_thread_cond_init(&priv_pool->pool.cond);
+    HG_UTIL_CHECK_ERROR(rc != HG_UTIL_SUCCESS, error, ret, HG_UTIL_FAIL,
+                        "Could not initialize thread condition");
+
+    priv_pool->threads = (hg_thread_t *)malloc(thread_count * sizeof(hg_thread_t));
+    HG_UTIL_CHECK_ERROR(!priv_pool->threads, error, ret, HG_UTIL_FAIL,
+                        "Could not allocate thread pool array");
+
+    /* Start worker threads */
+    for (i = 0; i < thread_count; i++) {
+        rc = hg_thread_create(&priv_pool->threads[i], hg_thread_pool_worker, (void *)priv_pool);
+        HG_UTIL_CHECK_ERROR(rc != HG_UTIL_SUCCESS, error, ret, HG_UTIL_FAIL, "Could not create thread");
+    }
+
+    *pool_ptr = (struct hg_thread_pool *)priv_pool;
+
+    return ret;
+
+error:
+    if (priv_pool)
+
hg_thread_pool_destroy((struct hg_thread_pool *)priv_pool); + + return ret; +} + +/*---------------------------------------------------------------------------*/ +int +hg_thread_pool_destroy(hg_thread_pool_t *pool) +{ + struct hg_thread_pool_private *priv_pool = (struct hg_thread_pool_private *)pool; + int ret = HG_UTIL_SUCCESS, rc; + unsigned int i; + + if (!priv_pool) + goto done; + + if (priv_pool->threads) { + hg_thread_mutex_lock(&priv_pool->pool.mutex); + + priv_pool->pool.shutdown = 1; + + rc = hg_thread_cond_broadcast(&priv_pool->pool.cond); + HG_UTIL_CHECK_ERROR(rc != HG_UTIL_SUCCESS, error, ret, HG_UTIL_FAIL, + "Could not broadcast condition signal"); + + hg_thread_mutex_unlock(&priv_pool->pool.mutex); + + for (i = 0; i < priv_pool->thread_count; i++) { + rc = hg_thread_join(priv_pool->threads[i]); + HG_UTIL_CHECK_ERROR(rc != HG_UTIL_SUCCESS, done, ret, HG_UTIL_FAIL, "Could not join thread"); + } + } + + rc = hg_thread_mutex_destroy(&priv_pool->pool.mutex); + HG_UTIL_CHECK_ERROR(rc != HG_UTIL_SUCCESS, done, ret, HG_UTIL_FAIL, "Could not destroy mutex"); + + rc = hg_thread_cond_destroy(&priv_pool->pool.cond); + HG_UTIL_CHECK_ERROR(rc != HG_UTIL_SUCCESS, done, ret, HG_UTIL_FAIL, "Could not destroy thread condition"); + + free(priv_pool->threads); + free(priv_pool); + +done: + return ret; + +error: + hg_thread_mutex_unlock(&priv_pool->pool.mutex); + + return ret; +} diff --git a/src/mercury/src/util/mercury_thread_pool.h b/src/mercury/src/util/mercury_thread_pool.h new file mode 100644 index 00000000000..db973d13937 --- /dev/null +++ b/src/mercury/src/util/mercury_thread_pool.h @@ -0,0 +1,118 @@ +/* + * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy, + * UChicago Argonne, LLC and The HDF Group. + * All rights reserved. + * + * The full copyright notice, including terms governing use, modification, + * and redistribution, is contained in the COPYING file that can be + * found at the root of the source code distribution tree. + */ + +#ifndef MERCURY_THREAD_POOL_H +#define MERCURY_THREAD_POOL_H + +#include "mercury_queue.h" +#include "mercury_thread.h" +#include "mercury_thread_condition.h" + +/*************************************/ +/* Public Type and Struct Definition */ +/*************************************/ + +typedef struct hg_thread_pool hg_thread_pool_t; + +struct hg_thread_pool { + unsigned int sleeping_worker_count; + HG_QUEUE_HEAD(hg_thread_work) queue; + int shutdown; + hg_thread_mutex_t mutex; + hg_thread_cond_t cond; +}; + +struct hg_thread_work { + hg_thread_func_t func; + void * args; + HG_QUEUE_ENTRY(hg_thread_work) entry; /* Internal */ +}; + +/*****************/ +/* Public Macros */ +/*****************/ + +/*********************/ +/* Public Prototypes */ +/*********************/ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Initialize the thread pool. + * + * \param thread_count [IN] number of threads that will be created at + * initialization + * \param pool [OUT] pointer to pool object + * + * \return Non-negative on success or negative on failure + */ +HG_UTIL_PUBLIC int hg_thread_pool_init(unsigned int thread_count, hg_thread_pool_t **pool); + +/** + * Destroy the thread pool. + * + * \param pool [IN/OUT] pointer to pool object + * + * \return Non-negative on success or negative on failure + */ +HG_UTIL_PUBLIC int hg_thread_pool_destroy(hg_thread_pool_t *pool); + +/** + * Post work to the pool. Note that the operation may be queued depending on + * the number of threads and number of tasks already running. 
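+ *
+ * Posting work, as a sketch; do_io and my_args are hypothetical, and the
+ * work struct must stay alive until the task has run:
+ *
+ *     hg_thread_pool_t *pool = NULL;
+ *     struct hg_thread_work work = {.func = do_io, .args = my_args};
+ *     hg_thread_pool_init(4, &pool);
+ *     hg_thread_pool_post(pool, &work);
+ *     hg_thread_pool_destroy(pool);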
+ * + * \param pool [IN/OUT] pointer to pool object + * \param work [IN] pointer to work struct + * + * \return Non-negative on success or negative on failure + */ +static HG_UTIL_INLINE int hg_thread_pool_post(hg_thread_pool_t *pool, struct hg_thread_work *work); + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE int +hg_thread_pool_post(hg_thread_pool_t *pool, struct hg_thread_work *work) +{ + int ret = HG_UTIL_SUCCESS; + + if (!pool || !work) + return HG_UTIL_FAIL; + + if (!work->func) + return HG_UTIL_FAIL; + + hg_thread_mutex_lock(&pool->mutex); + + /* Are we shutting down ? */ + if (pool->shutdown) { + ret = HG_UTIL_FAIL; + goto unlock; + } + + /* Add task to task queue */ + HG_QUEUE_PUSH_TAIL(&pool->queue, work, entry); + + /* Wake up sleeping worker */ + if (pool->sleeping_worker_count && (hg_thread_cond_signal(&pool->cond) != HG_UTIL_SUCCESS)) + ret = HG_UTIL_FAIL; + +unlock: + hg_thread_mutex_unlock(&pool->mutex); + + return ret; +} + +#ifdef __cplusplus +} +#endif + +#endif /* MERCURY_THREAD_POOL_H */ diff --git a/src/mercury/src/util/mercury_thread_rwlock.c b/src/mercury/src/util/mercury_thread_rwlock.c new file mode 100644 index 00000000000..9ef888999c1 --- /dev/null +++ b/src/mercury/src/util/mercury_thread_rwlock.c @@ -0,0 +1,90 @@ +/* + * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy, + * UChicago Argonne, LLC and The HDF Group. + * All rights reserved. + * + * The full copyright notice, including terms governing use, modification, + * and redistribution, is contained in the COPYING file that can be + * found at the root of the source code distribution tree. + */ + +/* Copyright (C) 2017 Intel Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted for any purpose (including commercial purposes) + * provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions, and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions, and the following disclaimer in the + * documentation and/or materials provided with the distribution. + * + * 3. In addition, redistributions of modified forms of the source or binary + * code must carry prominent notices stating that the original code was + * changed and the date of the change. + * + * 4. All publications or advertising materials mentioning features or use of + * this software are asked, but not required, to acknowledge that it was + * developed by Intel Corporation and credit the contributors. + * + * 5. Neither the name of Intel Corporation, nor the name of any Contributor + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "mercury_thread_rwlock.h"
+
+#include "mercury_util_error.h"
+
+#include <string.h>
+
+/*---------------------------------------------------------------------------*/
+int
+hg_thread_rwlock_init(hg_thread_rwlock_t *rwlock)
+{
+    int ret = HG_UTIL_SUCCESS;
+
+#ifdef _WIN32
+    InitializeSRWLock(rwlock);
+#else
+    int rc = pthread_rwlock_init(rwlock, NULL);
+    HG_UTIL_CHECK_ERROR(rc != 0, done, ret, HG_UTIL_FAIL, "pthread_rwlock_init() failed (%s)", strerror(rc));
+
+done:
+#endif
+
+    return ret;
+}
+
+/*---------------------------------------------------------------------------*/
+int
+hg_thread_rwlock_destroy(hg_thread_rwlock_t *rwlock)
+{
+    int ret = HG_UTIL_SUCCESS;
+
+#ifdef _WIN32
+    /* nothing to do */
+#else
+    int rc = pthread_rwlock_destroy(rwlock);
+    HG_UTIL_CHECK_ERROR(rc != 0, done, ret, HG_UTIL_FAIL, "pthread_rwlock_destroy() failed (%s)",
+                        strerror(rc));
+
+done:
+#endif
+
+    return ret;
+}
diff --git a/src/mercury/src/util/mercury_thread_rwlock.h b/src/mercury/src/util/mercury_thread_rwlock.h
new file mode 100644
index 00000000000..f03d2aa3372
--- /dev/null
+++ b/src/mercury/src/util/mercury_thread_rwlock.h
@@ -0,0 +1,215 @@
+/*
+ * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy,
+ * UChicago Argonne, LLC and The HDF Group.
+ * All rights reserved.
+ *
+ * The full copyright notice, including terms governing use, modification,
+ * and redistribution, is contained in the COPYING file that can be
+ * found at the root of the source code distribution tree.
+ */
+
+/* Copyright (C) 2017 Intel Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted for any purpose (including commercial purposes)
+ * provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions, and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions, and the following disclaimer in the
+ *    documentation and/or materials provided with the distribution.
+ *
+ * 3. In addition, redistributions of modified forms of the source or binary
+ *    code must carry prominent notices stating that the original code was
+ *    changed and the date of the change.
+ *
+ * 4. All publications or advertising materials mentioning features or use of
+ *    this software are asked, but not required, to acknowledge that it was
+ *    developed by Intel Corporation and credit the contributors.
+ *
+ * 5. Neither the name of Intel Corporation, nor the name of any Contributor
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef MERCURY_THREAD_RWLOCK_H
+#define MERCURY_THREAD_RWLOCK_H
+
+#include "mercury_util_config.h"
+
+#include "mercury_thread_annotation.h"
+
+#ifdef _WIN32
+#include <windows.h>
+typedef PSRWLOCK hg_thread_rwlock_t;
+#else
+#include <pthread.h>
+typedef pthread_rwlock_t HG_LOCK_CAPABILITY("rwlock") hg_thread_rwlock_t;
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Initialize the rwlock.
+ *
+ * \param rwlock [IN/OUT]     pointer to rwlock object
+ *
+ * \return Non-negative on success or negative on failure
+ */
+HG_UTIL_PUBLIC int hg_thread_rwlock_init(hg_thread_rwlock_t *rwlock);
+
+/**
+ * Destroy the rwlock.
+ *
+ * \param rwlock [IN/OUT]     pointer to rwlock object
+ *
+ * \return Non-negative on success or negative on failure
+ */
+HG_UTIL_PUBLIC int hg_thread_rwlock_destroy(hg_thread_rwlock_t *rwlock);
+
+/**
+ * Take a read lock for the rwlock.
+ *
+ * \param rwlock [IN/OUT]     pointer to rwlock object
+ */
+static HG_UTIL_INLINE void hg_thread_rwlock_rdlock(hg_thread_rwlock_t *rwlock)
+    HG_LOCK_ACQUIRE_SHARED(*rwlock);
+
+/**
+ * Try to take a read lock for the rwlock.
+ *
+ * \param rwlock [IN/OUT]     pointer to rwlock object
+ *
+ * \return Non-negative on success or negative on failure
+ */
+static HG_UTIL_INLINE int hg_thread_rwlock_try_rdlock(hg_thread_rwlock_t *rwlock)
+    HG_LOCK_TRY_ACQUIRE_SHARED(HG_UTIL_SUCCESS, *rwlock);
+
+/**
+ * Release the read lock of the rwlock.
+ *
+ * \param rwlock [IN/OUT]     pointer to rwlock object
+ */
+static HG_UTIL_INLINE void hg_thread_rwlock_release_rdlock(hg_thread_rwlock_t *rwlock)
+    HG_LOCK_RELEASE_SHARED(*rwlock);
+
+/**
+ * Take a write lock for the rwlock.
+ *
+ * \param rwlock [IN/OUT]     pointer to rwlock object
+ */
+static HG_UTIL_INLINE void hg_thread_rwlock_wrlock(hg_thread_rwlock_t *rwlock) HG_LOCK_ACQUIRE(*rwlock);
+
+/**
+ * Try to take a write lock for the rwlock.
+ *
+ * \param rwlock [IN/OUT]     pointer to rwlock object
+ *
+ * \return Non-negative on success or negative on failure
+ */
+static HG_UTIL_INLINE int hg_thread_rwlock_try_wrlock(hg_thread_rwlock_t *rwlock)
+    HG_LOCK_TRY_ACQUIRE(HG_UTIL_SUCCESS, *rwlock);
+
+/**
+ * Release the write lock of the rwlock.
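+ *
+ * Typical pattern, as a sketch: multiple readers may hold the lock
+ * concurrently, while a writer has exclusive access:
+ *
+ *     hg_thread_rwlock_rdlock(&rwlock);            (shared read)
+ *     hg_thread_rwlock_release_rdlock(&rwlock);
+ *
+ *     hg_thread_rwlock_wrlock(&rwlock);            (exclusive write)
+ *     hg_thread_rwlock_release_wrlock(&rwlock);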
+ *
+ * \param rwlock [IN/OUT]     pointer to rwlock object
+ */
+static HG_UTIL_INLINE void hg_thread_rwlock_release_wrlock(hg_thread_rwlock_t *rwlock)
+    HG_LOCK_RELEASE(*rwlock);
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE void
+hg_thread_rwlock_rdlock(hg_thread_rwlock_t *rwlock) HG_LOCK_NO_THREAD_SAFETY_ANALYSIS
+{
+#ifdef _WIN32
+    AcquireSRWLockShared(rwlock);
+#else
+    (void)pthread_rwlock_rdlock(rwlock);
+#endif
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE int
+hg_thread_rwlock_try_rdlock(hg_thread_rwlock_t *rwlock) HG_LOCK_NO_THREAD_SAFETY_ANALYSIS
+{
+#ifdef _WIN32
+    if (TryAcquireSRWLockShared(rwlock) == 0)
+        return HG_UTIL_FAIL;
+#else
+    if (pthread_rwlock_tryrdlock(rwlock))
+        return HG_UTIL_FAIL;
+#endif
+
+    return HG_UTIL_SUCCESS;
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE void
+hg_thread_rwlock_release_rdlock(hg_thread_rwlock_t *rwlock) HG_LOCK_NO_THREAD_SAFETY_ANALYSIS
+{
+#ifdef _WIN32
+    ReleaseSRWLockShared(rwlock);
+#else
+    (void)pthread_rwlock_unlock(rwlock);
+#endif
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE void
+hg_thread_rwlock_wrlock(hg_thread_rwlock_t *rwlock) HG_LOCK_NO_THREAD_SAFETY_ANALYSIS
+{
+#ifdef _WIN32
+    AcquireSRWLockExclusive(rwlock);
+#else
+    (void)pthread_rwlock_wrlock(rwlock);
+#endif
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE int
+hg_thread_rwlock_try_wrlock(hg_thread_rwlock_t *rwlock) HG_LOCK_NO_THREAD_SAFETY_ANALYSIS
+{
+#ifdef _WIN32
+    if (TryAcquireSRWLockExclusive(rwlock) == 0)
+        return HG_UTIL_FAIL;
+#else
+    if (pthread_rwlock_trywrlock(rwlock))
+        return HG_UTIL_FAIL;
+#endif
+
+    return HG_UTIL_SUCCESS;
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE void
+hg_thread_rwlock_release_wrlock(hg_thread_rwlock_t *rwlock) HG_LOCK_NO_THREAD_SAFETY_ANALYSIS
+{
+#ifdef _WIN32
+    ReleaseSRWLockExclusive(rwlock);
+#else
+    (void)pthread_rwlock_unlock(rwlock);
+#endif
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* MERCURY_THREAD_RWLOCK_H */
diff --git a/src/mercury/src/util/mercury_thread_spin.c b/src/mercury/src/util/mercury_thread_spin.c
new file mode 100644
index 00000000000..c96f9fb7aaf
--- /dev/null
+++ b/src/mercury/src/util/mercury_thread_spin.c
@@ -0,0 +1,55 @@
+/*
+ * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy,
+ * UChicago Argonne, LLC and The HDF Group.
+ * All rights reserved.
+ *
+ * The full copyright notice, including terms governing use, modification,
+ * and redistribution, is contained in the COPYING file that can be
+ * found at the root of the source code distribution tree.
+ */
+
+#include "mercury_thread_spin.h"
+
+#include "mercury_util_error.h"
+
+#include <string.h>
+
+/*---------------------------------------------------------------------------*/
+int
+hg_thread_spin_init(hg_thread_spin_t *lock)
+{
+    int ret = HG_UTIL_SUCCESS;
+
+#if defined(_WIN32)
+    *lock = 0;
+#elif defined(HG_UTIL_HAS_PTHREAD_SPINLOCK_T)
+    int rc = pthread_spin_init(lock, 0);
+    HG_UTIL_CHECK_ERROR(rc != 0, done, ret, HG_UTIL_FAIL, "pthread_spin_init() failed (%s)", strerror(rc));
+
+done:
+#else
+    ret = hg_thread_mutex_init_fast(lock);
+#endif
+
+    return ret;
+}
+
+/*---------------------------------------------------------------------------*/
+int
+hg_thread_spin_destroy(hg_thread_spin_t *lock)
+{
+    int ret = HG_UTIL_SUCCESS;
+
+#if defined(_WIN32)
+    (void)lock;
+#elif defined(HG_UTIL_HAS_PTHREAD_SPINLOCK_T)
+    int rc = pthread_spin_destroy(lock);
+    HG_UTIL_CHECK_ERROR(rc != 0, done, ret, HG_UTIL_FAIL, "pthread_spin_destroy() failed (%s)", strerror(rc));
+
+done:
+#else
+    ret = hg_thread_mutex_destroy(lock);
+#endif
+
+    return ret;
+}
diff --git a/src/mercury/src/util/mercury_thread_spin.h b/src/mercury/src/util/mercury_thread_spin.h
new file mode 100644
index 00000000000..36ce5f8ef32
--- /dev/null
+++ b/src/mercury/src/util/mercury_thread_spin.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy,
+ * UChicago Argonne, LLC and The HDF Group.
+ * All rights reserved.
+ *
+ * The full copyright notice, including terms governing use, modification,
+ * and redistribution, is contained in the COPYING file that can be
+ * found at the root of the source code distribution tree.
+ */
+
+#ifndef MERCURY_THREAD_SPIN_H
+#define MERCURY_THREAD_SPIN_H
+
+#include "mercury_util_config.h"
+
+#include "mercury_thread_annotation.h"
+
+#if defined(_WIN32)
+#include <windows.h>
+typedef volatile LONG hg_thread_spin_t;
+#elif defined(HG_UTIL_HAS_PTHREAD_SPINLOCK_T)
+#include <pthread.h>
+typedef pthread_spinlock_t HG_LOCK_CAPABILITY("spin") hg_thread_spin_t;
+#else
+/* Default to hg_thread_mutex_t if pthread_spinlock_t is not supported */
+#include "mercury_thread_mutex.h"
+typedef hg_thread_mutex_t HG_LOCK_CAPABILITY("mutex") hg_thread_spin_t;
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Initialize the spin lock.
+ *
+ * \param lock [IN/OUT]       pointer to lock object
+ *
+ * \return Non-negative on success or negative on failure
+ */
+HG_UTIL_PUBLIC int hg_thread_spin_init(hg_thread_spin_t *lock);
+
+/**
+ * Destroy the spin lock.
+ *
+ * \param lock [IN/OUT]       pointer to lock object
+ *
+ * \return Non-negative on success or negative on failure
+ */
+HG_UTIL_PUBLIC int hg_thread_spin_destroy(hg_thread_spin_t *lock);
+
+/**
+ * Lock the spin lock.
+ *
+ * \param lock [IN/OUT]       pointer to lock object
+ */
+static HG_UTIL_INLINE void hg_thread_spin_lock(hg_thread_spin_t *lock) HG_LOCK_ACQUIRE(*lock);
+
+/**
+ * Try locking the spin lock.
+ *
+ * \param lock [IN/OUT]       pointer to lock object
+ *
+ * \return Non-negative on success or negative on failure
+ */
+static HG_UTIL_INLINE int hg_thread_spin_try_lock(hg_thread_spin_t *lock)
+    HG_LOCK_TRY_ACQUIRE(HG_UTIL_SUCCESS, *lock);
+
+/**
+ * Unlock the spin lock.
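+ *
+ * Spin locks busy-wait and suit very short critical sections; a sketch
+ * guarding a hypothetical shared counter:
+ *
+ *     hg_thread_spin_lock(&lock);
+ *     counter++;                      (keep the held section short)
+ *     hg_thread_spin_unlock(&lock);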
+ *
+ * \param lock [IN/OUT]       pointer to lock object
+ */
+static HG_UTIL_INLINE void hg_thread_spin_unlock(hg_thread_spin_t *lock) HG_LOCK_RELEASE(*lock);
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE void
+hg_thread_spin_lock(hg_thread_spin_t *lock) HG_LOCK_NO_THREAD_SAFETY_ANALYSIS
+{
+#if defined(_WIN32)
+    while (InterlockedExchange(lock, EBUSY)) {
+        /* Don't lock while waiting */
+        while (*lock) {
+            YieldProcessor();
+
+            /* Compiler barrier. Prevent caching of *lock */
+            MemoryBarrier();
+        }
+    }
+#elif defined(HG_UTIL_HAS_PTHREAD_SPINLOCK_T)
+    (void)pthread_spin_lock(lock);
+#else
+    hg_thread_mutex_lock(lock);
+#endif
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE int
+hg_thread_spin_try_lock(hg_thread_spin_t *lock) HG_LOCK_NO_THREAD_SAFETY_ANALYSIS
+{
+#if defined(_WIN32)
+    return InterlockedExchange(lock, EBUSY);
+#elif defined(HG_UTIL_HAS_PTHREAD_SPINLOCK_T)
+    if (pthread_spin_trylock(lock))
+        return HG_UTIL_FAIL;
+
+    return HG_UTIL_SUCCESS;
+#else
+    return hg_thread_mutex_try_lock(lock);
+#endif
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE void
+hg_thread_spin_unlock(hg_thread_spin_t *lock) HG_LOCK_NO_THREAD_SAFETY_ANALYSIS
+{
+#if defined(_WIN32)
+    /* Compiler barrier. The store below acts with release semantics */
+    MemoryBarrier();
+    *lock = 0;
+#elif defined(HG_UTIL_HAS_PTHREAD_SPINLOCK_T)
+    (void)pthread_spin_unlock(lock);
+#else
+    hg_thread_mutex_unlock(lock);
+#endif
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* MERCURY_THREAD_SPIN_H */
diff --git a/src/mercury/src/util/mercury_time.h b/src/mercury/src/util/mercury_time.h
new file mode 100644
index 00000000000..f158638342c
--- /dev/null
+++ b/src/mercury/src/util/mercury_time.h
@@ -0,0 +1,503 @@
+/*
+ * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy,
+ * UChicago Argonne, LLC and The HDF Group.
+ * All rights reserved.
+ *
+ * The full copyright notice, including terms governing use, modification,
+ * and redistribution, is contained in the COPYING file that can be
+ * found at the root of the source code distribution tree.
+ */
+
+#ifndef MERCURY_TIME_H
+#define MERCURY_TIME_H
+
+#include "mercury_util_config.h"
+
+#if defined(_WIN32)
+#include <windows.h>
+#elif defined(HG_UTIL_HAS_TIME_H) && defined(HG_UTIL_HAS_CLOCK_GETTIME)
+#include <time.h>
+#elif defined(__APPLE__) && defined(HG_UTIL_HAS_SYSTIME_H)
+#include <mach/mach_time.h>
+#include <sys/time.h>
+#else
+#include <stdio.h>
+#include <unistd.h>
+#if defined(HG_UTIL_HAS_SYSTIME_H)
+#include <sys/time.h>
+#else
+#error "Not supported on this platform."
+#endif
+#endif
+
+/*************************************/
+/* Public Type and Struct Definition */
+/*************************************/
+
+#if defined(HG_UTIL_HAS_TIME_H) && defined(HG_UTIL_HAS_CLOCK_GETTIME)
+typedef struct timespec hg_time_t;
+#else
+typedef struct hg_time hg_time_t;
+
+struct hg_time {
+    long tv_sec;
+    long tv_usec;
+};
+#endif
+
+/*****************/
+/* Public Macros */
+/*****************/
+
+/*********************/
+/* Public Prototypes */
+/*********************/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Get an elapsed time on the calling processor.
+ *
+ * \param tv [OUT]            pointer to returned time structure
+ *
+ * \return Non-negative on success or negative on failure
+ */
+static HG_UTIL_INLINE int hg_time_get_current(hg_time_t *tv);
+
+/**
+ * Get an elapsed time on the calling processor (resolution is ms).
+ * + * \param tv [OUT] pointer to returned time structure + * + * \return Non-negative on success or negative on failure + */ +static HG_UTIL_INLINE int hg_time_get_current_ms(hg_time_t *tv); + +/** + * Convert hg_time_t to double. + * + * \param tv [IN] time structure + * + * \return Converted time in seconds + */ +static HG_UTIL_INLINE double hg_time_to_double(hg_time_t tv); + +/** + * Convert double to hg_time_t. + * + * \param d [IN] time in seconds + * + * \return Converted time structure + */ +static HG_UTIL_INLINE hg_time_t hg_time_from_double(double d); + +/** + * Convert (integer) milliseconds to hg_time_t. + * + * \param ms [IN] time in milliseconds + * + * \return Converted time structure + */ +static HG_UTIL_INLINE hg_time_t hg_time_from_ms(unsigned int ms); + +/** + * Convert hg_time_t to (integer) milliseconds. + * + * \param tv [IN] time structure + * + * \return Time in milliseconds + */ +static HG_UTIL_INLINE unsigned int hg_time_to_ms(hg_time_t tv); + +/** + * Compare time values. + * + * \param in1 [IN] time structure + * \param in2 [IN] time structure + * + * \return 1 if in1 < in2, 0 otherwise + */ +static HG_UTIL_INLINE int hg_time_less(hg_time_t in1, hg_time_t in2); + +/** + * Diff time values and return the number of seconds elapsed between + * time \in2 and time \in1. + * + * \param in2 [IN] time structure + * \param in1 [IN] time structure + * + * \return Subtracted time + */ +static HG_UTIL_INLINE double hg_time_diff(hg_time_t in2, hg_time_t in1); + +/** + * Add time values. + * + * \param in1 [IN] time structure + * \param in2 [IN] time structure + * + * \return Summed time structure + */ +static HG_UTIL_INLINE hg_time_t hg_time_add(hg_time_t in1, hg_time_t in2); + +/** + * Subtract time values. + * + * \param in1 [IN] time structure + * \param in2 [IN] time structure + * + * \return Subtracted time structure + */ +static HG_UTIL_INLINE hg_time_t hg_time_subtract(hg_time_t in1, hg_time_t in2); + +/** + * Sleep until the time specified in rqt has elapsed. + * + * \param reqt [IN] time structure + * + * \return Non-negative on success or negative on failure + */ +static HG_UTIL_INLINE int hg_time_sleep(const hg_time_t rqt); + +/** + * Get a string containing current time/date stamp. 
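+ *
+ * For reference, measuring an interval with this header, as a sketch:
+ *
+ *     hg_time_t t1, t2;
+ *     hg_time_get_current(&t1);
+ *     (timed section)
+ *     hg_time_get_current(&t2);
+ *     printf("elapsed: %f s\n", hg_time_diff(t2, t1));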
+ *
+ * \return Valid string or NULL on failure
+ */
+static HG_UTIL_INLINE char *hg_time_stamp(void);
+
+/*---------------------------------------------------------------------------*/
+#ifdef _WIN32
+static HG_UTIL_INLINE LARGE_INTEGER
+get_FILETIME_offset(void)
+{
+    SYSTEMTIME    s;
+    FILETIME      f;
+    LARGE_INTEGER t;
+
+    s.wYear         = 1970;
+    s.wMonth        = 1;
+    s.wDay          = 1;
+    s.wHour         = 0;
+    s.wMinute       = 0;
+    s.wSecond       = 0;
+    s.wMilliseconds = 0;
+    SystemTimeToFileTime(&s, &f);
+    t.QuadPart = f.dwHighDateTime;
+    t.QuadPart <<= 32;
+    t.QuadPart |= f.dwLowDateTime;
+
+    return t;
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE int
+hg_time_get_current(hg_time_t *tv)
+{
+    LARGE_INTEGER        t;
+    FILETIME             f;
+    double               t_usec;
+    static LARGE_INTEGER offset;
+    static double        freq_to_usec;
+    static int           initialized      = 0;
+    static BOOL          use_perf_counter = 0;
+
+    if (!initialized) {
+        LARGE_INTEGER perf_freq;
+        initialized      = 1;
+        use_perf_counter = QueryPerformanceFrequency(&perf_freq);
+        if (use_perf_counter) {
+            QueryPerformanceCounter(&offset);
+            freq_to_usec = (double)perf_freq.QuadPart / 1000000.;
+        }
+        else {
+            offset       = get_FILETIME_offset();
+            freq_to_usec = 10.;
+        }
+    }
+    if (use_perf_counter) {
+        QueryPerformanceCounter(&t);
+    }
+    else {
+        GetSystemTimeAsFileTime(&f);
+        t.QuadPart = f.dwHighDateTime;
+        t.QuadPart <<= 32;
+        t.QuadPart |= f.dwLowDateTime;
+    }
+
+    t.QuadPart -= offset.QuadPart;
+    t_usec      = (double)t.QuadPart / freq_to_usec;
+    t.QuadPart  = t_usec;
+    tv->tv_sec  = t.QuadPart / 1000000;
+    tv->tv_usec = t.QuadPart % 1000000;
+
+    return HG_UTIL_SUCCESS;
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE int
+hg_time_get_current_ms(hg_time_t *tv)
+{
+    return hg_time_get_current(tv);
+}
+
+/*---------------------------------------------------------------------------*/
+#elif defined(HG_UTIL_HAS_TIME_H) && defined(HG_UTIL_HAS_CLOCK_GETTIME)
+static HG_UTIL_INLINE int
+hg_time_get_current(hg_time_t *tv)
+{
+    clock_gettime(CLOCK_MONOTONIC, tv);
+
+    return HG_UTIL_SUCCESS;
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE int
+hg_time_get_current_ms(hg_time_t *tv)
+{
+/* ppc/32 and ppc/64 do not support CLOCK_MONOTONIC_COARSE in vdso */
+#if defined(__ppc64__) || defined(__ppc__) || defined(__PPC64__) || defined(__PPC__) ||                      \
+    !defined(HG_UTIL_HAS_CLOCK_MONOTONIC_COARSE)
+    clock_gettime(CLOCK_MONOTONIC, tv);
+#else
+    /* We don't need fine grain time stamps, _COARSE resolution is 1ms */
+    clock_gettime(CLOCK_MONOTONIC_COARSE, tv);
+#endif
+    return HG_UTIL_SUCCESS;
+}
+
+/*---------------------------------------------------------------------------*/
+#elif defined(__APPLE__) && defined(HG_UTIL_HAS_SYSTIME_H)
+static HG_UTIL_INLINE int
+hg_time_get_current(hg_time_t *tv)
+{
+    static uint64_t monotonic_timebase_factor = 0;
+    uint64_t        monotonic_nsec;
+
+    if (monotonic_timebase_factor == 0) {
+        mach_timebase_info_data_t timebase_info;
+
+        (void)mach_timebase_info(&timebase_info);
+        monotonic_timebase_factor = timebase_info.numer / timebase_info.denom;
+    }
+    monotonic_nsec = (mach_absolute_time() * monotonic_timebase_factor);
+    tv->tv_sec     = (long)(monotonic_nsec / 1000000000);
+    tv->tv_usec    = (long)((monotonic_nsec - (uint64_t)tv->tv_sec * 1000000000) / 1000);
+
+    return HG_UTIL_SUCCESS;
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE int
+hg_time_get_current_ms(hg_time_t *tv)
+{
+    return
hg_time_get_current(tv); +} + +#else +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE int +hg_time_get_current(hg_time_t *tv) +{ + gettimeofday((struct timeval *)tv, NULL); + + return HG_UTIL_SUCCESS; +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE int +hg_time_get_current_ms(hg_time_t *tv) +{ + return hg_time_get_current(tv); +} + +#endif +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE double +hg_time_to_double(hg_time_t tv) +{ +#if defined(HG_UTIL_HAS_TIME_H) && defined(HG_UTIL_HAS_CLOCK_GETTIME) + return (double)tv.tv_sec + (double)(tv.tv_nsec) * 0.000000001; +#else + return (double)tv.tv_sec + (double)(tv.tv_usec) * 0.000001; +#endif +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE hg_time_t +hg_time_from_double(double d) +{ + hg_time_t tv; + + tv.tv_sec = (long)d; +#if defined(HG_UTIL_HAS_TIME_H) && defined(HG_UTIL_HAS_CLOCK_GETTIME) + tv.tv_nsec = (long)((d - (double)(tv.tv_sec)) * 1000000000); +#else + tv.tv_usec = (long)((d - (double)(tv.tv_sec)) * 1000000); +#endif + + return tv; +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE unsigned int +hg_time_to_ms(hg_time_t tv) +{ +#if defined(HG_UTIL_HAS_TIME_H) && defined(HG_UTIL_HAS_CLOCK_GETTIME) + return (unsigned int)(tv.tv_sec * 1000 + tv.tv_nsec / 1000000); +#else + return (unsigned int)(tv.tv_sec * 1000 + tv.tv_usec / 1000); +#endif +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE hg_time_t +hg_time_from_ms(unsigned int ms) +{ +#if defined(HG_UTIL_HAS_TIME_H) && defined(HG_UTIL_HAS_CLOCK_GETTIME) + return (hg_time_t){.tv_sec = ms / 1000, .tv_nsec = (ms - (ms / 1000) * 1000) * 1000000}; +#else + return (hg_time_t){.tv_sec = ms / 1000, .tv_usec = (ms - (ms / 1000) * 1000) * 1000}; +#endif +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE int +hg_time_less(hg_time_t in1, hg_time_t in2) +{ + return ((in1.tv_sec < in2.tv_sec) || ((in1.tv_sec == in2.tv_sec) && +#if defined(HG_UTIL_HAS_TIME_H) && defined(HG_UTIL_HAS_CLOCK_GETTIME) + (in1.tv_nsec < in2.tv_nsec))); +#else + (in1.tv_usec < in2.tv_usec))); +#endif +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE double +hg_time_diff(hg_time_t in2, hg_time_t in1) +{ +#if defined(HG_UTIL_HAS_TIME_H) && defined(HG_UTIL_HAS_CLOCK_GETTIME) + return ((double)in2.tv_sec + (double)(in2.tv_nsec) * 0.000000001) - + ((double)in1.tv_sec + (double)(in1.tv_nsec) * 0.000000001); +#else + return ((double)in2.tv_sec + (double)(in2.tv_usec) * 0.000001) - + ((double)in1.tv_sec + (double)(in1.tv_usec) * 0.000001); +#endif +} + +/*---------------------------------------------------------------------------*/ +static HG_UTIL_INLINE hg_time_t +hg_time_add(hg_time_t in1, hg_time_t in2) +{ + hg_time_t out; + + out.tv_sec = in1.tv_sec + in2.tv_sec; +#if defined(HG_UTIL_HAS_TIME_H) && defined(HG_UTIL_HAS_CLOCK_GETTIME) + out.tv_nsec = in1.tv_nsec + in2.tv_nsec; + if (out.tv_nsec > 1000000000) { + out.tv_nsec -= 1000000000; + out.tv_sec += 1; + } +#else + out.tv_usec = in1.tv_usec + in2.tv_usec; + if (out.tv_usec > 1000000) { + out.tv_usec -= 1000000; + out.tv_sec += 1; + } +#endif + + return out; +} + 
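+/* Example (sketch): hg_time_add() normalizes the sub-second field, so adding
+ * a millisecond-derived timeout to a time previously filled in by
+ * hg_time_get_current() yields a valid hg_time_t:
+ *
+ *     hg_time_t deadline = hg_time_add(now, hg_time_from_ms(1500));
+ */
+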
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE hg_time_t
+hg_time_subtract(hg_time_t in1, hg_time_t in2)
+{
+    hg_time_t out;
+
+    out.tv_sec = in1.tv_sec - in2.tv_sec;
+#if defined(HG_UTIL_HAS_TIME_H) && defined(HG_UTIL_HAS_CLOCK_GETTIME)
+    out.tv_nsec = in1.tv_nsec - in2.tv_nsec;
+    if (out.tv_nsec < 0) {
+        out.tv_nsec += 1000000000;
+        out.tv_sec -= 1;
+    }
+#else
+    out.tv_usec = in1.tv_usec - in2.tv_usec;
+    if (out.tv_usec < 0) {
+        out.tv_usec += 1000000;
+        out.tv_sec -= 1;
+    }
+#endif
+
+    return out;
+}
+
+/*---------------------------------------------------------------------------*/
+static HG_UTIL_INLINE int
+hg_time_sleep(const hg_time_t rqt)
+{
+#ifdef _WIN32
+    DWORD dwMilliseconds = (DWORD)(hg_time_to_double(rqt) * 1000);
+
+    Sleep(dwMilliseconds);
+#elif defined(HG_UTIL_HAS_TIME_H) && defined(HG_UTIL_HAS_CLOCK_GETTIME)
+    if (nanosleep(&rqt, NULL))
+        return HG_UTIL_FAIL;
+#else
+    useconds_t usec = (useconds_t)rqt.tv_sec * 1000000 + (useconds_t)rqt.tv_usec;
+
+    if (usleep(usec))
+        return HG_UTIL_FAIL;
+#endif
+
+    return HG_UTIL_SUCCESS;
+}
+
+/*---------------------------------------------------------------------------*/
+#define HG_UTIL_STAMP_MAX 128
+static HG_UTIL_INLINE char *
+hg_time_stamp(void)
+{
+    static char buf[HG_UTIL_STAMP_MAX] = {'\0'};
+
+#if defined(_WIN32)
+    /* TODO not implemented */
+#elif defined(HG_UTIL_HAS_TIME_H) && defined(HG_UTIL_HAS_CLOCK_GETTIME)
+    struct tm *local_time;
+    time_t     t;
+
+    t = time(NULL);
+    local_time = localtime(&t);
+    if (local_time == NULL)
+        return NULL;
+
+    if (strftime(buf, HG_UTIL_STAMP_MAX, "%a, %d %b %Y %T %Z", local_time) == 0)
+        return NULL;
+#else
+    struct timeval  tv;
+    struct timezone tz;
+    unsigned long   days, hours, minutes, seconds;
+
+    gettimeofday(&tv, &tz);
+    days = (unsigned long)tv.tv_sec / (3600 * 24);
+    hours = ((unsigned long)tv.tv_sec - days * 24 * 3600) / 3600;
+    minutes = ((unsigned long)tv.tv_sec - days * 24 * 3600 - hours * 3600) / 60;
+    seconds = (unsigned long)tv.tv_sec - days * 24 * 3600 - hours * 3600 - minutes * 60;
+    hours -= (unsigned long)tz.tz_minuteswest / 60;
+
+    snprintf(buf, HG_UTIL_STAMP_MAX, "%02lu:%02lu:%02lu (GMT-%d)", hours, minutes, seconds,
+             tz.tz_minuteswest / 60);
+#endif
+
+    return buf;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* MERCURY_TIME_H */
diff --git a/src/mercury/src/util/mercury_util.c b/src/mercury/src/util/mercury_util.c
new file mode 100644
index 00000000000..ced8979d05e
--- /dev/null
+++ b/src/mercury/src/util/mercury_util.c
@@ -0,0 +1,39 @@
+/*
+ * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy,
+ *    UChicago Argonne, LLC and The HDF Group.
+ * All rights reserved.
+ *
+ * The full copyright notice, including terms governing use, modification,
+ * and redistribution, is contained in the COPYING file that can be
+ * found at the root of the source code distribution tree.
+ */ + +#include "mercury_util.h" + +#include "mercury_util_error.h" + +#include +#include + +/****************/ +/* Local Macros */ +/****************/ + +/* Name of this subsystem */ +#define HG_UTIL_SUBSYS_NAME hg_util +#define HG_UTIL_STRINGIFY1(x) HG_UTIL_STRINGIFY(x) +#define HG_UTIL_SUBSYS_NAME_STRING HG_UTIL_STRINGIFY1(HG_UTIL_SUBSYS_NAME) + +/*******************/ +/* Local Variables */ +/*******************/ + +/* Default error log mask */ +HG_LOG_SUBSYS_DECL_REGISTER(HG_UTIL_SUBSYS_NAME, hg); + +/*---------------------------------------------------------------------------*/ +void +HG_Util_set_log_level(const char *level) +{ + hg_log_set_subsys_level(HG_UTIL_SUBSYS_NAME_STRING, hg_log_name_to_level(level)); +} diff --git a/src/mercury/src/util/mercury_util.h b/src/mercury/src/util/mercury_util.h new file mode 100644 index 00000000000..1e36e266049 --- /dev/null +++ b/src/mercury/src/util/mercury_util.h @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy, + * UChicago Argonne, LLC and The HDF Group. + * All rights reserved. + * + * The full copyright notice, including terms governing use, modification, + * and redistribution, is contained in the COPYING file that can be + * found at the root of the source code distribution tree. + */ + +#ifndef MERCURY_UTIL_LOG_H +#define MERCURY_UTIL_LOG_H + +#include "mercury_util_config.h" + +/*************************************/ +/* Public Type and Struct Definition */ +/*************************************/ + +/*****************/ +/* Public Macros */ +/*****************/ + +/*********************/ +/* Public Prototypes */ +/*********************/ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Set the log level for HG util. That setting is valid for all HG classes. + * + * \param level [IN] level string, valid values are: + * "none", "error", "warning", "debug" + */ +HG_UTIL_PUBLIC void HG_Util_set_log_level(const char *level); + +#ifdef __cplusplus +} +#endif + +#endif /* MERCURY_UTIL_LOG_H */ diff --git a/src/mercury/src/util/mercury_util_config.h b/src/mercury/src/util/mercury_util_config.h new file mode 100644 index 00000000000..8237b4df409 --- /dev/null +++ b/src/mercury/src/util/mercury_util_config.h @@ -0,0 +1,154 @@ +/* + * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy, + * UChicago Argonne, LLC and The HDF Group. + * All rights reserved. + * + * The full copyright notice, including terms governing use, modification, + * and redistribution, is contained in the COPYING file that can be + * found at the root of the source code distribution tree. + */ + +/* Generated file. Only edit mercury_util_config.h.in. 
*/
+
+#ifndef MERCURY_UTIL_CONFIG_H
+#define MERCURY_UTIL_CONFIG_H
+
+/*************************************/
+/* Public Type and Struct Definition */
+/*************************************/
+
+/* Type definitions */
+#ifdef _WIN32
+typedef signed __int64 hg_util_int64_t;
+typedef signed __int32 hg_util_int32_t;
+typedef signed __int16 hg_util_int16_t;
+typedef signed __int8 hg_util_int8_t;
+typedef unsigned __int64 hg_util_uint64_t;
+typedef unsigned __int32 hg_util_uint32_t;
+typedef unsigned __int16 hg_util_uint16_t;
+typedef unsigned __int8 hg_util_uint8_t;
+#else
+#include <stdint.h>
+#include <stddef.h>
+typedef int64_t hg_util_int64_t;
+typedef int32_t hg_util_int32_t;
+typedef int16_t hg_util_int16_t;
+typedef int8_t hg_util_int8_t;
+typedef uint64_t hg_util_uint64_t;
+typedef uint32_t hg_util_uint32_t;
+typedef uint16_t hg_util_uint16_t;
+typedef uint8_t hg_util_uint8_t;
+#endif
+typedef hg_util_uint8_t hg_util_bool_t;
+typedef hg_util_uint64_t hg_util_ptr_t;
+
+/* True / false */
+#define HG_UTIL_TRUE 1
+#define HG_UTIL_FALSE 0
+
+/* Return codes */
+#define HG_UTIL_SUCCESS 0
+#define HG_UTIL_FAIL -1
+
+/*****************/
+/* Public Macros */
+/*****************/
+
+/* Visibility of symbols */
+#if defined(_WIN32)
+#define HG_UTIL_ABI_IMPORT __declspec(dllimport)
+#define HG_UTIL_ABI_EXPORT __declspec(dllexport)
+#define HG_UTIL_ABI_HIDDEN
+#elif defined(__GNUC__) && (__GNUC__ >= 4)
+#define HG_UTIL_ABI_IMPORT __attribute__((visibility("default")))
+#define HG_UTIL_ABI_EXPORT __attribute__((visibility("default")))
+#define HG_UTIL_ABI_HIDDEN __attribute__((visibility("hidden")))
+#else
+#define HG_UTIL_ABI_IMPORT
+#define HG_UTIL_ABI_EXPORT
+#define HG_UTIL_ABI_HIDDEN
+#endif
+
+/* Inline macro */
+#ifdef _WIN32
+#define HG_UTIL_INLINE __inline
+#else
+#define HG_UTIL_INLINE __inline__
+#endif
+
+/* Check format arguments */
+#if defined(__GNUC__)
+#define HG_UTIL_PRINTF_LIKE(_fmt, _firstarg) __attribute__((format(printf, _fmt, _firstarg)))
+#else
+#define HG_UTIL_PRINTF_LIKE(_fmt, _firstarg)
+#endif
+
+/* Shared libraries */
+/* #undef HG_UTIL_BUILD_SHARED_LIBS */
+#ifdef HG_UTIL_BUILD_SHARED_LIBS
+#ifdef mercury_util_EXPORTS
+#define HG_UTIL_PUBLIC HG_UTIL_ABI_EXPORT
+#else
+#define HG_UTIL_PUBLIC HG_UTIL_ABI_IMPORT
+#endif
+#define HG_UTIL_PRIVATE HG_UTIL_ABI_HIDDEN
+#else
+#define HG_UTIL_PUBLIC
+#define HG_UTIL_PRIVATE
+#endif
+
+/* Define if has __attribute__((constructor)) */
+#define HG_UTIL_HAS_ATTR_CONSTRUCTOR
+
+/* Define if has __attribute__((constructor(priority))) */
+#define HG_UTIL_HAS_ATTR_CONSTRUCTOR_PRIORITY
+
+/* Define if has 'clock_gettime()' */
+#define HG_UTIL_HAS_CLOCK_GETTIME
+
+/* Define if has CLOCK_MONOTONIC_COARSE */
+#define HG_UTIL_HAS_CLOCK_MONOTONIC_COARSE
+
+/* Define if has debug */
+/* #undef HG_UTIL_HAS_DEBUG */
+
+/* Define if has eventfd_t type */
+#define HG_UTIL_HAS_EVENTFD_T
+
+/* Define if has colored output */
+/* #undef HG_UTIL_HAS_LOG_COLOR */
+
+/* Define if has <opa_primitives.h> */
+/* #undef HG_UTIL_HAS_OPA_PRIMITIVES_H */
+
+/* Define if has 'pthread_condattr_setclock()' */
+#define HG_UTIL_HAS_PTHREAD_CONDATTR_SETCLOCK
+
+/* Define if has PTHREAD_MUTEX_ADAPTIVE_NP */
+#define HG_UTIL_HAS_PTHREAD_MUTEX_ADAPTIVE_NP
+
+/* Define if has pthread_spinlock_t type */
+#define HG_UTIL_HAS_PTHREAD_SPINLOCK_T
+
+/* Define if has <stdatomic.h> */
+#define HG_UTIL_HAS_STDATOMIC_H
+
+/* Define type size of atomic_long */
+#define HG_UTIL_ATOMIC_LONG_WIDTH 8
+
+/* Define if has <sys/epoll.h> */
+#define HG_UTIL_HAS_SYSEPOLL_H
+
+/* Define if has <sys/event.h> */
+/* #undef HG_UTIL_HAS_SYSEVENT_H */
+
+/* Define if has <sys/eventfd.h> */
+#define HG_UTIL_HAS_SYSEVENTFD_H
+
+/* Define if has <sys/time.h> */
+#define HG_UTIL_HAS_SYSTIME_H
+
+/* Define if has <time.h> */
+#define HG_UTIL_HAS_TIME_H
+
+#endif /* MERCURY_UTIL_CONFIG_H */
diff --git a/src/mercury/src/util/mercury_util_config.h.in b/src/mercury/src/util/mercury_util_config.h.in
new file mode 100644
index 00000000000..f3a04cd91b3
--- /dev/null
+++ b/src/mercury/src/util/mercury_util_config.h.in
@@ -0,0 +1,155 @@
+/*
+ * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy,
+ *    UChicago Argonne, LLC and The HDF Group.
+ * All rights reserved.
+ *
+ * The full copyright notice, including terms governing use, modification,
+ * and redistribution, is contained in the COPYING file that can be
+ * found at the root of the source code distribution tree.
+ */
+
+/* Generated file. Only edit mercury_util_config.h.in. */
+
+#ifndef MERCURY_UTIL_CONFIG_H
+#define MERCURY_UTIL_CONFIG_H
+
+/*************************************/
+/* Public Type and Struct Definition */
+/*************************************/
+
+/* Type definitions */
+#ifdef _WIN32
+typedef signed __int64 hg_util_int64_t;
+typedef signed __int32 hg_util_int32_t;
+typedef signed __int16 hg_util_int16_t;
+typedef signed __int8 hg_util_int8_t;
+typedef unsigned __int64 hg_util_uint64_t;
+typedef unsigned __int32 hg_util_uint32_t;
+typedef unsigned __int16 hg_util_uint16_t;
+typedef unsigned __int8 hg_util_uint8_t;
+#else
+#    include <stdint.h>
+#    include <stddef.h>
+typedef int64_t hg_util_int64_t;
+typedef int32_t hg_util_int32_t;
+typedef int16_t hg_util_int16_t;
+typedef int8_t hg_util_int8_t;
+typedef uint64_t hg_util_uint64_t;
+typedef uint32_t hg_util_uint32_t;
+typedef uint16_t hg_util_uint16_t;
+typedef uint8_t hg_util_uint8_t;
+#endif
+typedef hg_util_uint8_t hg_util_bool_t;
+typedef hg_util_uint64_t hg_util_ptr_t;
+
+/* True / false */
+#define HG_UTIL_TRUE 1
+#define HG_UTIL_FALSE 0
+
+/* Return codes */
+#define HG_UTIL_SUCCESS 0
+#define HG_UTIL_FAIL -1
+
+/*****************/
+/* Public Macros */
+/*****************/
+
+/* Visibility of symbols */
+#if defined(_WIN32)
+#    define HG_UTIL_ABI_IMPORT __declspec(dllimport)
+#    define HG_UTIL_ABI_EXPORT __declspec(dllexport)
+#    define HG_UTIL_ABI_HIDDEN
+#elif defined(__GNUC__) && (__GNUC__ >= 4)
+#    define HG_UTIL_ABI_IMPORT __attribute__((visibility("default")))
+#    define HG_UTIL_ABI_EXPORT __attribute__((visibility("default")))
+#    define HG_UTIL_ABI_HIDDEN __attribute__((visibility("hidden")))
+#else
+#    define HG_UTIL_ABI_IMPORT
+#    define HG_UTIL_ABI_EXPORT
+#    define HG_UTIL_ABI_HIDDEN
+#endif
+
+/* Inline macro */
+#ifdef _WIN32
+#    define HG_UTIL_INLINE __inline
+#else
+#    define HG_UTIL_INLINE __inline__
+#endif
+
+/* Check format arguments */
+#if defined(__GNUC__)
+#    define HG_UTIL_PRINTF_LIKE(_fmt, _firstarg) \
+        __attribute__((format(printf, _fmt, _firstarg)))
+#else
+#    define HG_UTIL_PRINTF_LIKE(_fmt, _firstarg)
+#endif
+
+/* Shared libraries */
+#cmakedefine HG_UTIL_BUILD_SHARED_LIBS
+#ifdef HG_UTIL_BUILD_SHARED_LIBS
+#    ifdef mercury_util_EXPORTS
+#        define HG_UTIL_PUBLIC HG_UTIL_ABI_EXPORT
+#    else
+#        define HG_UTIL_PUBLIC HG_UTIL_ABI_IMPORT
+#    endif
+#    define HG_UTIL_PRIVATE HG_UTIL_ABI_HIDDEN
+#else
+#    define HG_UTIL_PUBLIC
+#    define HG_UTIL_PRIVATE
+#endif
+
+/* Define if has __attribute__((constructor)) */
+#cmakedefine HG_UTIL_HAS_ATTR_CONSTRUCTOR
+
+/* Define if has __attribute__((constructor(priority))) */
+#cmakedefine HG_UTIL_HAS_ATTR_CONSTRUCTOR_PRIORITY
+
+/* Define if has 'clock_gettime()' */
+#cmakedefine HG_UTIL_HAS_CLOCK_GETTIME
+
+/* Define if has CLOCK_MONOTONIC_COARSE */
+#cmakedefine HG_UTIL_HAS_CLOCK_MONOTONIC_COARSE
+
+/* Define if has debug */
+#cmakedefine HG_UTIL_HAS_DEBUG
+
+/* Define if has eventfd_t type */
+#cmakedefine HG_UTIL_HAS_EVENTFD_T
+
+/* Define if has colored output */
+#cmakedefine HG_UTIL_HAS_LOG_COLOR
+
+/* Define if has <opa_primitives.h> */
+#cmakedefine HG_UTIL_HAS_OPA_PRIMITIVES_H
+
+/* Define if has 'pthread_condattr_setclock()' */
+#cmakedefine HG_UTIL_HAS_PTHREAD_CONDATTR_SETCLOCK
+
+/* Define if has PTHREAD_MUTEX_ADAPTIVE_NP */
+#cmakedefine HG_UTIL_HAS_PTHREAD_MUTEX_ADAPTIVE_NP
+
+/* Define if has pthread_spinlock_t type */
+#cmakedefine HG_UTIL_HAS_PTHREAD_SPINLOCK_T
+
+/* Define if has <stdatomic.h> */
+#cmakedefine HG_UTIL_HAS_STDATOMIC_H
+
+/* Define type size of atomic_long */
+#cmakedefine HG_UTIL_ATOMIC_LONG_WIDTH @HG_UTIL_ATOMIC_LONG_WIDTH@
+
+/* Define if has <sys/epoll.h> */
+#cmakedefine HG_UTIL_HAS_SYSEPOLL_H
+
+/* Define if has <sys/event.h> */
+#cmakedefine HG_UTIL_HAS_SYSEVENT_H
+
+/* Define if has <sys/eventfd.h> */
+#cmakedefine HG_UTIL_HAS_SYSEVENTFD_H
+
+/* Define if has <sys/time.h> */
+#cmakedefine HG_UTIL_HAS_SYSTIME_H
+
+/* Define if has <time.h> */
+#cmakedefine HG_UTIL_HAS_TIME_H
+
+#endif /* MERCURY_UTIL_CONFIG_H */
diff --git a/src/mercury/src/util/mercury_util_error.h b/src/mercury/src/util/mercury_util_error.h
new file mode 100644
index 00000000000..bcf51b70504
--- /dev/null
+++ b/src/mercury/src/util/mercury_util_error.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (C) 2013-2020 Argonne National Laboratory, Department of Energy,
+ *    UChicago Argonne, LLC and The HDF Group.
+ * All rights reserved.
+ *
+ * The full copyright notice, including terms governing use, modification,
+ * and redistribution, is contained in the COPYING file that can be
+ * found at the root of the source code distribution tree.
+ */
+
+#ifndef MERCURY_UTIL_ERROR_H
+#define MERCURY_UTIL_ERROR_H
+
+#include "mercury_util_config.h"
+
+/* Default error macro */
+#include <mercury_log.h>
+extern HG_UTIL_PRIVATE HG_LOG_OUTLET_DECL(hg_util);
+#define HG_UTIL_LOG_ERROR(...)   HG_LOG_WRITE(hg_util, HG_LOG_LEVEL_ERROR, __VA_ARGS__)
+#define HG_UTIL_LOG_WARNING(...) HG_LOG_WRITE(hg_util, HG_LOG_LEVEL_WARNING, __VA_ARGS__)
+#ifdef HG_UTIL_HAS_DEBUG
+#define HG_UTIL_LOG_DEBUG(...) HG_LOG_WRITE(hg_util, HG_LOG_LEVEL_DEBUG, __VA_ARGS__)
+#else
+#define HG_UTIL_LOG_DEBUG(...) (void)0
+#endif
+
+/* Branch predictor hints */
+#ifndef _WIN32
+#define likely(x)   __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
+#else
+#define likely(x)   (x)
+#define unlikely(x) (x)
+#endif
+
+/* Error macros */
+#define HG_UTIL_GOTO_DONE(label, ret, ret_val)                          \
+    do {                                                                \
+        ret = ret_val;                                                  \
+        goto label;                                                     \
+    } while (0)
+
+#define HG_UTIL_GOTO_ERROR(label, ret, err_val, ...)                    \
+    do {                                                                \
+        HG_UTIL_LOG_ERROR(__VA_ARGS__);                                 \
+        ret = err_val;                                                  \
+        goto label;                                                     \
+    } while (0)
+
+/* Check for cond, set ret to err_val and goto label */
+#define HG_UTIL_CHECK_ERROR(cond, label, ret, err_val, ...)             \
+    do {                                                                \
+        if (unlikely(cond)) {                                           \
+            HG_UTIL_LOG_ERROR(__VA_ARGS__);                             \
+            ret = err_val;                                              \
+            goto label;                                                 \
+        }                                                               \
+    } while (0)
+
+#define HG_UTIL_CHECK_ERROR_NORET(cond, label, ...)                     \
+    do {                                                                \
+        if (unlikely(cond)) {                                           \
+            HG_UTIL_LOG_ERROR(__VA_ARGS__);                             \
+            goto label;                                                 \
+        }                                                               \
+    } while (0)
+
+#define HG_UTIL_CHECK_ERROR_DONE(cond, ...)                             \
+    do {                                                                \
+        if (unlikely(cond)) {                                           \
+            HG_UTIL_LOG_ERROR(__VA_ARGS__);                             \
+        }                                                               \
+    } while (0)
+
+/* Check for cond and print warning */
+#define HG_UTIL_CHECK_WARNING(cond, ...)                                \
+    do {                                                                \
+        if (unlikely(cond)) {                                           \
+            HG_UTIL_LOG_WARNING(__VA_ARGS__);                           \
+        }                                                               \
+    } while (0)
+
+#endif /* MERCURY_UTIL_ERROR_H */
diff --git a/src/mercury/version.txt b/src/mercury/version.txt
new file mode 100644
index 00000000000..0c271bcf956
--- /dev/null
+++ b/src/mercury/version.txt
@@ -0,0 +1 @@
+2.1.0rc1
diff --git a/test/AtomicWriterReader.txt b/test/AtomicWriterReader.txt
index dc0a3bdc6c8..064ba392129 100644
--- a/test/AtomicWriterReader.txt
+++ b/test/AtomicWriterReader.txt
@@ -11,7 +11,7 @@ atomic_reader.c: is the "read" part of the test.
 Building the Tests
 ------------------
-The two test parts are automically built during configure and make process.
+The two test parts are automatically built during the configure and make process.
 But to build them individually, you can do in test/ directory:
 $ gcc atomic_writer
 $ gcc atomic_reader
diff --git a/test/dsets.c b/test/dsets.c
index 88e3ce05ff8..c24d746a7e8 100644
--- a/test/dsets.c
+++ b/test/dsets.c
@@ -8845,7 +8845,7 @@ test_chunk_cache(hid_t fapl)
     if ((dsid = H5Dcreate2(fid, "dset", H5T_NATIVE_INT, sid, H5P_DEFAULT, dcpl, dapl1)) < 0)
         FAIL_STACK_ERROR
-    /* Retrieve dapl from dataset, verfiy cache values are the same as on fapl_local */
+    /* Retrieve dapl from dataset, verify cache values are the same as on fapl_local */
     if ((dapl2 = H5Dget_access_plist(dsid)) < 0)
         FAIL_STACK_ERROR
     if (H5Pget_chunk_cache(dapl2, &nslots_4, &nbytes_4, &w0_4) < 0)
@@ -8869,7 +8869,7 @@ test_chunk_cache(hid_t fapl)
     if ((dsid = H5Oopen(fid, "dset", dapl1)) < 0)
         FAIL_STACK_ERROR
-    /* Retrieve dapl from dataset, verfiy cache values are the same as on dapl1 */
+    /* Retrieve dapl from dataset, verify cache values are the same as on dapl1 */
     /* Note we rely on the knowledge that H5Pget_chunk_cache retrieves these
      * values directly from the dataset structure, and not from a copy of the
      * dapl used to open the dataset (which is not preserved).
@@ -8889,7 +8889,7 @@ test_chunk_cache(hid_t fapl)
     if ((dsid = H5Dopen2(fid, "dset", H5P_DEFAULT)) < 0)
         FAIL_STACK_ERROR
-    /* Retrieve dapl from dataset, verfiy cache values are the same on fapl_local */
+    /* Retrieve dapl from dataset, verify cache values are the same on fapl_local */
     if ((dapl2 = H5Dget_access_plist(dsid)) < 0)
         FAIL_STACK_ERROR
     if (H5Pget_chunk_cache(dapl2, &nslots_4, &nbytes_4, &w0_4) < 0)
diff --git a/test/page_buffer.c b/test/page_buffer.c
index 64e88fb35ff..558b4e99184 100644
--- a/test/page_buffer.c
+++ b/test/page_buffer.c
@@ -370,7 +370,7 @@ set_multi_split(const char *env_h5_drvr, hid_t fapl, hsize_t pagesize)
  * 1) verifying that API errors are caught.
  *
  * 2) verifying that the page buffer behaves more or less
- *    as advertized.
+ *    as advertised.
  *
  * Any data mis-matches or unexpected failures or successes
  * reported by the HDF5 library result in test failure.
diff --git a/test/swmr_reader.c b/test/swmr_reader.c
index e849f6750ce..db3eba0cf24 100644
--- a/test/swmr_reader.c
+++ b/test/swmr_reader.c
@@ -275,7 +275,7 @@ read_records(const char *filename, hbool_t verbose, FILE *verbose_file, unsigned
     if ((fapl = h5_fileaccess()) < 0)
         return -1;
-    /* Log I/O when verbose output it enbabled */
+    /* Log I/O when verbose output is enabled */
     if (use_log_vfd) {
         char verbose_name[1024];
diff --git a/test/test_usecases.sh.in b/test/test_usecases.sh.in
index da32827106e..1010792f2a0 100644
--- a/test/test_usecases.sh.in
+++ b/test/test_usecases.sh.in
@@ -43,6 +43,11 @@ if test -z "$testdir"; then
     testdir=.
 fi
+# If the testdir directory is not set just use current (.).
+if test -z "$testdir"; then + testdir=. +fi + # Check to see if the VFD specified by the HDF5_DRIVER environment variable # supports SWMR. $utils_testdir/swmr_check_compat_vfd diff --git a/test/testflushrefresh.sh.in b/test/testflushrefresh.sh.in index 37d1c7e5ec6..9306271cdd6 100644 --- a/test/testflushrefresh.sh.in +++ b/test/testflushrefresh.sh.in @@ -72,6 +72,16 @@ if test -z "$testdir"; then testdir=. fi +# If the bindir directory is not set just use current (.). +if test -z "$bindir"; then + bindir=. +fi + +# If the testdir directory is not set just use current (.). +if test -z "$testdir"; then + testdir=. +fi + # Check to see if the VFD specified by the HDF5_DRIVER environment variable # supports SWMR. $utils_testdir/swmr_check_compat_vfd diff --git a/test/testswmr.sh.in b/test/testswmr.sh.in index 67363490586..f02d3e69155 100644 --- a/test/testswmr.sh.in +++ b/test/testswmr.sh.in @@ -100,6 +100,11 @@ if test -z "$testdir"; then testdir=. fi +# If the testdir directory is not set just use current (.). +if test -z "$testdir"; then + testdir=. +fi + # Check to see if the VFD specified by the HDF5_DRIVER environment variable # supports SWMR. $utils_testdir/swmr_check_compat_vfd diff --git a/test/testvdsswmr.sh.in b/test/testvdsswmr.sh.in index c9aed136fda..d285cab1269 100644 --- a/test/testvdsswmr.sh.in +++ b/test/testvdsswmr.sh.in @@ -86,6 +86,11 @@ if test -z "$testdir"; then testdir=. fi +# If the testdir directory is not set just use current (.). +if test -z "$testdir"; then + testdir=. +fi + # Check to see if the VFD specified by the HDF5_DRIVER environment variable # supports SWMR. $utils_testdir/swmr_check_compat_vfd diff --git a/testpar/CMakeLists.txt b/testpar/CMakeLists.txt index ff4446ce974..bbe80303e26 100644 --- a/testpar/CMakeLists.txt +++ b/testpar/CMakeLists.txt @@ -88,6 +88,7 @@ set (H5P_TESTS t_init_term t_shapesame t_filters_parallel + t_subfiling_vfd t_2Gio ) diff --git a/testpar/Makefile.am b/testpar/Makefile.am index cbde0c1e680..fd4af4a4d17 100644 --- a/testpar/Makefile.am +++ b/testpar/Makefile.am @@ -30,7 +30,7 @@ check_SCRIPTS = $(TEST_SCRIPT_PARA) # Test programs. These are our main targets. # -TEST_PROG_PARA=t_mpi t_bigio testphdf5 t_cache t_cache_image t_pread t_pshutdown t_prestart t_init_term t_shapesame t_filters_parallel t_2Gio t_vfd +TEST_PROG_PARA=t_mpi t_bigio testphdf5 t_cache t_cache_image t_pread t_pshutdown t_prestart t_init_term t_shapesame t_filters_parallel t_2Gio t_vfd t_subfiling_vfd # t_pflush1 and t_pflush2 are used by testpflush.sh check_PROGRAMS = $(TEST_PROG_PARA) t_pflush1 t_pflush2 diff --git a/testpar/t_2Gio.c b/testpar/t_2Gio.c index 2be4ae401a9..5dbea085e27 100644 --- a/testpar/t_2Gio.c +++ b/testpar/t_2Gio.c @@ -3644,7 +3644,7 @@ test_actual_io_mode(int selection_mode) /* Set the threshold number of processes per chunk to twice mpi_size. * This will prevent the threshold from ever being met, thus forcing * multi chunk io instead of link chunk io. - * This is via deault. + * This is via default. 
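+     * (Editorial illustration, not part of the original comment: with
+     * mpi_size == 4 the threshold call below amounts to
+     *
+     *     H5Pset_dxpl_mpio_chunk_opt_num(xfer_plist, 8);
+     *
+     * so a chunk would need eight aggregating ranks to qualify for
+     * link-chunk I/O, which can never happen with only four ranks.)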
      */
     if (multi_chunk_io) {
         /* force multi-chunk-io by threshold */
diff --git a/testpar/t_cache.c b/testpar/t_cache.c
index 70ada014304..8559afb346a 100644
--- a/testpar/t_cache.c
+++ b/testpar/t_cache.c
@@ -6724,7 +6724,7 @@ smoke_check_6(int metadata_write_strategy)
             if (FALSE != entry_ptr->header.coll_access) {
                 nerrors++;
                 if (verbose) {
-                    HDfprintf(stdout, "%d:%s: Entry inserted indepedently marked as collective.\n",
+                    HDfprintf(stdout, "%d:%s: Entry inserted independently marked as collective.\n",
                               world_mpi_rank, __func__);
                 }
             }
@@ -6780,7 +6780,7 @@ smoke_check_6(int metadata_write_strategy)
             if (FALSE != entry_ptr->header.coll_access) {
                 nerrors++;
                 if (verbose) {
-                    HDfprintf(stdout, "%d:%s: Entry inserted indepedently marked as collective.\n",
+                    HDfprintf(stdout, "%d:%s: Entry inserted independently marked as collective.\n",
                               world_mpi_rank, __func__);
                 }
             }
diff --git a/testpar/t_dset.c b/testpar/t_dset.c
index dc5673a621a..cc950d8557a 100644
--- a/testpar/t_dset.c
+++ b/testpar/t_dset.c
@@ -3202,7 +3202,7 @@ test_actual_io_mode(int selection_mode)
     /* Set the threshold number of processes per chunk to twice mpi_size.
      * This will prevent the threshold from ever being met, thus forcing
      * multi chunk io instead of link chunk io.
-     * This is via deault.
+     * This is via default.
      */
     if (multi_chunk_io) {
         /* force multi-chunk-io by threshold */
diff --git a/testpar/t_subfiling_vfd.c b/testpar/t_subfiling_vfd.c
new file mode 100644
index 00000000000..4ac0b326bd0
--- /dev/null
+++ b/testpar/t_subfiling_vfd.c
@@ -0,0 +1,2750 @@
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ * Copyright by The HDF Group.                                               *
+ * All rights reserved.                                                      *
+ *                                                                           *
+ * This file is part of HDF5. The full HDF5 copyright notice, including     *
+ * terms governing use, modification, and redistribution, is contained in   *
+ * the COPYING file, which can be found at the root of the source code      *
+ * distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases. *
+ * If you do not have access to either file, you may request a copy from    *
+ * help@hdfgroup.org.                                                       *
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+/*
+ * Purpose:     Test the Subfiling VFD functionality.
+ */
+
+/* WARNING: The use of realpath() is probably system-dependent, as are
+ * other things here such as the socket calls.
+ * Of particular note for realpath() is its use of "PATH_MAX", which
+ * apparently has some major potential issues if paths are abused.
+ * http://insanecoding.blogspot.com/2007/11/pathmax-simply-isnt.html
+ * so BE CAREFUL about the paths we throw around?
+ */
+
+#include "h5test.h"
+#include "cache_common.h"
+#include "genall5.h"
+
+#include
+#define H5_HAVE_SUBFILING_VFD
+#ifdef H5_HAVE_SUBFILING_VFD
+
+#include "H5FDsubfiling.h" /* Private header for the subfiling VFD */
+#include "H5FDioc.h"
+
+#define BIG_DATABUFFER_SIZE 33554432
+/* #define BIG_DATABUFFER_SIZE 16777216 */
+/* #define BIG_DATABUFFER_SIZE 16000000 */
+#define DATABUFFER_SIZE 128
+#define DSET_NAME_LEN 16
+
+/* Parameters for the "large chunked dataset" writing */
+#define MAX_DSET_COUNT 255
+#define DSET_DIM 32
+#define CHUNK_DIM 8
+
+#define CONCURRENT_COUNT 3 /* Number of files in concurrent test */
+
+/* Macro: LOGPRINT()
+ * Prints logging and debugging messages to the output stream based
+ * on the level of verbosity.
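+ * For example (illustrative), LOGPRINT(2, "rank %d: opened %s\n",
+ * g_mpi_rank, filename) is emitted only when g_verbosity >= 2. Levels: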
+ *   0 : no logging
+ *   1 : errors only
+ *   2 : details
+ *   3 : all
+ */
+#define DEFAULT_VERBOSITY 1
+static unsigned int g_verbosity = DEFAULT_VERBOSITY;
+
+int g_mpi_size = -1;
+int g_mpi_rank = -1;
+
+/* Macro for selective debug printing / logging */
+#define LOGPRINT(lvl, ...)                                              \
+    do {                                                                \
+        if ((lvl) <= g_verbosity) {                                     \
+            fprintf(g_log_stream, __VA_ARGS__);                         \
+            fflush(g_log_stream);                                       \
+        }                                                               \
+    } while (0)
+
+#define SUBFILING_RW_DIR "subfiling_rw/"
+#define SUBFILING_WO_DIR "subfiling_wo/"
+
+/* String buffer for error messages */
+#define MIRR_MESG_SIZE 128
+static char mesg[MIRR_MESG_SIZE + 1];
+
+/* Convenience structure for passing file names via helper functions.
+ */
+struct subfilingtest_filenames {
+    char rw[H5FD_SPLITTER_PATH_MAX + 1];
+    char wo[H5FD_SPLITTER_PATH_MAX + 1];
+    char log[H5FD_SPLITTER_PATH_MAX + 1];
+};
+
+static FILE *g_log_stream = NULL; /* initialized at runtime */
+
+static herr_t _verify_datasets(unsigned min_dset, unsigned max_dset, hid_t *filespace_id, hid_t *dataset_id,
+                               hid_t memspace_id);
+
+static herr_t _create_chunking_ids(hid_t file_id, unsigned min_dset, unsigned max_dset, hsize_t *chunk_dims,
+                                   hsize_t *dset_dims, hid_t *dataspace_ids, hid_t *filespace_ids,
+                                   hid_t *dataset_ids, hid_t *memspace_id);
+
+static herr_t _close_chunking_ids(unsigned min_dset, unsigned max_dset, hid_t *dataspace_ids,
+                                  hid_t *filespace_ids, hid_t *dataset_ids, hid_t *memspace_id);
+
+static herr_t _populate_filepath(const char *dirname, const char *_basename, hid_t fapl_id, char *path_out,
+                                 hbool_t h5suffix);
+
+static hid_t create_subfiling_ioc_fapl(const char *_basename, struct subfilingtest_filenames *names);
+
+static void mybzero(void *dest, size_t size);
+
+/* ----------------------------------------------------------------------------
+ * Function:    mybzero
+ *
+ * Purpose:     Provide bzero()-like zeroing of a buffer, in the (possible)
+ *              absence of bzero() itself.
+ *
+ * Programmer: Jacob Smith
+ *             2020-03-30
+ * ----------------------------------------------------------------------------
+ */
+static void
+mybzero(void *dest, size_t size)
+{
+    size_t i = 0;
+    char  *s = NULL;
+    HDassert(dest != NULL);
+    s = (char *)dest;
+    for (i = 0; i < size; i++) {
+        *(s + i) = 0;
+    }
+} /* end mybzero() */
+
+/* ----------------------------------------------------------------------------
+ * Function:    _get_subfiling_extension_info
+ *
+ * Purpose:     This function returns an instance of a driver_info_t
+ *              structure (shown below)
+ *                 (byte)     (byte)     (byte)     (byte)
+ *              +----------+----------+----------+----------+
+ *              | Version  | //////// | //////// | //////// |
+ *              +----------+----------+----------+----------+
+ *              |   Driver Information length (4 bytes)     |
+ *              +----------+----------+----------+----------+
+ *              |  S  F  c  o                               |
+ *              |  n  f  i  g                               |
+ *              +----------+----------+----------+----------+ ----
+ *              |       Driver Information (Data)           |   ^
+ *              |       Maximum data size is 64kb           |   Info length
+ *              |       ...                                 |   v
+ *              +----------+----------+----------+----------+ ----
+ *
+ *              The Driver Info Message (returned by this function)
+ *              should be written with a Header Message Type: 0x0014
+ *
+ *              See: https://support.hdfgroup.org/HDF5/doc/H5.format.html#DrvInfoMessage
+ * ----------------------------------------------------------------------------
+ */
+static void *
+_get_subfiling_extension_info(void)
+{
+    return NULL;
+}
+
+/* ----------------------------------------------------------------------------
+ * Function:    _populate_filepath
+ *
+ * Purpose:     Given a directory name and a base name, concatenate the two and
+ *              run h5fixname() to get the "actual" path to the intended target.
+ *              `h5suffix' should be FALSE to keep the base name unaltered;
+ *              TRUE will append the '.h5' h5suffix to the basename...
+ *              FALSE -> h5fixname_no_suffix(), TRUE -> h5fixname()
+ *              / / <_basename>
+ *
+ * Programmer: Jacob Smith
+ *             2019-08-16
+ * ----------------------------------------------------------------------------
+ */
+static herr_t
+_populate_filepath(const char *dirname, const char *_basename, hid_t fapl_id, char *path_out,
+                   hbool_t h5suffix)
+{
+    char _path[H5FD_SPLITTER_PATH_MAX];
+
+    if ((_basename == NULL) || (*_basename == 0) || (dirname == NULL) || (*dirname == 0) ||
+        (path_out == NULL)) {
+        TEST_ERROR;
+    }
+
+    if (HDsnprintf(_path, H5FD_SPLITTER_PATH_MAX, "%s%s%s", dirname,
+                   (dirname[strlen(dirname) - 1] == '/') ? "" : "/", /* slash iff needed */
+                   _basename) >= H5FD_SPLITTER_PATH_MAX) {
+        TEST_ERROR;
+    }
+
+    if (h5suffix == TRUE) {
+        if (h5_fixname(_path, fapl_id, path_out, H5FD_SPLITTER_PATH_MAX) == NULL) {
+            TEST_ERROR;
+        }
+    }
+    else {
+        if (h5_fixname_no_suffix(_path, fapl_id, path_out, H5FD_SPLITTER_PATH_MAX) == NULL) {
+            TEST_ERROR;
+        }
+    }
+
+    return SUCCEED;
+
+error:
+    return FAIL;
+} /* end _populate_filepath() */
+
+/* ---------------------------------------------------------------------------
+ * Function:    build_paths
+ *
+ * Purpose:     Convenience function to create the three file paths used in
+ *              most subfiling tests.
+ *
+ * Return:      SUCCEED/FAIL
+ *
+ * Programmer: Jacob Smith
+ *             2019-08-16
+ * ---------------------------------------------------------------------------
+ */
+static herr_t
+build_paths(const char *_basename, H5FD_subfiling_config_t *subfiling_config,
+            struct subfilingtest_filenames *names)
+{
+    char  baselogname[H5FD_SUBFILING_PATH_MAX + 1];
+    char  temp[H5FD_SUBFILING_PATH_MAX + 1];
+    char *_realpath = NULL;
+
+    if (_basename == NULL || *_basename == 0)
+        return FAIL;
+
+    if (_populate_filepath(SUBFILING_RW_DIR, _basename, subfiling_config->common.ioc_fapl_id, names->rw,
+                           TRUE) == FAIL) {
+        TEST_ERROR;
+    }
+    if (names->rw) {
+        _realpath = HDrealpath(names->rw, temp);
+        strncpy(subfiling_config->common.file_path, temp, sizeof(subfiling_config->common.file_path));
+        strcpy(subfiling_config->common.file_dir, dirname(temp));
+    }
+
+    return SUCCEED;
+
+error:
+    return FAIL;
+} /* end build_paths() */
+
+/* ---------------------------------------------------------------------------
+ * Function:    test_fapl_configuration
+ *
+ * Purpose:     Test FAPL configuration and examination.
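+ *
+ *              Sketch of the round-trip exercised below (editorial
+ *              illustration):
+ *
+ *                  H5FD_subfiling_config_t cfg;
+ *                  hid_t fapl = H5Pcreate(H5P_FILE_ACCESS);
+ *                  H5Pget_fapl_subfiling(fapl, &cfg);   <- fills in defaults
+ *                  H5Pset_fapl_subfiling(fapl, &cfg);   <- re-applies them
+ *
+ *              Getting the default configuration and setting it back
+ *              unchanged is expected to succeed.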
+ *
+ * Return:      Success: 0
+ *              Failure: -1
+ *
+ * Programmer:  Jacob Smith
+ *              2019-03-12
+ * ---------------------------------------------------------------------------
+ */
+static int
+test_fapl_configuration(void)
+{
+    hid_t                   fapl_id = H5I_INVALID_HID, under_fapl = H5I_INVALID_HID;
+    H5FD_ioc_config_t       ioc_config;
+    H5FD_subfiling_config_t subfiling_conf;
+
+    TESTING("Subfiling fapl configuration (set/get)");
+
+    memset(&ioc_config, 0, sizeof(ioc_config));
+    memset(&subfiling_conf, 0, sizeof(subfiling_conf));
+
+    under_fapl = H5Pcreate(H5P_FILE_ACCESS);
+    if (H5I_INVALID_HID == under_fapl) {
+        TEST_ERROR;
+    }
+    /* Get IOC VFD defaults */
+    if (H5Pget_fapl_ioc(under_fapl, &ioc_config) == FAIL) {
+        TEST_ERROR;
+    }
+    /* Now we can set the fapl. */
+    if (H5Pset_fapl_ioc(under_fapl, &ioc_config) == FAIL) {
+        TEST_ERROR;
+    }
+
+    fapl_id = H5Pcreate(H5P_FILE_ACCESS);
+    if (H5I_INVALID_HID == fapl_id) {
+        TEST_ERROR;
+    }
+    /* The get_fapl will fill in the default values */
+    if (H5Pget_fapl_subfiling(fapl_id, &subfiling_conf) == FAIL) {
+        TEST_ERROR;
+    }
+    /* Now we can set the fapl. */
+    if (H5Pset_fapl_subfiling(fapl_id, &subfiling_conf) == FAIL) {
+        TEST_ERROR;
+    }
+
+    if (H5Pclose(under_fapl) == FAIL) {
+        TEST_ERROR;
+    }
+    if (H5Pclose(fapl_id) == FAIL) {
+        TEST_ERROR;
+    }
+
+    PASSED();
+    return 0;
+
+error:
+    if (H5I_INVALID_HID != under_fapl) {
+        (void)H5Pclose(under_fapl);
+    }
+    if (H5I_INVALID_HID != fapl_id) {
+        (void)H5Pclose(fapl_id);
+    }
+
+    return -1;
+} /* end test_fapl_configuration() */
+
+#define PRINT_BUFFER_DIFF(act, exp, len)                                \
+    do {                                                                \
+        size_t _x = 0;                                                  \
+        while ((act)[_x] == (exp)[_x]) {                                \
+            _x++;                                                       \
+        }                                                               \
+        if (_x != (len)) {                                              \
+            size_t _y = 0;                                              \
+            HDprintf("First bytes differ at %zu\n", _x);                \
+            HDprintf("exp ");                                           \
+            for (_y = _x; _y < (len); _y++) {                           \
+                HDprintf("%02X", (unsigned char)(exp)[_y]);             \
+            }                                                           \
+            HDprintf("\nact ");                                         \
+            for (_y = _x; _y < (len); _y++) {                           \
+                HDprintf("%02X", (unsigned char)(act)[_y]);             \
+            }                                                           \
+            HDprintf("\n");                                             \
+        }                                                               \
+    } while (0); /* end PRINT_BUFFER_DIFF */
+
+#if 0
+/* ---------------------------------------------------------------------------
+ * Function:    test_xmit_encode_decode
+ *
+ * Purpose:     Test byte-encoding operations for network transport.
+ *
+ * Return:      Success: 0
+ *              Failure: -1
+ *
+ * Programmer:  Jacob Smith
+ *              2020-02-02
+ * ---------------------------------------------------------------------------
+ */
+static int
+test_xmit_encode_decode(void)
+{
+    H5FD_subfiling_xmit_t xmit_mock; /* re-used header in various xmit tests */
+
+    TESTING("Subfiling encode/decode of xmit elements");
+
+    /* Set bogus values matching expected; encoding doesn't care
+     * Use sequential values to easily generate the expected buffer with a
+     * for loop.
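+     * (Illustration: magic = 0x00010203 encodes big-endian as bytes
+     * 00 01 02 03, so the five fields together produce the sequence
+     * 00 01 02 ... 0D that the header checks below regenerate with a
+     * single loop over i.)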
+ */ + xmit_mock.magic = 0x00010203; + xmit_mock.version = 0x04; + xmit_mock.session_token = 0x05060708; + xmit_mock.xmit_count = 0x090A0B0C; + xmit_mock.op = 0x0D; + + /* Test uint8_t encode/decode + */ + do { + unsigned char buf[8]; + unsigned char expected[8]; + const uint8_t v = 200; + unsigned char out = 0; + + /* Start of buffer uint8_t + */ + mybzero(buf, 8); + mybzero(expected, 8); + expected[0] = 200; + out = 0; + if (H5FD__subfiling_xmit_encode_uint8(buf, v) != 1) { + TEST_ERROR; + } + if (HDmemcmp(buf, expected, 8) != 0) { + PRINT_BUFFER_DIFF(buf, expected, 8); + TEST_ERROR; + } + if (H5FD__subfiling_xmit_decode_uint8(&out, buf) != 1) { + TEST_ERROR; + } + if (v != out) { + TEST_ERROR; + } + + /* Middle of buffer uint8_t + */ + mybzero(buf, 8); + mybzero(expected, 8); + expected[3] = v; + out = 0; + if (H5FD__subfiling_xmit_encode_uint8((buf+3), v) != 1) { + TEST_ERROR; + } + if (HDmemcmp(buf, expected, 8) != 0) { + PRINT_BUFFER_DIFF(buf, expected, 8); + TEST_ERROR; + } + if (H5FD__subfiling_xmit_decode_uint8(&out, (buf+3)) != 1) { + TEST_ERROR; + } + if (v != out) { + TEST_ERROR; + } + + /* End of buffer uint8_t + */ + mybzero(buf, 8); + mybzero(expected, 8); + expected[7] = v; + out = 0; + if (H5FD__subfiling_xmit_encode_uint8((buf+7), v) != 1) { + TEST_ERROR; + } + if (HDmemcmp(buf, expected, 8) != 0) { + PRINT_BUFFER_DIFF(buf, expected, 8); + TEST_ERROR; + } + if (H5FD__subfiling_xmit_decode_uint8(&out, (buf+7)) != 1) { + TEST_ERROR; + } + if (v != out) { + TEST_ERROR; + } + + } while (0); /* end uint8_t en/decode */ + + /* Test uint16_t encode/decode + */ + do { + unsigned char buf[8]; + unsigned char expected[8]; + const uint16_t v = 0x8F02; + uint16_t out = 0; + + /* Start of buffer uint16_t + */ + mybzero(buf, 8); + mybzero(expected, 8); + expected[0] = 0x8F; + expected[1] = 0x02; + out = 0; + if (H5FD__subfiling_xmit_encode_uint16(buf, v) != 2) { + TEST_ERROR; + } + if (HDmemcmp(buf, expected, 8) != 0) { + PRINT_BUFFER_DIFF(buf, expected, 8); + TEST_ERROR; + } + if (H5FD__subfiling_xmit_decode_uint16(&out, buf) != 2) { + TEST_ERROR; + } + if (out != v) { + TEST_ERROR; + } + + /* Middle of buffer uint16_t + */ + mybzero(buf, 8); + mybzero(expected, 8); + expected[3] = 0x8F; + expected[4] = 0x02; + out = 0; + if (H5FD__subfiling_xmit_encode_uint16((buf+3), v) != 2) { + TEST_ERROR; + } + if (HDmemcmp(buf, expected, 8) != 0) { + PRINT_BUFFER_DIFF(buf, expected, 8); + TEST_ERROR; + } + if (H5FD__subfiling_xmit_decode_uint16(&out, (buf+3)) != 2) { + TEST_ERROR; + } + if (out != v) { + TEST_ERROR; + } + /* slice */ + if (H5FD__subfiling_xmit_decode_uint16(&out, (buf+4)) != 2) { + TEST_ERROR; + } + if (out != 0x0200) { + TEST_ERROR; + } + + /* End of buffer uint16_t + */ + mybzero(buf, 8); + mybzero(expected, 8); + expected[6] = 0x8F; + expected[7] = 0x02; + out = 0; + if (H5FD__subfiling_xmit_encode_uint16((buf+6), v) != 2) { + TEST_ERROR; + } + if (HDmemcmp(buf, expected, 8) != 0) { + PRINT_BUFFER_DIFF(buf, expected, 8); + TEST_ERROR; + } + if (H5FD__subfiling_xmit_decode_uint16(&out, (buf+6)) != 2) { + TEST_ERROR; + } + if (out != v) { + TEST_ERROR; + } + + } while (0); /* end uint16_t en/decode */ + + /* Test uint32_t encode/decode + */ + do { + unsigned char buf[8]; + unsigned char expected[8]; + const uint32_t v = 0x8F020048; + uint32_t out = 0; + + /* Start of buffer uint32_t + */ + mybzero(buf, 8); + mybzero(expected, 8); + expected[0] = 0x8F; + expected[1] = 0x02; + expected[2] = 0x00; + expected[3] = 0x48; + out = 0; + if (H5FD__subfiling_xmit_encode_uint32(buf, v) != 
4) { + TEST_ERROR; + } + if (HDmemcmp(buf, expected, 8) != 0) { + PRINT_BUFFER_DIFF(buf, expected, 8); + TEST_ERROR; + } + if (H5FD__subfiling_xmit_decode_uint32(&out, buf) != 4) { + TEST_ERROR; + } + if (out != v) { + TEST_ERROR; + } + + /* Middle of buffer uint32_t + */ + mybzero(buf, 8); + mybzero(expected, 8); + expected[3] = 0x8F; + expected[4] = 0x02; + expected[5] = 0x00; + expected[6] = 0x48; + out = 0; + if (H5FD__subfiling_xmit_encode_uint32((buf+3), v) != 4) { + TEST_ERROR; + } + if (HDmemcmp(buf, expected, 8) != 0) { + PRINT_BUFFER_DIFF(buf, expected, 8); + TEST_ERROR; + } + if (H5FD__subfiling_xmit_decode_uint32(&out, (buf+3)) != 4) { + TEST_ERROR; + } + if (out != v) { + TEST_ERROR; + } + /* slice */ + if (H5FD__subfiling_xmit_decode_uint32(&out, (buf+4)) != 4) { + TEST_ERROR; + } + if (out != 0x02004800) { + TEST_ERROR; + } + + /* End of buffer uint32_t + */ + mybzero(buf, 8); + mybzero(expected, 8); + expected[4] = 0x8F; + expected[5] = 0x02; + expected[6] = 0x00; + expected[7] = 0x48; + out = 0; + if (H5FD__subfiling_xmit_encode_uint32((buf+4), v) != 4) { + TEST_ERROR; + } + if (HDmemcmp(buf, expected, 8) != 0) { + PRINT_BUFFER_DIFF(buf, expected, 8); + TEST_ERROR; + } + if (H5FD__subfiling_xmit_decode_uint32(&out, (buf+4)) != 4) { + TEST_ERROR; + } + if (out != v) { + TEST_ERROR; + } + + } while (0); /* end uint32_t en/decode */ + + /* Test uint64_t encode/decode + */ + do { + unsigned char buf[16]; + unsigned char expected[16]; + const uint64_t v = 0x90DCBE17939CE4BB; + uint64_t out = 0; + + /* Start of buffer uint64_t + */ + mybzero(buf, 16); + mybzero(expected, 16); + expected[0] = 0x90; + expected[1] = 0xDC; + expected[2] = 0xBE; + expected[3] = 0x17; + expected[4] = 0x93; + expected[5] = 0x9C; + expected[6] = 0xE4; + expected[7] = 0xBB; + out = 0; + if (H5FD__subfiling_xmit_encode_uint64(buf, v) != 8) { + TEST_ERROR; + } + if (HDmemcmp(buf, expected, 16) != 0) { + PRINT_BUFFER_DIFF(buf, expected, 16); + TEST_ERROR; + } + if (H5FD__subfiling_xmit_decode_uint64(&out, buf) != 8) { + TEST_ERROR; + } + if (out != v) { + TEST_ERROR; + } + + /* Middle of buffer uint64_t + */ + mybzero(buf, 16); + mybzero(expected, 16); + expected[3] = 0x90; + expected[4] = 0xDC; + expected[5] = 0xBE; + expected[6] = 0x17; + expected[7] = 0x93; + expected[8] = 0x9C; + expected[9] = 0xE4; + expected[10] = 0xBB; + out = 0; + if (H5FD__subfiling_xmit_encode_uint64((buf+3), v) != 8) { + TEST_ERROR; + } + if (HDmemcmp(buf, expected, 16) != 0) { + PRINT_BUFFER_DIFF(buf, expected, 16); + TEST_ERROR; + } + if (H5FD__subfiling_xmit_decode_uint64(&out, (buf+3)) != 8) { + TEST_ERROR; + } + if (out != v) { + TEST_ERROR; + } + /* slice */ + if (H5FD__subfiling_xmit_decode_uint64(&out, (buf+6)) != 8) { + TEST_ERROR; + } + if (out != 0x17939CE4BB000000) { + TEST_ERROR; + } + + /* End of buffer uint64_t + */ + mybzero(buf, 16); + mybzero(expected, 16); + expected[8] = 0x90; + expected[9] = 0xDC; + expected[10] = 0xBE; + expected[11] = 0x17; + expected[12] = 0x93; + expected[13] = 0x9C; + expected[14] = 0xE4; + expected[15] = 0xBB; + out = 0; + if (H5FD__subfiling_xmit_encode_uint64((buf+8), v) != 8) { + TEST_ERROR; + } + if (HDmemcmp(buf, expected, 16) != 0) { + PRINT_BUFFER_DIFF(buf, expected, 16); + TEST_ERROR; + } + if (H5FD__subfiling_xmit_decode_uint64(&out, (buf+8)) != 8) { + TEST_ERROR; + } + if (out != v) { + TEST_ERROR; + } + + } while (0); /* end uint64_t en/decode */ + + /* Test xmit header structure encode/decode + * Write bogus but easily verifiable data to inside a buffer, and compare. 
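+     * (Illustration, matching the expectations above: the uint32_t
+     * encoder is byte-order-equivalent to
+     *     dest[0] = (uint8_t)(v >> 24); dest[1] = (uint8_t)(v >> 16);
+     *     dest[2] = (uint8_t)(v >> 8);  dest[3] = (uint8_t)v;
+     * which is why 0x8F020048 is expected as bytes 8F 02 00 48.)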
+ * Then decode the buffer and compare the structure contents. + * Then repeat from a different offset in the buffer and compare. + */ + do { + unsigned char buf[H5FD_SUBFILING_XMIT_HEADER_SIZE+8]; + unsigned char expected[H5FD_SUBFILING_XMIT_HEADER_SIZE+8]; + H5FD_subfiling_xmit_t xmit_out; + size_t i = 0; + + /* sanity check */ + if (14 != H5FD_SUBFILING_XMIT_HEADER_SIZE) { + FAIL_PUTS_ERROR("Header size definition does not match test\n"); + } + + /* Populate the expected buffer; expect end padding of 0xFF + */ + HDmemset(expected, 0xFF, H5FD_SUBFILING_XMIT_HEADER_SIZE+8); + for (i=0; i < H5FD_SUBFILING_XMIT_HEADER_SIZE; i++) { + expected[i+2] = (unsigned char)i; + } + + /* Encode, and compare buffer contents + * Initial buffer is filled with 0xFF to match expected padding + */ + HDmemset(buf, 0xFF, H5FD_SUBFILING_XMIT_HEADER_SIZE+8); + if (H5FD_subfiling_xmit_encode_header((buf+2), &xmit_mock) + != H5FD_SUBFILING_XMIT_HEADER_SIZE) + { + TEST_ERROR; + } + if (HDmemcmp(buf, expected, H5FD_SUBFILING_XMIT_HEADER_SIZE+8) != 0) { + PRINT_BUFFER_DIFF(buf, expected, H5FD_SUBFILING_XMIT_HEADER_SIZE+8); + TEST_ERROR; + } + + /* Decode from buffer + */ + if (H5FD_subfiling_xmit_decode_header(&xmit_out, (buf+2)) + != H5FD_SUBFILING_XMIT_HEADER_SIZE) + { + TEST_ERROR; + } + if (xmit_out.magic != xmit_mock.magic) TEST_ERROR; + if (xmit_out.version != xmit_mock.version) TEST_ERROR; + if (xmit_out.session_token != xmit_mock.session_token) TEST_ERROR; + if (xmit_out.xmit_count != xmit_mock.xmit_count) TEST_ERROR; + if (xmit_out.op != xmit_mock.op) TEST_ERROR; + + /* Decode from different offset in buffer + * Observe changes when ingesting the padding + */ + if (H5FD_subfiling_xmit_decode_header(&xmit_out, (buf)) + != H5FD_SUBFILING_XMIT_HEADER_SIZE) + { + TEST_ERROR; + } + if (xmit_out.magic != 0xFFFF0001) TEST_ERROR; + if (xmit_out.version != 0x02) TEST_ERROR; + if (xmit_out.session_token != 0x03040506) TEST_ERROR; + if (xmit_out.xmit_count != 0x0708090A) TEST_ERROR; + if (xmit_out.op != 0x0B) TEST_ERROR; + + } while (0); /* end xmit header en/decode */ + + /* Test xmit set-eoa structure encode/decode + * Write bogus but easily verifiable data to inside a buffer, and compare. + * Then decode the buffer and compare the structure contents. + * Then repeat from a different offset in the buffer and compare. 
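+     * (Illustration: decoding two bytes early pulls the 0xFF padding
+     * into the leading field, e.g. magic reads bytes FF FF 00 01 and
+     * decodes to 0xFFFF0001, which is exactly what the offset checks
+     * below expect.)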
+ */ + do { + unsigned char buf[H5FD_SUBFILING_XMIT_EOA_SIZE+8]; + unsigned char expected[H5FD_SUBFILING_XMIT_EOA_SIZE+8]; + H5FD_subfiling_xmit_eoa_t xmit_in; + H5FD_subfiling_xmit_eoa_t xmit_out; + size_t i = 0; + + /* sanity check */ + if ((14+9) != H5FD_SUBFILING_XMIT_EOA_SIZE) { + FAIL_PUTS_ERROR("Header size definition does not match test\n"); + } + if (xmit_mock.op != 0x0D) { + FAIL_PUTS_ERROR("shared header structure is not in expected state"); + } + + /* Populate the expected buffer; expect end padding of 0xFF + */ + HDmemset(expected, 0xFF, H5FD_SUBFILING_XMIT_EOA_SIZE+8); + for (i=0; i < H5FD_SUBFILING_XMIT_EOA_SIZE; i++) { + expected[i+2] = (unsigned char)i; + } + + /* Set xmit_in + */ + xmit_in.pub = xmit_mock; /* shared/common */ + xmit_in.type = 0x0E; + xmit_in.eoa_addr = 0x0F10111213141516; + + /* Encode, and compare buffer contents + * Initial buffer is filled with 0xFF to match expected padding + */ + HDmemset(buf, 0xFF, H5FD_SUBFILING_XMIT_EOA_SIZE+8); + if (H5FD_subfiling_xmit_encode_set_eoa((buf+2), &xmit_in) + != H5FD_SUBFILING_XMIT_EOA_SIZE) + { + TEST_ERROR; + } + if (HDmemcmp(buf, expected, H5FD_SUBFILING_XMIT_EOA_SIZE+8) != 0) { + PRINT_BUFFER_DIFF(buf, expected, H5FD_SUBFILING_XMIT_EOA_SIZE+8); + TEST_ERROR; + } + + /* Decode from buffer + */ + if (H5FD_subfiling_xmit_decode_set_eoa(&xmit_out, (buf+2)) + != H5FD_SUBFILING_XMIT_EOA_SIZE) + { + TEST_ERROR; + } + if (xmit_out.pub.magic != xmit_mock.magic) TEST_ERROR; + if (xmit_out.pub.version != xmit_mock.version) TEST_ERROR; + if (xmit_out.pub.session_token != xmit_mock.session_token) TEST_ERROR; + if (xmit_out.pub.xmit_count != xmit_mock.xmit_count) TEST_ERROR; + if (xmit_out.pub.op != xmit_mock.op) TEST_ERROR; + if (xmit_out.type != 0x0E) TEST_ERROR; + if (xmit_out.eoa_addr != 0x0F10111213141516) TEST_ERROR; + + /* Decode from different offset in buffer + * Observe changes when ingesting the padding + */ + if (H5FD_subfiling_xmit_decode_set_eoa(&xmit_out, (buf)) + != H5FD_SUBFILING_XMIT_EOA_SIZE) + { + TEST_ERROR; + } + if (xmit_out.pub.magic != 0xFFFF0001) TEST_ERROR; + if (xmit_out.pub.version != 0x02) TEST_ERROR; + if (xmit_out.pub.session_token != 0x03040506) TEST_ERROR; + if (xmit_out.pub.xmit_count != 0x0708090A) TEST_ERROR; + if (xmit_out.pub.op != 0x0B) TEST_ERROR; + if (xmit_out.type != 0x0C) TEST_ERROR; + if (xmit_out.eoa_addr != 0x0D0E0F1011121314) TEST_ERROR; + + } while (0); /* end xmit set-eoa en/decode */ + + /* Test xmit lock structure encode/decode + * Write bogus but easily verifiable data to inside a buffer, and compare. + * Then decode the buffer and compare the structure contents. + * Then repeat from a different offset in the buffer and compare. 
+ */ + do { + unsigned char buf[H5FD_SUBFILING_XMIT_LOCK_SIZE+8]; + unsigned char expected[H5FD_SUBFILING_XMIT_LOCK_SIZE+8]; + H5FD_subfiling_xmit_lock_t xmit_in; + H5FD_subfiling_xmit_lock_t xmit_out; + size_t i = 0; + + /* sanity check */ + if ((14+8) != H5FD_SUBFILING_XMIT_LOCK_SIZE) { + FAIL_PUTS_ERROR("Header size definition does not match test\n"); + } + if (xmit_mock.op != 0x0D) { + FAIL_PUTS_ERROR("shared header structure is not in expected state"); + } + + /* Populate the expected buffer; expect end padding of 0xFF + */ + HDmemset(expected, 0xFF, H5FD_SUBFILING_XMIT_LOCK_SIZE+8); + for (i=0; i < H5FD_SUBFILING_XMIT_LOCK_SIZE; i++) { + expected[i+2] = (unsigned char)i; + } + + /* Set xmit_in + */ + xmit_in.pub = xmit_mock; /* shared/common */ + xmit_in.rw = 0x0E0F101112131415; + + /* Encode, and compare buffer contents + * Initial buffer is filled with 0xFF to match expected padding + */ + HDmemset(buf, 0xFF, H5FD_SUBFILING_XMIT_LOCK_SIZE+8); + if (H5FD_subfiling_xmit_encode_lock((buf+2), &xmit_in) + != H5FD_SUBFILING_XMIT_LOCK_SIZE) + { + TEST_ERROR; + } + if (HDmemcmp(buf, expected, H5FD_SUBFILING_XMIT_LOCK_SIZE+8) != 0) { + PRINT_BUFFER_DIFF(buf, expected, H5FD_SUBFILING_XMIT_LOCK_SIZE+8); + TEST_ERROR; + } + + /* Decode from buffer + */ + if (H5FD_subfiling_xmit_decode_lock(&xmit_out, (buf+2)) + != H5FD_SUBFILING_XMIT_LOCK_SIZE) + { + TEST_ERROR; + } + if (xmit_out.pub.magic != xmit_mock.magic) TEST_ERROR; + if (xmit_out.pub.version != xmit_mock.version) TEST_ERROR; + if (xmit_out.pub.session_token != xmit_mock.session_token) TEST_ERROR; + if (xmit_out.pub.xmit_count != xmit_mock.xmit_count) TEST_ERROR; + if (xmit_out.pub.op != xmit_mock.op) TEST_ERROR; + if (xmit_out.rw != 0x0E0F101112131415) TEST_ERROR; + + /* Decode from different offset in buffer + * Observe changes when ingesting the padding + */ + if (H5FD_subfiling_xmit_decode_lock(&xmit_out, (buf)) + != H5FD_SUBFILING_XMIT_LOCK_SIZE) + { + TEST_ERROR; + } + if (xmit_out.pub.magic != 0xFFFF0001) TEST_ERROR; + if (xmit_out.pub.version != 0x02) TEST_ERROR; + if (xmit_out.pub.session_token != 0x03040506) TEST_ERROR; + if (xmit_out.pub.xmit_count != 0x0708090A) TEST_ERROR; + if (xmit_out.pub.op != 0x0B) TEST_ERROR; + if (xmit_out.rw != 0x0C0D0E0F10111213) TEST_ERROR; + + } while (0); /* end xmit lock en/decode */ + + /* Test xmit open structure encode/decode + * Write bogus but easily verifiable data to inside a buffer, and compare. + * Then decode the buffer and compare the structure contents. + * Then repeat from a different offset in the buffer and compare. + * + * Verifies that the first zero character in the filepath will end the + * string, with all following bytes in the encoded buffer being zeroed. + */ + do { + unsigned char buf[H5FD_SUBFILING_XMIT_OPEN_SIZE+8]; + unsigned char expected[H5FD_SUBFILING_XMIT_OPEN_SIZE+8]; + H5FD_subfiling_xmit_open_t xmit_in; + H5FD_subfiling_xmit_open_t xmit_out; + size_t i = 0; + + /* sanity check */ + if ((14+20+4097) != H5FD_SUBFILING_XMIT_OPEN_SIZE) { + FAIL_PUTS_ERROR("Header size definition does not match test\n"); + } + if (xmit_mock.op != 0x0D) { + FAIL_PUTS_ERROR("shared header structure is not in expected state"); + } + + /* Populate the expected buffer; expect end padding of 0xFF + */ + HDmemset(expected, 0xFF, H5FD_SUBFILING_XMIT_OPEN_SIZE+8); + for (i=0; i < H5FD_SUBFILING_XMIT_OPEN_SIZE; i++) { + /* 0x100 is "zero" in a byte, so encode will treat it as a NULL- + * terminator in the filepath string. Expect all zeroes following. + */ + expected[i+2] = (i > 0xFF) ? 
0 : (unsigned char)i; + } + + /* Set xmit_in + */ + xmit_in.pub = xmit_mock; /* shared/common */ + xmit_in.flags = 0x0E0F1011; + xmit_in.maxaddr = 0x1213141516171819; + xmit_in.size_t_blob = 0x1A1B1C1D1E1F2021; + for (i=0x22; i < H5FD_SUBFILING_XMIT_FILEPATH_MAX+0x22; i++) { + /* nonzero values repeat after 0x100, but will not be encoded */ + xmit_in.filename[i-0x22] = (char)(i % 0x100); + } + xmit_in.filename[H5FD_SUBFILING_XMIT_FILEPATH_MAX-1] = 0; + + /* Encode, and compare buffer contents + * Initial buffer is filled with 0xFF to match expected padding + */ + HDmemset(buf, 0xFF, H5FD_SUBFILING_XMIT_OPEN_SIZE+8); + if (H5FD_subfiling_xmit_encode_open((buf+2), &xmit_in) + != H5FD_SUBFILING_XMIT_OPEN_SIZE) + { + TEST_ERROR; + } + if (HDmemcmp(buf, expected, H5FD_SUBFILING_XMIT_OPEN_SIZE+8) != 0) { + PRINT_BUFFER_DIFF(buf, expected, H5FD_SUBFILING_XMIT_OPEN_SIZE+8); + TEST_ERROR; + } + + /* Decode from buffer + */ + if (H5FD_subfiling_xmit_decode_open(&xmit_out, (buf+2)) + != H5FD_SUBFILING_XMIT_OPEN_SIZE) + { + TEST_ERROR; + } + if (xmit_out.pub.magic != xmit_mock.magic) TEST_ERROR; + if (xmit_out.pub.version != xmit_mock.version) TEST_ERROR; + if (xmit_out.pub.session_token != xmit_mock.session_token) TEST_ERROR; + if (xmit_out.pub.xmit_count != xmit_mock.xmit_count) TEST_ERROR; + if (xmit_out.pub.op != xmit_mock.op) TEST_ERROR; + if (xmit_out.flags != xmit_in.flags) TEST_ERROR; + if (xmit_out.maxaddr != xmit_in.maxaddr) TEST_ERROR; + if (xmit_out.size_t_blob != xmit_in.size_t_blob) TEST_ERROR; + if (HDstrncmp(xmit_out.filename, xmit_in.filename, + H5FD_SUBFILING_XMIT_FILEPATH_MAX) + != 0) + { + PRINT_BUFFER_DIFF(xmit_out.filename, xmit_in.filename, + H5FD_SUBFILING_XMIT_FILEPATH_MAX); + TEST_ERROR; + } + + /* Decode from different offset in buffer + * Observe changes when ingesting the padding + */ + if (H5FD_subfiling_xmit_decode_open(&xmit_out, (buf)) + != H5FD_SUBFILING_XMIT_OPEN_SIZE) + { + TEST_ERROR; + } + if (xmit_out.pub.magic != 0xFFFF0001) TEST_ERROR; + if (xmit_out.pub.version != 0x02) TEST_ERROR; + if (xmit_out.pub.session_token != 0x03040506) TEST_ERROR; + if (xmit_out.pub.xmit_count != 0x0708090A) TEST_ERROR; + if (xmit_out.pub.op != 0x0B) TEST_ERROR; + if (xmit_out.flags != 0x0C0D0E0F) TEST_ERROR; + if (xmit_out.maxaddr != 0x1011121314151617) TEST_ERROR; + if (xmit_out.size_t_blob != 0x18191A1B1C1D1E1F) TEST_ERROR; + /* update expected "filepath" in structure */ + for (i=0x20; i < H5FD_SUBFILING_XMIT_FILEPATH_MAX+0x20; i++) { + xmit_in.filename[i-0x20] = (i > 0xFF) ? 0 : (char)i; + } + if (HDstrncmp(xmit_out.filename, xmit_in.filename, + H5FD_SUBFILING_XMIT_FILEPATH_MAX) + != 0) + { + PRINT_BUFFER_DIFF(xmit_out.filename, xmit_in.filename, + H5FD_SUBFILING_XMIT_FILEPATH_MAX); + TEST_ERROR; + } + + } while (0); /* end xmit open en/decode */ + + /* Test xmit reply structure encode/decode + * Write bogus but easily verifiable data to inside a buffer, and compare. + * Then decode the buffer and compare the structure contents. + * Then repeat from a different offset in the buffer and compare. + * + * Verifies that the first zero character in the filepath will end the + * string, with all following bytes in the encoded buffer being zeroed. 
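+     * (Illustration: the loop below stores message[i - 0x12] =
+     * (char)(i % 0x100), which lands a NUL at i == 0x100, i.e. message
+     * index 0xEE; from that position onward the expected buffer,
+     * (i > 0xFF) ? 0 : i, is all zeroes.)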
+ */ + do { + unsigned char buf[H5FD_SUBFILING_XMIT_REPLY_SIZE+8]; + unsigned char expected[H5FD_SUBFILING_XMIT_REPLY_SIZE+8]; + H5FD_subfiling_xmit_reply_t xmit_in; + H5FD_subfiling_xmit_reply_t xmit_out; + size_t i = 0; + + /* sanity check */ + if ((14+4+256) != H5FD_SUBFILING_XMIT_REPLY_SIZE) { + FAIL_PUTS_ERROR("Header size definition does not match test\n"); + } + if (xmit_mock.op != 0x0D) { + FAIL_PUTS_ERROR("shared header structure is not in expected state"); + } + + /* Populate the expected buffer; expect end padding of 0xFF + */ + HDmemset(expected, 0xFF, H5FD_SUBFILING_XMIT_REPLY_SIZE+8); + for (i=0; i < H5FD_SUBFILING_XMIT_REPLY_SIZE; i++) { + /* 0x100 is "zero" in a byte, so encode will treat it as a NULL- + * terminator in the filepath string. Expect all zeroes following. + */ + expected[i+2] = (i > 0xFF) ? 0 : (unsigned char)i; + } + + /* Set xmit_in + */ + xmit_in.pub = xmit_mock; /* shared/common */ + xmit_in.status = 0x0E0F1011; + for (i=0x12; i < H5FD_SUBFILING_STATUS_MESSAGE_MAX+0x12; i++) { + /* nonzero values repeat after 0x100, but will not be encoded */ + xmit_in.message[i-0x12] = (char)(i % 0x100); + } + xmit_in.message[H5FD_SUBFILING_STATUS_MESSAGE_MAX-1] = 0; + + /* Encode, and compare buffer contents + * Initial buffer is filled with 0xFF to match expected padding + */ + HDmemset(buf, 0xFF, H5FD_SUBFILING_XMIT_REPLY_SIZE+8); + if (H5FD_subfiling_xmit_encode_reply((buf+2), &xmit_in) + != H5FD_SUBFILING_XMIT_REPLY_SIZE) + { + TEST_ERROR; + } + if (HDmemcmp(buf, expected, H5FD_SUBFILING_XMIT_REPLY_SIZE+8) != 0) { + PRINT_BUFFER_DIFF(buf, expected, H5FD_SUBFILING_XMIT_REPLY_SIZE+8); + TEST_ERROR; + } + + /* Decode from buffer + */ + if (H5FD_subfiling_xmit_decode_reply(&xmit_out, (buf+2)) + != H5FD_SUBFILING_XMIT_REPLY_SIZE) + { + TEST_ERROR; + } + if (xmit_out.pub.magic != xmit_mock.magic) TEST_ERROR; + if (xmit_out.pub.version != xmit_mock.version) TEST_ERROR; + if (xmit_out.pub.session_token != xmit_mock.session_token) TEST_ERROR; + if (xmit_out.pub.xmit_count != xmit_mock.xmit_count) TEST_ERROR; + if (xmit_out.pub.op != xmit_mock.op) TEST_ERROR; + if (xmit_out.status != xmit_in.status) TEST_ERROR; + if (HDstrncmp(xmit_out.message, xmit_in.message, + H5FD_SUBFILING_STATUS_MESSAGE_MAX) + != 0) + { + PRINT_BUFFER_DIFF(xmit_out.message, xmit_in.message, + H5FD_SUBFILING_STATUS_MESSAGE_MAX); + TEST_ERROR; + } + + /* Decode from different offset in buffer + * Observe changes when ingesting the padding + */ + if (H5FD_subfiling_xmit_decode_reply(&xmit_out, (buf)) + != H5FD_SUBFILING_XMIT_REPLY_SIZE) + { + TEST_ERROR; + } + if (xmit_out.pub.magic != 0xFFFF0001) TEST_ERROR; + if (xmit_out.pub.version != 0x02) TEST_ERROR; + if (xmit_out.pub.session_token != 0x03040506) TEST_ERROR; + if (xmit_out.pub.xmit_count != 0x0708090A) TEST_ERROR; + if (xmit_out.pub.op != 0x0B) TEST_ERROR; + if (xmit_out.status != 0x0C0D0E0F) TEST_ERROR; + /* update expected "message" in structure */ + for (i=0x10; i < H5FD_SUBFILING_STATUS_MESSAGE_MAX+0x10; i++) { + xmit_in.message[i-0x10] = (i > 0xFF) ? 0 : (char)i; + } + if (HDstrncmp(xmit_out.message, xmit_in.message, + H5FD_SUBFILING_STATUS_MESSAGE_MAX) + != 0) + { + PRINT_BUFFER_DIFF(xmit_out.message, xmit_in.message, + H5FD_SUBFILING_STATUS_MESSAGE_MAX); + TEST_ERROR; + } + + } while (0); /* end xmit reply en/decode */ + + /* Test xmit write structure encode/decode + * Write bogus but easily verifiable data to inside a buffer, and compare. + * Then decode the buffer and compare the structure contents. 
+ * Then repeat from a different offset in the buffer and compare.
+ */
+    do {
+        unsigned char buf[H5FD_SUBFILING_XMIT_WRITE_SIZE+8];
+        unsigned char expected[H5FD_SUBFILING_XMIT_WRITE_SIZE+8];
+        H5FD_subfiling_xmit_write_t xmit_in;
+        H5FD_subfiling_xmit_write_t xmit_out;
+        size_t i = 0;
+
+        /* sanity check */
+        if ((14+17) != H5FD_SUBFILING_XMIT_WRITE_SIZE) {
+            FAIL_PUTS_ERROR("Header size definition does not match test\n");
+        }
+        if (xmit_mock.op != 0x0D) {
+            FAIL_PUTS_ERROR("shared header structure is not in expected state");
+        }
+
+        /* Populate the expected buffer; expect end padding of 0xFF
+         */
+        HDmemset(expected, 0xFF, H5FD_SUBFILING_XMIT_WRITE_SIZE+8);
+        for (i=0; i < H5FD_SUBFILING_XMIT_WRITE_SIZE; i++) {
+            expected[i+2] = (unsigned char)i;
+        }
+
+        /* Set xmit_in
+         */
+        xmit_in.pub = xmit_mock; /* shared/common */
+        xmit_in.type = 0x0E;
+        xmit_in.offset = 0x0F10111213141516;
+        xmit_in.size = 0x1718191A1B1C1D1E;
+
+        /* Encode, and compare buffer contents
+         * Initial buffer is filled with 0xFF to match expected padding
+         */
+        HDmemset(buf, 0xFF, H5FD_SUBFILING_XMIT_WRITE_SIZE+8);
+        if (H5FD_subfiling_xmit_encode_write((buf+2), &xmit_in)
+            != H5FD_SUBFILING_XMIT_WRITE_SIZE)
+        {
+            TEST_ERROR;
+        }
+        if (HDmemcmp(buf, expected, H5FD_SUBFILING_XMIT_WRITE_SIZE+8) != 0) {
+            PRINT_BUFFER_DIFF(buf, expected, H5FD_SUBFILING_XMIT_WRITE_SIZE+8);
+            TEST_ERROR;
+        }
+
+        /* Decode from buffer
+         */
+        if (H5FD_subfiling_xmit_decode_write(&xmit_out, (buf+2))
+            != H5FD_SUBFILING_XMIT_WRITE_SIZE)
+        {
+            TEST_ERROR;
+        }
+        if (xmit_out.pub.magic != xmit_mock.magic) TEST_ERROR;
+        if (xmit_out.pub.version != xmit_mock.version) TEST_ERROR;
+        if (xmit_out.pub.session_token != xmit_mock.session_token) TEST_ERROR;
+        if (xmit_out.pub.xmit_count != xmit_mock.xmit_count) TEST_ERROR;
+        if (xmit_out.pub.op != xmit_mock.op) TEST_ERROR;
+        if (xmit_out.type != 0x0E) TEST_ERROR;
+        if (xmit_out.offset != 0x0F10111213141516) TEST_ERROR;
+        if (xmit_out.size != 0x1718191A1B1C1D1E) TEST_ERROR;
+
+        /* Decode from different offset in buffer
+         * Observe changes when ingesting the padding
+         */
+        if (H5FD_subfiling_xmit_decode_write(&xmit_out, (buf))
+            != H5FD_SUBFILING_XMIT_WRITE_SIZE)
+        {
+            TEST_ERROR;
+        }
+        if (xmit_out.pub.magic != 0xFFFF0001) TEST_ERROR;
+        if (xmit_out.pub.version != 0x02) TEST_ERROR;
+        if (xmit_out.pub.session_token != 0x03040506) TEST_ERROR;
+        if (xmit_out.pub.xmit_count != 0x0708090A) TEST_ERROR;
+        if (xmit_out.pub.op != 0x0B) TEST_ERROR;
+        if (xmit_out.type != 0x0C) TEST_ERROR;
+        if (xmit_out.offset != 0x0D0E0F1011121314) TEST_ERROR;
+        if (xmit_out.size != 0x15161718191A1B1C) TEST_ERROR;
+
+    } while (0); /* end xmit write en/decode */
+
+    PASSED();
+    return 0;
+
+error:
+    return -1;
+} /* end test_xmit_encode_decode */
+
+#endif
+
+/* ---------------------------------------------------------------------------
+ * Function:   create_subfiling_ioc_fapl
+ *
+ * Purpose:    Create and populate a subfiling FAPL ID.
+ *             Creates target files with the given base name -- ideally the
+ *             test name -- and creates a subfiling/split FAPL set to use the
+ *             global subfiling info and a sec2 R/W channel driver.
+ *
+ * TODO: receive target IP from caller?
+ *
+ * Return:     Success: HID of the top-level (subfiling) FAPL, a non-negative
+ *                      value.
+ *             Failure: H5I_INVALID_HID, a negative value.
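+ *
+ * A minimal usage sketch, mirroring test_create_and_close() below
+ * ("my_test" is an arbitrary example base name):
+ *
+ *     struct subfilingtest_filenames names;
+ *     hid_t fapl_id = create_subfiling_ioc_fapl("my_test", &names);
+ *     hid_t file_id = H5Fcreate(names.rw, H5F_ACC_TRUNC, H5P_DEFAULT,
+ *                               fapl_id);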
+ *
+ * Programmer: Jacob Smith
+ *             2019
+ * ---------------------------------------------------------------------------
+ */
+static hid_t
+create_subfiling_ioc_fapl(const char *_basename, struct subfilingtest_filenames *names)
+{
+    hid_t ret_value = H5I_INVALID_HID;
+    hid_t ioc_fapl  = H5I_INVALID_HID;
+    H5FD_ioc_config_t ioc_config = {
+        0,
+    };
+    H5FD_subfiling_config_t subfiling_conf = {
+        0,
+    };
+
+    if (_basename == NULL || *_basename == '\0') {
+        TEST_ERROR;
+    }
+
+    ioc_fapl = H5Pcreate(H5P_FILE_ACCESS);
+    if (H5I_INVALID_HID == ioc_fapl) {
+        TEST_ERROR;
+    }
+
+    /* Prepare the subfiling fapl */
+    ret_value = H5Pcreate(H5P_FILE_ACCESS);
+    if (H5I_INVALID_HID == ret_value) {
+        TEST_ERROR;
+    }
+    /* Get subfiling VFD defaults */
+    if (H5Pget_fapl_subfiling(ret_value, &subfiling_conf) == FAIL) {
+        TEST_ERROR;
+    }
+    if (subfiling_conf.require_ioc) {
+        /* Get IOC VFD defaults */
+        if (H5Pget_fapl_ioc(ioc_fapl, &ioc_config) == FAIL) {
+            TEST_ERROR;
+        }
+        /* Now we can set the IOC fapl. */
+        if (H5Pset_fapl_ioc(ioc_fapl, &ioc_config) == FAIL) {
+            TEST_ERROR;
+        }
+    }
+    else {
+        if (H5Pset_fapl_sec2(ioc_fapl) == FAIL) {
+            TEST_ERROR;
+        }
+    }
+
+    /* Assign the IOC fapl as the underlying VFD */
+    subfiling_conf.common.ioc_fapl_id = ioc_fapl;
+
+    /* Fill the file paths for the current file create/open */
+    if (build_paths(_basename, &subfiling_conf, names) < 0) {
+        TEST_ERROR;
+    }
+
+    /* Now we can set the SUBFILING fapl before returning. */
+    if (H5Pset_fapl_subfiling(ret_value, &subfiling_conf) == FAIL) {
+        TEST_ERROR;
+    }
+
+    return ret_value;
+
+error:
+    if (H5I_INVALID_HID != ioc_fapl)
+        (void)H5Pclose(ioc_fapl);
+    if (H5I_INVALID_HID != ret_value)
+        (void)H5Pclose(ret_value);
+
+    return H5I_INVALID_HID;
+} /* end create_subfiling_ioc_fapl() */
+
+/* ---------------------------------------------------------------------------
+ * Function:   test_create_and_close
+ *
+ * Purpose:    Test/demonstrate a do-nothing file open and close.
+ *
+ *             Verifying file existence and contents is part of other tests.
+ *
+ * TODO: receive target IP from caller?
+ *
+ * Return:     Success: 0
+ *             Failure: -1
+ *
+ * Programmer: Jacob Smith
+ *             2019-12-17
+ * ---------------------------------------------------------------------------
+ */
+static int
+test_create_and_close(void)
+{
+    struct subfilingtest_filenames names;
+    hid_t file_id = H5I_INVALID_HID;
+    hid_t fapl_id = H5P_DEFAULT;
+    MPI_Info info = MPI_INFO_NULL;
+    MPI_Comm comm = MPI_COMM_WORLD;
+
+    TESTING("File creation and immediate close");
+
+    /* Create FAPL for [IO Concentrator|subfiling]
+     */
+    fapl_id = create_subfiling_ioc_fapl("basic_create", &names);
+    if (H5I_INVALID_HID == fapl_id) {
+        TEST_ERROR;
+    }
+
+    /* set the MPI communicator and info in the FAPL */
+    if (H5Pset_mpi_params(fapl_id, comm, info) < 0)
+        TEST_ERROR;
+
+    /* -------------------- */
+    /* TEST: Create and Close */
+
+    file_id = H5Fcreate(names.rw, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id);
+    if (H5I_INVALID_HID == file_id) {
+        TEST_ERROR;
+    }
+
+    /* -------------------- */
+    /* Standard cleanup */
+
+    if (H5Fclose(file_id) == FAIL) {
+        TEST_ERROR;
+    }
+
+    if (fapl_id != H5P_DEFAULT && fapl_id >= 0) {
+        if (H5Pclose(fapl_id) == FAIL) {
+            TEST_ERROR;
+        }
+    }
+
+    PASSED();
+    return 0;
+
+error:
+    H5E_BEGIN_TRY
+    {
+        (void)H5Fclose(file_id);
+        (void)H5Pclose(fapl_id);
+    }
+    H5E_END_TRY;
+    return -1;
+} /* end test_create_and_close() */
+
+/* ----------------------------------------------------------------------------
+ * Function:   create_datasets
+ *
+ * Purpose:    Given a file ID and least and greatest dataset indices, create
+ *             populated chunked datasets in the target file from min_dset to
+ *             (and including) max_dset.
+ *             Uses #defined constants to determine chunk and dataset sizes
+ *             and values.
+ *
+ * Return:     SUCCEED/FAIL
+ *
+ * Programmer: Jacob Smith
+ *             2019-08-14
+ * ----------------------------------------------------------------------------
+ */
+static herr_t
+create_datasets(hid_t file_id, unsigned min_dset, unsigned max_dset)
+{
+    hid_t dataspace_ids[MAX_DSET_COUNT + 1];
+    hid_t dataset_ids[MAX_DSET_COUNT + 1];
+    hid_t filespace_ids[MAX_DSET_COUNT + 1];
+    int data_chunk[CHUNK_DIM][CHUNK_DIM];
+    unsigned int i, j, k, l, m;
+    hsize_t offset[2];
+    hid_t memspace_id = H5I_INVALID_HID;
+    hsize_t a_size[2] = {CHUNK_DIM, CHUNK_DIM};
+    hsize_t chunk_dims[2] = {CHUNK_DIM, CHUNK_DIM};
+    hsize_t dset_dims[2] = {DSET_DIM, DSET_DIM};
+
+    HDassert(file_id >= 0);
+    HDassert(min_dset <= max_dset);
+    HDassert(max_dset <= MAX_DSET_COUNT);
+
+    LOGPRINT(2, "create_datasets()\n");
+
+    /* ---------------------------------
+     * "Clear" ID arrays
+     */
+
+    for (i = 0; i < MAX_DSET_COUNT; i++) {
+        LOGPRINT(3, "clearing IDs [%d]\n", i);
+        dataspace_ids[i] = H5I_INVALID_HID;
+        dataset_ids[i]   = H5I_INVALID_HID;
+        filespace_ids[i] = H5I_INVALID_HID;
+    }
+
+    /* ---------------------------------
+     * Generate dataspace, dataset, and 'filespace' IDs
+     */
+
+    if (_create_chunking_ids(file_id, min_dset, max_dset, chunk_dims, dset_dims, dataspace_ids, filespace_ids,
+                             dataset_ids, &memspace_id) == FAIL) {
+        TEST_ERROR;
+    }
+
+    /* ---------------------------------
+     * Initialize (write) all datasets in a "round robin"...
+     * for a given chunk 'location', write chunk data to each dataset.
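+ * Each element's expected value encodes its (dataset, row, column)
+ * coordinates as (DSET_DIM * DSET_DIM * m) + (DSET_DIM * row) + col,
+ * which is the same expression _verify_datasets() checks on read-back.
+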
+ */ + + for (i = 0; i < DSET_DIM; i += CHUNK_DIM) { + LOGPRINT(3, "i: %d\n", i); + for (j = 0; j < DSET_DIM; j += CHUNK_DIM) { + LOGPRINT(3, " j: %d\n", j); + for (m = min_dset; m <= max_dset; m++) { + LOGPRINT(3, " m: %d\n", m); + for (k = 0; k < CHUNK_DIM; k++) { + for (l = 0; l < CHUNK_DIM; l++) { + data_chunk[k][l] = (int)((DSET_DIM * DSET_DIM * m) + (DSET_DIM * (i + k)) + j + l); + LOGPRINT(3, " data_chunk[%d][%d]: %d\n", k, l, data_chunk[k][l]); + } + } + + /* select on disk hyperslab */ + offset[0] = (hsize_t)i; + offset[1] = (hsize_t)j; + LOGPRINT(3, " H5Sselect_hyperslab()\n"); + if (H5Sselect_hyperslab(filespace_ids[m], H5S_SELECT_SET, offset, NULL, a_size, NULL) < 0) { + TEST_ERROR; + } + + LOGPRINT(3, " H5Dwrite()\n"); + if (H5Dwrite(dataset_ids[m], H5T_NATIVE_INT, memspace_id, filespace_ids[m], H5P_DEFAULT, + data_chunk) < 0) { + TEST_ERROR; + } + } + } + } + + /* --------------------------------- + * Read and verify data from datasets + */ + + if (_verify_datasets(min_dset, max_dset, filespace_ids, dataset_ids, memspace_id) == FAIL) { + TEST_ERROR; + } + + /* --------------------------------- + * Cleanup + */ + + if (_close_chunking_ids(min_dset, max_dset, dataspace_ids, filespace_ids, dataset_ids, &memspace_id) == + FAIL) { + TEST_ERROR; + } + + return SUCCEED; + +error: + (void)_close_chunking_ids(min_dset, max_dset, dataspace_ids, filespace_ids, dataset_ids, &memspace_id); + LOGPRINT(1, "create_datasets() FAILED\n"); + return FAIL; +} /* end create_datasets() */ + +/* ---------------------------------------------------------------------------- + * Function: _create_chunking_ids + * + * Purpose: Create new IDs to be used with the associated file. + * + * Return: SUCCEED/FAIL + * + * Programer: Jacob Smith + * 2019 + * ---------------------------------------------------------------------------- + */ +static herr_t +_create_chunking_ids(hid_t file_id, unsigned min_dset, unsigned max_dset, hsize_t *chunk_dims, + hsize_t *dset_dims, hid_t *dataspace_ids, hid_t *filespace_ids, hid_t *dataset_ids, + hid_t *memspace_id) +{ + char dset_name[DSET_NAME_LEN + 1]; + unsigned m = 0; + hid_t dcpl_id = H5I_INVALID_HID; + + LOGPRINT(2, "_create_chunking_ids()\n"); + + /* -------------------- + * Create chunking DCPL + */ + + dcpl_id = H5Pcreate(H5P_DATASET_CREATE); + if (dcpl_id < 0) { + TEST_ERROR; + } + if (H5Pset_chunk(dcpl_id, 2, chunk_dims) == FAIL) { + TEST_ERROR; + } + + /* -------------------- + * Create dataspace IDs + */ + + for (m = min_dset; m <= max_dset; m++) { + dataspace_ids[m] = H5Screate_simple(2, dset_dims, NULL); + if (dataspace_ids[m] < 0) { + HDsnprintf(mesg, MIRR_MESG_SIZE, "unable to create dataspace ID %d\n", m); + FAIL_PUTS_ERROR(mesg); + } + } + + /* -------------------- + * Create dataset IDs + */ + + for (m = min_dset; m <= max_dset; m++) { + if (HDsnprintf(dset_name, DSET_NAME_LEN, "/dset%03d", m) > DSET_NAME_LEN) { + HDsnprintf(mesg, MIRR_MESG_SIZE, "unable to compose dset name %d\n", m); + FAIL_PUTS_ERROR(mesg); + } + + dataset_ids[m] = + H5Dcreate(file_id, dset_name, H5T_STD_I32BE, dataspace_ids[m], H5P_DEFAULT, dcpl_id, H5P_DEFAULT); + if (dataset_ids[m] < 0) { + HDsnprintf(mesg, MIRR_MESG_SIZE, "unable to create dset ID %d\n", m); + FAIL_PUTS_ERROR(mesg); + } + } + + /* -------------------- + * Get file space IDs + */ + + for (m = min_dset; m <= max_dset; m++) { + filespace_ids[m] = H5Dget_space(dataset_ids[m]); + if (filespace_ids[m] < 0) { + HDsnprintf(mesg, MIRR_MESG_SIZE, "unable to create filespace ID %d\n", m); + FAIL_PUTS_ERROR(mesg); + } + } + + 
/* -------------------- + * Create mem space to be used to read and write chunks + */ + + *memspace_id = H5Screate_simple(2, chunk_dims, NULL); + if (*memspace_id < 0) { + TEST_ERROR; + } + + /* -------------------- + * Clean up the DCPL, even if there were errors before + */ + + if (dcpl_id != H5P_DEFAULT && dcpl_id != H5I_INVALID_HID) { + if (H5Pclose(dcpl_id) == FAIL) { + TEST_ERROR; + } + } + + return SUCCEED; + +error: + if (dcpl_id != H5P_DEFAULT && dcpl_id != H5I_INVALID_HID) { + (void)H5Pclose(dcpl_id); + } + LOGPRINT(1, "_create_chunking_ids() FAILED\n"); + return FAIL; +} /* end _create_chunking_ids() */ + +/* ---------------------------------------------------------------------------- + * Function: _open_chunking_ids + * + * Purpose: Open/access IDs from the given file. + * + * Return: SUCCEED/FAIL + * + * Programmer: Jacob Smith + * 2019 + * ---------------------------------------------------------------------------- + */ +static herr_t +_open_chunking_ids(hid_t file_id, unsigned min_dset, unsigned max_dset, hsize_t *chunk_dims, + hid_t *filespace_ids, hid_t *dataset_ids, hid_t *memspace_id) +{ + char dset_name[DSET_NAME_LEN + 1]; + unsigned m = 0; + + LOGPRINT(2, "_open_chunking_ids()\n"); + + /* -------------------- + * Open dataset IDs + */ + + for (m = min_dset; m <= max_dset; m++) { + if (HDsnprintf(dset_name, DSET_NAME_LEN, "/dset%03d", m) > DSET_NAME_LEN) { + HDsnprintf(mesg, MIRR_MESG_SIZE, "unable to compose dset name %d\n", m); + FAIL_PUTS_ERROR(mesg); + } + + dataset_ids[m] = H5Dopen2(file_id, dset_name, H5P_DEFAULT); + if (dataset_ids[m] < 0) { + HDsnprintf(mesg, MIRR_MESG_SIZE, "unable to open dset ID %d\n", m); + FAIL_PUTS_ERROR(mesg); + } + } + + /* -------------------- + * Open filespace IDs + */ + + for (m = min_dset; m <= max_dset; m++) { + filespace_ids[m] = H5Dget_space(dataset_ids[m]); + if (filespace_ids[m] < 0) { + HDsnprintf(mesg, MIRR_MESG_SIZE, "unable to get filespace ID %d\n", m); + FAIL_PUTS_ERROR(mesg); + } + } + + /* -------------------- + * Create mem space to be used to read and write chunks + */ + + *memspace_id = H5Screate_simple(2, chunk_dims, NULL); + if (*memspace_id < 0) { + TEST_ERROR; + } + + return SUCCEED; + +error: + LOGPRINT(1, "_open_chunking_ids() FAILED\n"); + return FAIL; +} /* end _open_chunking_ids() */ + +/* --------------------------------------------------------------------------- + * Function: _close_chunking_ids + * + * Purpose: Close IDs that were created or opened. + * Pass NULL into `dataspace_ids` when closing items opened with + * _open_chunking_ids(). 
(as opposed to created IDs) + * + * Return: SUCCEED/FAIL + * + * Programmer: Jacob Smith + * 2019 + * --------------------------------------------------------------------------- + */ +static herr_t +_close_chunking_ids(unsigned min_dset, unsigned max_dset, hid_t *dataspace_ids, hid_t *filespace_ids, + hid_t *dataset_ids, hid_t *memspace_id) +{ + unsigned m; + + LOGPRINT(2, "_close_chunking_ids()\n"); + + for (m = min_dset; m <= max_dset; m++) { + LOGPRINT(3, "closing ids[%d]\n", m); + if (dataspace_ids) { + if (H5Sclose(dataspace_ids[m]) < 0) { + HDsnprintf(mesg, MIRR_MESG_SIZE, "unable to close dataspace_id[%d]\n", m); + FAIL_PUTS_ERROR(mesg); + } + } + if (H5Dclose(dataset_ids[m]) < 0) { + HDsnprintf(mesg, MIRR_MESG_SIZE, "unable to close dataset_id[%d]\n", m); + FAIL_PUTS_ERROR(mesg); + } + if (H5Sclose(filespace_ids[m]) < 0) { + HDsnprintf(mesg, MIRR_MESG_SIZE, "unable to close filespace_id[%d]\n", m); + FAIL_PUTS_ERROR(mesg); + } + } + + if ((*memspace_id != H5I_INVALID_HID) && (H5Sclose(*memspace_id) < 0)) { + TEST_ERROR; + } + + return SUCCEED; + +error: + LOGPRINT(1, "_close_chunking_ids() FAILED\n"); + return FAIL; +} /* end _close_chunking_ids() */ + +/* --------------------------------------------------------------------------- + * Function: _verify_datasets + * + * Purpose: Check that each chunk's contents are as expected, as pertaining + * to create_datasets(). + * + * Return: SUCCEED/FAIL + * + * Programmer: Jacob Smith + * 2019 + * --------------------------------------------------------------------------- + */ +static herr_t +_verify_datasets(unsigned min_dset, unsigned max_dset, hid_t *filespace_ids, hid_t *dataset_ids, + hid_t memspace_id) +{ + unsigned i, j, k, l, m; + int data_chunk[CHUNK_DIM][CHUNK_DIM]; + hsize_t offset[2]; + hsize_t a_size[2] = {CHUNK_DIM, CHUNK_DIM}; + + LOGPRINT(2, "_verify_datasets()\n"); + + for (i = 0; i < DSET_DIM; i += CHUNK_DIM) { + LOGPRINT(3, "i: %d\n", i); + for (j = 0; j < DSET_DIM; j += CHUNK_DIM) { + LOGPRINT(3, " j: %d\n", j); + for (m = min_dset; m <= max_dset; m++) { + LOGPRINT(3, " m: %d\n", m); + + /* select on disk hyperslab */ + offset[0] = (hsize_t)i; + offset[1] = (hsize_t)j; + if (H5Sselect_hyperslab(filespace_ids[m], H5S_SELECT_SET, offset, NULL, a_size, NULL) < 0) { + TEST_ERROR; + } + + if (H5Dread(dataset_ids[m], H5T_NATIVE_INT, memspace_id, filespace_ids[m], H5P_DEFAULT, + data_chunk) < 0) { + HDsnprintf(mesg, MIRR_MESG_SIZE, " H5Dread() [%d][%d][%d]\n", i, j, m); + FAIL_PUTS_ERROR(mesg); + } + + for (k = 0; k < CHUNK_DIM; k++) { + for (l = 0; l < CHUNK_DIM; l++) { + if ((unsigned)data_chunk[k][l] != + ((DSET_DIM * DSET_DIM * m) + (DSET_DIM * (i + k)) + j + l)) { + HDsnprintf(mesg, MIRR_MESG_SIZE, " MISMATCH [%d][%d][%d][%d][%d]\n", i, j, m, + k, l); + FAIL_PUTS_ERROR(mesg); + } + } + } + } + } + } + + return SUCCEED; + +error: + LOGPRINT(1, "_verify_datasets() FAILED\n"); + return FAIL; +} /* end _verify_datasets() */ + +/* --------------------------------------------------------------------------- + * Function: verify_datasets + * + * Purpose: Inspect the datasets in the file created by create_datasets(). + * Wrapper for _verify_datasets() -- this function sets up and + * tears down accessor information. 
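+ *
+ * Roughly: open IDs with _open_chunking_ids(), check the data with
+ * _verify_datasets(), then close with _close_chunking_ids(), passing
+ * NULL for `dataspace_ids` since none were created here.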
+ *
+ * Return:     SUCCEED/FAIL
+ *
+ * Programmer: Jacob Smith
+ *             2019
+ * ---------------------------------------------------------------------------
+ */
+static herr_t
+verify_datasets(hid_t file_id, unsigned min_dset, unsigned max_dset)
+{
+    hid_t    dataset_ids[MAX_DSET_COUNT + 1];
+    hid_t    filespace_ids[MAX_DSET_COUNT + 1];
+    unsigned i;
+    hid_t    memspace_id   = H5I_INVALID_HID;
+    hsize_t  chunk_dims[2] = {CHUNK_DIM, CHUNK_DIM};
+
+    HDassert(file_id >= 0);
+    HDassert(min_dset <= max_dset);
+    HDassert(max_dset <= MAX_DSET_COUNT);
+
+    LOGPRINT(2, "verify_datasets()\n");
+
+    /* ---------------------------------
+     * "Clear" ID arrays
+     */
+
+    for (i = 0; i < MAX_DSET_COUNT; i++) {
+        LOGPRINT(3, "clearing IDs [%d]\n", i);
+        dataset_ids[i]   = H5I_INVALID_HID;
+        filespace_ids[i] = H5I_INVALID_HID;
+    }
+
+    /* ---------------------------------
+     * Open dataset and 'filespace' IDs
+     */
+
+    if (_open_chunking_ids(file_id, min_dset, max_dset, chunk_dims, filespace_ids, dataset_ids,
+                           &memspace_id) == FAIL) {
+        TEST_ERROR;
+    }
+
+    /* ---------------------------------
+     * Read and verify data from datasets
+     */
+
+    if (_verify_datasets(min_dset, max_dset, filespace_ids, dataset_ids, memspace_id) == FAIL) {
+        TEST_ERROR;
+    }
+
+    /* ---------------------------------
+     * Cleanup
+     */
+
+    if (_close_chunking_ids(min_dset, max_dset, NULL, filespace_ids, dataset_ids, &memspace_id) == FAIL) {
+        TEST_ERROR;
+    }
+
+    return SUCCEED;
+
+error:
+    LOGPRINT(1, "verify_datasets() FAILED\n");
+    (void)_close_chunking_ids(min_dset, max_dset, NULL, filespace_ids, dataset_ids, &memspace_id);
+    return FAIL;
+
+} /* end verify_datasets() */
+
+/* ---------------------------------------------------------------------------
+ * Function:   test_basic_dataset_write
+ *
+ * Purpose:    Create and close files; reopen files and write a dataset,
+ *             close; compare files.
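+ *
+ * Each rank writes one hyperslab of block[0] = BIG_DATABUFFER_SIZE /
+ * mpi_size rows spanning all columns, starting at row mpi_rank *
+ * block[0] -- see the hyperslab setup below.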
+ *
+ * Return:     Success: 0
+ *             Failure: -1
+ *
+ * Programmer: Jacob Smith
+ *             2019
+ *             Richard Warren - modified the original for subfiling testing
+ *             2021
+ * ---------------------------------------------------------------------------
+ */
+static int
+test_basic_dataset_write(void)
+{
+    struct subfilingtest_filenames names;
+    off_t f1size = 0; /* size of the files */
+    int   f1_fid = -1;
+    int   f1int  = 0;
+    hid_t file_id   = H5I_INVALID_HID;
+    hid_t fapl_id   = H5P_DEFAULT;
+    hid_t dset_id   = H5I_INVALID_HID;
+    hid_t dspace_id = H5I_INVALID_HID;
+    hid_t dtype_id  = H5T_NATIVE_INT;
+    MPI_Info info = MPI_INFO_NULL;
+    MPI_Comm comm = MPI_COMM_WORLD;
+
+    hid_t file_dataspace = H5I_INVALID_HID; /* File dataspace ID */
+    hid_t mem_dataspace  = H5I_INVALID_HID; /* memory dataspace ID */
+
+    hsize_t block[2], dims[2], stride[2];
+    hsize_t count[2] = {1, 1};
+    hsize_t start[2] = {0, 0};
+
+    int *check    = NULL;
+    int *buf      = NULL;
+    int *data_ptr = NULL;
+
+    int buf_size  = BIG_DATABUFFER_SIZE;
+    int i         = 0;
+    int j         = 0;
+    int k         = 0;
+    int ret_value = 0; /* for error handling */
+
+    dims[0]  = BIG_DATABUFFER_SIZE;
+    dims[1]  = (hsize_t)g_mpi_size;
+    block[0] = dims[0] / g_mpi_size;
+    block[1] = dims[1];
+
+    stride[0] = block[0];
+    stride[1] = block[1];
+
+    start[0] = (hsize_t)(g_mpi_rank * block[0]);
+    start[1] = 0;
+
+    buf_size *= block[0];
+
+    TESTING("Subfiling open and dataset writing");
+
+    /* Create FAPL for Ioc[sec2|subfiling]
+     */
+    fapl_id = create_subfiling_ioc_fapl("basic_write", &names);
+    if (H5I_INVALID_HID == fapl_id) {
+        TEST_ERROR;
+    }
+
+    /* set the MPI communicator and info in the FAPL */
+    if (H5Pset_mpi_params(fapl_id, comm, info) < 0)
+        TEST_ERROR;
+
+    /* Prepare data to be written
+     */
+    check = (int *)HDmalloc(dims[0] * sizeof(int));
+    if (NULL == check) {
+        TEST_ERROR;
+    }
+
+    buf = (int *)HDmalloc(dims[0] * sizeof(int));
+    if (NULL == buf) {
+        TEST_ERROR;
+    }
+    data_ptr = buf;
+
+    for (i = 0; i < block[0]; i++) {
+        for (j = 0; j < block[1]; j++) {
+            *data_ptr = (int)((i + start[0]) * 100 + (j + start[1] + 1));
+            data_ptr++;
+        }
+    }
+
+    /* -------------------- */
+    /* TEST: Create and Close */
+
+    file_id = H5Fcreate(names.rw, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id);
+    if (H5I_INVALID_HID == file_id) {
+        TEST_ERROR;
+    }
+    if (H5Fclose(file_id) == FAIL) {
+        TEST_ERROR;
+    }
+    file_id = H5I_INVALID_HID;
+
+    /* -------------------- */
+    /* TEST: Reopen and Write */
+
+    file_id = H5Fopen(names.rw, H5F_ACC_RDWR, fapl_id);
+    if (H5I_INVALID_HID == file_id) {
+        TEST_ERROR;
+    }
+
+    dspace_id = H5Screate_simple(2, dims, NULL);
+    if (H5I_INVALID_HID == dspace_id) {
+        TEST_ERROR;
+    }
+
+    dset_id = H5Dcreate2(file_id, "dataset", dtype_id, dspace_id, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
+    if (H5I_INVALID_HID == dset_id) {
+        TEST_ERROR;
+    }
+
+    file_dataspace = H5Dget_space(dset_id);
+    if (file_dataspace < 0) {
+        TEST_ERROR;
+    }
+    if (H5Sselect_hyperslab(file_dataspace, H5S_SELECT_SET, start, stride, count, block) == FAIL) {
+        TEST_ERROR;
+    }
+
+    /* create a memory dataspace independently */
+    mem_dataspace = H5Screate_simple(2, block, NULL);
+    if (mem_dataspace < 0) {
+        TEST_ERROR;
+    }
+
+    if (H5Dwrite(dset_id, dtype_id, mem_dataspace, file_dataspace, H5P_DEFAULT, buf) == FAIL) {
+        TEST_ERROR;
+    }
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    if (H5Dread(dset_id, dtype_id, mem_dataspace, file_dataspace, H5P_DEFAULT, check) < 0) {
+        TEST_ERROR;
+    }
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    if (H5Dclose(dset_id) == FAIL) {
+        TEST_ERROR;
+    }
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    if (H5Sclose(dspace_id) == FAIL) {
+        TEST_ERROR;
+    }
+
+    /* close the selection dataspaces as well, so the IDs do not leak */
+    if (H5Sclose(file_dataspace) == FAIL) {
+        TEST_ERROR;
+    }
+    if (H5Sclose(mem_dataspace) == FAIL) {
+        TEST_ERROR;
+    }
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    if (H5Fclose(file_id) == FAIL) {
+        TEST_ERROR;
+    }
+
+#if 1
+    for (i = 0; i <
buf_size; i++) {
+        if (buf[i] != check[i]) {
+            printf("[%d] %d: buf = %d, expected = %d\n", g_mpi_rank, i, buf[i], check[i]);
+            break;
+            // TEST_ERROR;
+        }
+    }
+#endif
+    /* -------------------- */
+    /* Standard cleanup */
+    /* -------------------- */
+
+    HDfree(buf);
+    buf = NULL;
+    HDfree(check);
+    check = NULL;
+
+    PASSED();
+    return 0;
+
+error:
+    H5E_BEGIN_TRY
+    {
+        (void)H5Fclose(file_id);
+        if (buf) {
+            HDfree(buf);
+        }
+        if (check) {
+            HDfree(check);
+        }
+        (void)H5Dclose(dset_id);
+        (void)H5Sclose(dspace_id);
+        (void)H5Sclose(file_dataspace);
+        (void)H5Sclose(mem_dataspace);
+        if (fapl_id != H5P_DEFAULT && fapl_id > 0) {
+            (void)H5Pclose(fapl_id);
+        }
+    }
+    H5E_END_TRY;
+    return -1;
+} /* end test_basic_dataset_write() */
+
+#if 0 /* JRM */
+
+/* ---------------------------------------------------------------------------
+ * Function:   test_chunked_dataset_write
+ *
+ * Purpose:    Create and close files; reopen files and write a dataset,
+ *             close; compare files.
+ *
+ * TODO: receive target IP from caller?
+ *
+ * Return:     Success: 0
+ *             Failure: -1
+ *
+ * Programmer: Jacob Smith
+ *             2019
+ * ---------------------------------------------------------------------------
+ */
+static int
+test_chunked_dataset_write(void)
+{
+    struct subfilingtest_filenames names;
+    hid_t file_id = H5I_INVALID_HID;
+    hid_t fapl_id = H5P_DEFAULT;
+
+    TESTING("Subfiling open and dataset writing (chunked)");
+
+    /* Create FAPL for Ioc[sec2|subfiling]
+     */
+    fapl_id = create_subfiling_ioc_fapl("chunked_write", &names);
+    if (H5I_INVALID_HID == fapl_id) {
+        TEST_ERROR;
+    }
+
+    /* -------------------- */
+    /* TEST: Create and Close */
+
+    file_id = H5Fcreate(names.rw, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id);
+    if (H5I_INVALID_HID == file_id) {
+        TEST_ERROR;
+    }
+    if (H5Fclose(file_id) == FAIL) {
+        TEST_ERROR;
+    }
+    file_id = H5I_INVALID_HID;
+
+    /* -------------------- */
+    /* TEST: Reopen and Write */
+
+    file_id = H5Fopen(names.rw, H5F_ACC_RDWR, fapl_id);
+    if (H5I_INVALID_HID == file_id) {
+        TEST_ERROR;
+    }
+
+    /* Write datasets to file
+     */
+    if (create_datasets(file_id, 0, MAX_DSET_COUNT) == FAIL) {
+        TEST_ERROR;
+    }
+
+    /* Close to 'flush to disk', and reopen file
+     */
+    if (H5Fclose(file_id) == FAIL) {
+        TEST_ERROR;
+    }
+    file_id = H5I_INVALID_HID;
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    /* Reopen file
+     */
+    file_id = H5Fopen(names.rw, H5F_ACC_RDWR, fapl_id);
+    if (H5I_INVALID_HID == file_id) {
+        TEST_ERROR;
+    }
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    /* Verify written data integrity
+     */
+    if (verify_datasets(file_id, 0, MAX_DSET_COUNT) == FAIL) {
+        TEST_ERROR;
+    }
+
+    /* -------------------- */
+    /* Standard cleanup */
+
+    if (H5Fclose(file_id) == FAIL) {
+        TEST_ERROR;
+    }
+    file_id = H5I_INVALID_HID;
+    if (fapl_id != H5P_DEFAULT && fapl_id > 0) {
+        if (H5Pclose(fapl_id) == FAIL) {
+            TEST_ERROR;
+        }
+        fapl_id = H5I_INVALID_HID;
+    }
+
+    PASSED();
+    return 0;
+
+error:
+    H5E_BEGIN_TRY {
+        (void)H5Fclose(file_id);
+        if (fapl_id != H5P_DEFAULT && fapl_id > 0) {
+            (void)H5Pclose(fapl_id);
+        }
+    } H5E_END_TRY;
+    return -1;
+} /* end test_chunked_dataset_write() */
+#endif /* JRM */
+#if 0 /* JRM */
+
+/* ---------------------------------------------------------------------------
+ * Function:   test_on_disk_zoo
+ *
+ * Purpose:    Verify that the subfiling VFD can handle the passing of all the
+ *             various on-disk data structures over the wire, as implemented
+ *             in genall5.c:create_zoo().
+ *
+ * TODO: receive target IP from caller?
+ *
+ * Return:     Success: 0
+ *             Failure: -1
+ *
+ * Programmer: Jacob Smith
+ *             2019
+ * ---------------------------------------------------------------------------
+ */
+static int
+test_on_disk_zoo(void)
+{
+    const char grp_name[] = "/only";
+    struct subfilingtest_filenames names;
+    hid_t file_id = H5I_INVALID_HID;
+    hid_t grp_id  = H5I_INVALID_HID;
+    hid_t fapl_id = H5P_DEFAULT;
+
+    TESTING("'Zoo' of on-disk structures");
+
+    /* Create FAPL for Ioc[sec2|subfiling]
+     */
+    fapl_id = create_subfiling_ioc_fapl("zoo", &names);
+    if (H5I_INVALID_HID == fapl_id) {
+        TEST_ERROR;
+    }
+
+    /* -------------------- */
+    /* TEST: Create file */
+    file_id = H5Fcreate(names.rw, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id);
+    if (H5I_INVALID_HID == file_id) {
+        TEST_ERROR;
+    }
+
+    grp_id = H5Gcreate2(file_id, grp_name, H5P_DEFAULT, H5P_DEFAULT,
+                        H5P_DEFAULT);
+    if (grp_id == H5I_INVALID_HID) {
+        TEST_ERROR;
+    }
+
+    /* Create datasets in file, close (flush) and reopen, validate.
+     * Use of ( pass ) is a conceit required for using create_ and
+     * validate_zoo() from cache_common and/or genall5.
+     */
+
+    if ( pass ) {
+        create_zoo(file_id, grp_name, 0);
+    }
+    if ( pass ) {
+        if (H5Fclose(file_id) == FAIL) {
+            TEST_ERROR;
+        }
+        file_id = H5Fopen(names.rw, H5F_ACC_RDWR, fapl_id);
+        if (H5I_INVALID_HID == file_id) {
+            TEST_ERROR;
+        }
+    }
+    if ( pass ) {
+        validate_zoo(file_id, grp_name, 0); /* sanity-check */
+    }
+    if ( !pass ) {
+        HDprintf("%s", failure_mssg);
+        TEST_ERROR;
+    }
+
+    /* -------------------- */
+    /* Standard cleanup */
+
+    if (fapl_id != H5P_DEFAULT && fapl_id >= 0) {
+        if (H5Pclose(fapl_id) == FAIL) {
+            TEST_ERROR;
+        }
+    }
+    if (H5Gclose(grp_id) == FAIL) {
+        TEST_ERROR;
+    }
+    if (H5Fclose(file_id) == FAIL) {
+        TEST_ERROR;
+    }
+
+    /* -------------------- */
+    /* TEST: Verify that the R/W and W/O files are identical */
+
+    if (h5_compare_file_bytes(names.rw, names.wo) < 0) {
+        TEST_ERROR;
+    }
+
+    PASSED();
+    return 0;
+
+error:
+    H5E_BEGIN_TRY {
+        (void)H5Fclose(file_id);
+        (void)H5Gclose(grp_id);
+        if (fapl_id != H5P_DEFAULT && fapl_id > 0) {
+            (void)H5Pclose(fapl_id);
+        }
+    } H5E_END_TRY;
+    return -1;
+} /* end test_on_disk_zoo() */
+#endif /* JRM */
+#if 0 /* JRM */
+
+/* ---------------------------------------------------------------------------
+ * Function:   test_vanishing_datasets
+ *
+ * Purpose:    Verify behavior when writing to a file where data is deleted.
+ *
+ *             Each dataset is populated with the value of its suffix
+ *             (dset5 is all fives).
+ *
+ *             Passes 0..15 each create one new dataset, '/dset[i]'.
+ *             Passes 3..18 each delete '/dset[i-3]'.
+ *
+ *             Should end with no data in file.
+ *
+ * Return:     Success: 0
+ *             Failure: -1
+ *
+ * Programmer: Jacob Smith
+ *             2019
+ * ---------------------------------------------------------------------------
+ */
+static int
+test_vanishing_datasets(void)
+{
+    struct subfilingtest_filenames names;
+    hid_t    file_id   = H5I_INVALID_HID;
+    hid_t    fapl_id   = H5I_INVALID_HID;
+    hid_t    dset_id   = H5I_INVALID_HID;
+    hid_t    dspace_id = H5I_INVALID_HID;
+    hid_t    subfiling_fapl_id = H5I_INVALID_HID;
+    hsize_t  dims[2] = {DATABUFFER_SIZE, DATABUFFER_SIZE};
+    uint32_t buf[DATABUFFER_SIZE][DATABUFFER_SIZE]; /* consider malloc? 
*/ + H5G_info_t group_info; + unsigned int i, j, k; + const unsigned int max_loops = 20; + const unsigned int max_at_one_time = 3; + + TESTING("Vanishing Datasets"); + + /* -------------------- */ + /* Set up recurrent data (FAPL, dataspace) */ + + /* Create FAPL for Ioc[sec2|subfiling] + */ + fapl_id = create_subfiling_ioc_fapl("vanishing", &names); + if (H5I_INVALID_HID == fapl_id) { + TEST_ERROR; + } + + dspace_id = H5Screate_simple(2, dims, NULL); + if (dspace_id < 0) { + TEST_ERROR; + } + + /* create file */ + file_id = H5Fcreate(names.rw, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id); + if (H5I_INVALID_HID == file_id) { + TEST_ERROR; + } + + for (i=0; i < max_loops; i++) { + char namebuf[DSET_NAME_LEN + 1]; + + /* deleting datasets */ + if (i >= max_at_one_time) { + if (HDsnprintf(namebuf, DSET_NAME_LEN, "/dset%02d", + (i - max_at_one_time) ) + > DSET_NAME_LEN) + { + TEST_ERROR; + } + if (H5Ldelete(file_id, namebuf, H5P_DEFAULT) < 0) { + TEST_ERROR; + } + } /* end if deleting a dataset */ + + /* writing datasets */ + if (i < (max_loops - max_at_one_time)) { + if (HDsnprintf(namebuf, DSET_NAME_LEN, "/dset%02d", i) + > DSET_NAME_LEN) + { + TEST_ERROR; + } + dset_id = H5Dcreate2(file_id, namebuf, H5T_STD_U32LE, dspace_id, + H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + if (H5I_INVALID_HID == dset_id) { + TEST_ERROR; + } + + for (j=0; j < DATABUFFER_SIZE; j++) { + for (k=0; k < DATABUFFER_SIZE; k++) { + buf[j][k] = (uint32_t)i; + } + } + + if (H5Dwrite(dset_id, H5T_STD_U32LE, H5S_ALL, H5S_ALL, H5P_DEFAULT, + buf) + < 0) + { + TEST_ERROR; + } + + if (H5Dclose(dset_id) < 0) { + TEST_ERROR; + } + dset_id = H5I_INVALID_HID; + } /* end if writing a dataset */ + + } /* end for dataset create-destroy cycles */ + + if (H5Fclose(file_id) < 0) { + TEST_ERROR; + } + file_id = H5I_INVALID_HID; + + /* verify there are no datasets in file */ + file_id = H5Fopen(names.rw, H5F_ACC_RDONLY, H5P_DEFAULT); + if (file_id < 0) { + TEST_ERROR; + } + if (H5Gget_info(file_id, &group_info) < 0) { + TEST_ERROR; + } + if (group_info.nlinks > 0) { + HDfprintf(stderr, "links in rw file: %d\n", group_info.nlinks); + HDfflush(stderr); + TEST_ERROR; + } + if (H5Fclose(file_id) < 0) { + TEST_ERROR; + } + file_id = H5Fopen(names.wo, H5F_ACC_RDONLY, H5P_DEFAULT); + if (file_id < 0) { + TEST_ERROR; + } + if (H5Gget_info(file_id, &group_info) < 0) { + TEST_ERROR; + } + if (group_info.nlinks > 0) { + HDfprintf(stderr, "links in wo file: %d\n", group_info.nlinks); + HDfflush(stderr); + TEST_ERROR; + } + if (H5Fclose(file_id) < 0) { + TEST_ERROR; + } + file_id = H5I_INVALID_HID; + + if (h5_compare_file_bytes(names.rw, names.wo) < 0) + TEST_ERROR; + + /* -------------------- */ + /* Teardown */ + + if (H5Sclose(dspace_id) < 0) { + TEST_ERROR; + } + if (H5Pclose(fapl_id) < 0) { + TEST_ERROR; + } + + PASSED(); + return 0; + +error: + H5E_BEGIN_TRY { + if (subfiling_fapl_id != H5I_INVALID_HID) { + H5Pclose(subfiling_fapl_id); + } + if (fapl_id != H5I_INVALID_HID) { + H5Pclose(fapl_id); + } + if (file_id != H5I_INVALID_HID) { + H5Fclose(file_id); + } + if (dset_id != H5I_INVALID_HID) { + H5Dclose(dset_id); + } + if (dspace_id != H5I_INVALID_HID) { + H5Sclose(dspace_id); + } + } H5E_END_TRY; + return -1; +} /* test_vanishing_datasets() */ +#endif /* JRM */ +#if 0 /* JRM */ + +/* --------------------------------------------------------------------------- + * Function: test_concurrent_access + * + * Purpose: Verify that more than one file may be opened at a time. + * + * TODO: receive target IP from caller? 
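+ *
+ * Sketch of the flow: CONCURRENT_COUNT files are created up front,
+ * each with its own FAPL and dataset; the same buffer is then written
+ * to every open dataset before any of the files are closed.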
+ * + * Return: Success: 0 + * Failure: -1 + * + * Programmer: Jacob Smith + * 2020-03-09 + * --------------------------------------------------------------------------- + */ +static int +test_concurrent_access(void) +{ + struct file_bundle { + struct subfilingtest_filenames names; + hid_t dset_id; + hid_t fapl_id; + hid_t file_id; + } bundle[CONCURRENT_COUNT]; + hid_t dspace_id = H5I_INVALID_HID; + hid_t dtype_id = H5T_NATIVE_INT; + hsize_t dims[2] = { DATABUFFER_SIZE, DATABUFFER_SIZE }; + int *buf = NULL; + int i = 0; + int j = 0; + + TESTING("Concurrent opened subfilinged files"); + + /* blank bundle */ + for (i = 0; i < CONCURRENT_COUNT; i++) { + bundle[i].dset_id = H5I_INVALID_HID; + bundle[i].fapl_id = H5I_INVALID_HID; + bundle[i].file_id = H5I_INVALID_HID; + *bundle[i].names.rw = '\0'; + *bundle[i].names.wo = '\0'; + *bundle[i].names.log = '\0'; + } + + /* Create FAPL for Ioc[sec2|subfiling] + */ + for (i = 0; i < CONCURRENT_COUNT; i++) { + char _name[16] = ""; + hid_t _fapl_id = H5I_INVALID_HID; + HDsnprintf(_name, 15, "concurrent%d", i); + _fapl_id = create_subfiling_ioc_fapl(_name, &bundle[i].names); + if (H5I_INVALID_HID == _fapl_id) { + TEST_ERROR; + } + bundle[i].fapl_id = _fapl_id; + } + + /* Prepare data to be written + */ + buf = (int *)HDmalloc(DATABUFFER_SIZE * DATABUFFER_SIZE * sizeof(int)); + if (NULL == buf) { + TEST_ERROR; + } + for (i = 0; i < DATABUFFER_SIZE; i++) { + for (j = 0; j < DATABUFFER_SIZE; j++) { + int k = i * DATABUFFER_SIZE + j; + buf[k] = k; + } + } + + /* Prepare generic dataspace + */ + dspace_id = H5Screate_simple(2, dims, NULL); + if (H5I_INVALID_HID == dspace_id) { + TEST_ERROR; + } + + /* -------------------- */ + /* TEST: Create file and open elements */ + + for (i = 0; i < CONCURRENT_COUNT; i++) { + hid_t _file_id = H5I_INVALID_HID; + hid_t _dset_id = H5I_INVALID_HID; + + _file_id = H5Fcreate(bundle[i].names.rw, H5F_ACC_TRUNC, H5P_DEFAULT, + bundle[i].fapl_id); + if (H5I_INVALID_HID == _file_id) { + TEST_ERROR; + } + + bundle[i].file_id = _file_id; + + _dset_id = H5Dcreate2(_file_id, "dataset", dtype_id, dspace_id, + H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); + if (H5I_INVALID_HID == _dset_id) { + TEST_ERROR; + } + bundle[i].dset_id = _dset_id; + } + + /* -------------------- */ + /* TEST: Write to files */ + + for (i = 0; i < CONCURRENT_COUNT; i++) { + if (H5Dwrite(bundle[i].dset_id, dtype_id, H5S_ALL, H5S_ALL, + H5P_DEFAULT, buf) + == FAIL) + { + TEST_ERROR; + } + } + + /* -------------------- */ + /* TEST: Close elements */ + + for (i = 0; i < CONCURRENT_COUNT; i++) { + if (H5Dclose(bundle[i].dset_id) == FAIL) { + TEST_ERROR; + } + if (H5Fclose(bundle[i].file_id) == FAIL) { + TEST_ERROR; + } + if (H5Pclose(bundle[i].fapl_id) == FAIL) { + TEST_ERROR; + } + } + + /* -------------------- */ + /* Standard cleanup */ + + HDfree(buf); + buf = NULL; + if (H5Sclose(dspace_id) == FAIL) { + TEST_ERROR; + } + + /* -------------------- */ + /* TEST: Verify that the R/W and W/O files are identical */ + + for (i = 0; i < CONCURRENT_COUNT; i++) { + if (h5_compare_file_bytes(bundle[i].names.rw, bundle[i].names.wo) < 0) { + TEST_ERROR; + } + } + + PASSED(); + return 0; + +error: + H5E_BEGIN_TRY{ + if (buf) { + HDfree(buf); + } + (void)H5Sclose(dspace_id); + for (i = 0; i < CONCURRENT_COUNT; i++) { + (void)H5Dclose(bundle[i].dset_id); + (void)H5Fclose(bundle[i].file_id); + (void)H5Pclose(bundle[i].fapl_id); + } + } H5E_END_TRY; + return -1; +} /* end test_concurrent_access() */ +#endif /* JRM */ + +/* 
---------------------------------------------------------------------------
+ * Function:   main
+ *
+ * Purpose:    Run tests.
+ *
+ * Return:     Success: 0
+ *             Failure: 1
+ *
+ * Programmer: Jacob Smith
+ *             2019
+ * ---------------------------------------------------------------------------
+ */
+extern hbool_t H5_use_selection_io_g;
+
+int
+main(int argc, char **argv)
+{
+    int nerrors  = 0;
+    int required = MPI_THREAD_MULTIPLE;
+    int provided = 0;
+
+    MPI_Init_thread(&argc, &argv, required, &provided);
+    if (provided != required) {
+        HDprintf("MPI doesn't support MPI_Init_thread with MPI_THREAD_MULTIPLE\n");
+        return -1;
+    }
+    else {
+        MPI_Comm_rank(MPI_COMM_WORLD, &g_mpi_rank);
+        MPI_Comm_size(MPI_COMM_WORLD, &g_mpi_size);
+    }
+
+    h5_reset();
+
+    H5_use_selection_io_g = TRUE;
+
+    g_log_stream = stdout; /* default debug/logging output stream */
+
+    HDprintf("Testing Subfiling VFD functionality.\n");
+
+    /* -------------------- */
+    /* SETUP */
+
+    /* Create directories for test-generated .h5 files
+     */
+    if (nerrors == 0) {
+        if ((HDmkdir(SUBFILING_RW_DIR, (mode_t)0755) < 0) && (errno != EEXIST)) {
+            nerrors++;
+        }
+    }
+    if (nerrors == 0) {
+        if ((HDmkdir(SUBFILING_WO_DIR, (mode_t)0755) < 0) && (errno != EEXIST)) {
+            nerrors++;
+        }
+    }
+
+    /* -------------------- */
+    /* TESTS */
+    /* Tests return negative values; `-=' increments nerrors count */
+
+#if 0 /* JRM */
+    HDfprintf(stdout, "waiting for attach...\n");
+    sleep(60);
+#endif /* JRM */
+
+    if (nerrors == 0) {
+        nerrors -= test_fapl_configuration();
+
+#if 1 /* JRM */ /* skip remaining tests for now since they hang */
+        {
+            int mpi_rank;
+
+            MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
+
+            if (mpi_rank == 0) {
+                SKIPPED();
+                HDputs("    Skipping remaining tests due to hang -- remove this skip to reproduce ");
+            }
+        }
+        MPI_Finalize();
+        HDexit(EXIT_FAILURE);
+#endif /* JRM */ /* skip remaining tests for now since they hang */
+
+        nerrors -= test_create_and_close();
+        nerrors -= test_basic_dataset_write();
+#if 0
+        nerrors -= test_chunked_dataset_write();
+        nerrors -= test_on_disk_zoo();
+        nerrors -= test_vanishing_datasets();
+        nerrors -= test_concurrent_access();
+#endif
+    }
+
+    if (nerrors) {
+        HDprintf("***** %d Subfiling VFD TEST%s FAILED! *****\n", nerrors, nerrors > 1 ? "S" : "");
+        MPI_Finalize();
+        return EXIT_FAILURE;
+    }
+
+    HDprintf("All Subfiling Virtual File Driver tests passed.\n");
+
+    MPI_Finalize();
+    return EXIT_SUCCESS;
+} /* end main() */
+
+#else /* H5_HAVE_SUBFILING_VFD */
+
+int
+main(void)
+{
+    h5_reset();
+    HDprintf("Testing Subfiling VFD functionality.\n");
+    HDprintf("SKIPPED - Subfiling VFD not built.\n");
+    return EXIT_SUCCESS;
+}
+
+#endif /* H5_HAVE_SUBFILING_VFD */
diff --git a/testpar/t_vfd.c b/testpar/t_vfd.c
index 09a7103afbb..fe8683b303b 100644
--- a/testpar/t_vfd.c
+++ b/testpar/t_vfd.c
@@ -18,21 +18,32 @@
  */
 #include "testphdf5.h"
+#include "H5FDsubfiling.h"
+#include "H5FDioc.h"
 
 /* Must be a power of 2. 
Reducing it below 1024 may cause problems */
 #define INTS_PER_RANK 1024
 
 /* global variable declarations: */
-hbool_t pass = TRUE; /* set to FALSE on error */
-const char *failure_mssg = NULL;
-
-const char *FILENAMES[] = {"mpio_vfd_test_file_0", /*0*/
-                           "mpio_vfd_test_file_1", /*1*/
-                           "mpio_vfd_test_file_2", /*2*/
-                           "mpio_vfd_test_file_3", /*3*/
-                           "mpio_vfd_test_file_4", /*4*/
-                           "mpio_vfd_test_file_5", /*5*/
+hbool_t     pass               = TRUE; /* set to FALSE on error */
+hbool_t     disp_failure_mssgs = TRUE; /* global force display of failure messages */
+const char *failure_mssg       = NULL;
+
+const char *FILENAMES[] = {"mpio_vfd_test_file_0",      /*0*/
+                           "mpio_vfd_test_file_1",      /*1*/
+                           "mpio_vfd_test_file_2",      /*2*/
+                           "mpio_vfd_test_file_3",      /*3*/
+                           "mpio_vfd_test_file_4",      /*4*/
+                           "mpio_vfd_test_file_5",      /*5*/
+                           "mpio_vfd_test_file_6",      /*6*/
+                           "subfiling_vfd_test_file_0", /*7*/
+                           "subfiling_vfd_test_file_1", /*8*/
+                           "subfiling_vfd_test_file_2", /*9*/
+                           "subfiling_vfd_test_file_3", /*10*/
+                           "subfiling_vfd_test_file_4", /*11*/
+                           "subfiling_vfd_test_file_5", /*12*/
+                           "subfiling_vfd_test_file_6", /*13*/
                            NULL};
 
 /* File Test Images
@@ -84,6 +95,8 @@ static unsigned vector_write_test_5(int file_name_id, int mpi_rank, int mpi_size
                                     H5FD_mpio_collective_opt_t coll_opt_mode, const char *vfd_name);
 static unsigned vector_write_test_6(int file_name_id, int mpi_rank, int mpi_size, H5FD_mpio_xfer_t xfer_mode,
                                     H5FD_mpio_collective_opt_t coll_opt_mode, const char *vfd_name);
+static unsigned vector_write_test_7(int file_name_id, int mpi_rank, int mpi_size, H5FD_mpio_xfer_t xfer_mode,
+                                    H5FD_mpio_collective_opt_t coll_opt_mode, const char *vfd_name);
 
 /****************************************************************************/
 /***************************** Utility Functions ****************************/
@@ -246,7 +259,7 @@ free_file_images(void)
  *
  * Modifications:
  *
- *        None.
+ *        Updated for subfiling VFD 9/29/30
  *
  *-------------------------------------------------------------------------
  */
@@ -270,6 +283,20 @@ setup_vfd_test_file(int file_name_id, char *file_name, int mpi_size, H5FD_mpio_x
     HDassert(fapl_id_ptr);
     HDassert(dxpl_id_ptr);
 
+    if (show_progress)
+        HDfprintf(stdout, "%s: cp = %d, pass = %d.\n", fcn_name, cp++, pass);
+
+    /* setup the file name -- do this now, since setting up the ioc fapl requires it. This will probably
+     * change */
+    if (pass) {
+
+        if (h5_fixname(FILENAMES[file_name_id], H5P_DEFAULT, filename, sizeof(filename)) == NULL) {
+
+            pass         = FALSE;
+            failure_mssg = "h5_fixname() failed.\n";
+        }
+    }
+
     if (show_progress)
         HDfprintf(stdout, "%s: cp = %d, pass = %d.\n", fcn_name, cp++, pass);
 
@@ -293,8 +320,110 @@ setup_vfd_test_file(int file_name_id, char *file_name, int mpi_size, H5FD_mpio_x
             failure_mssg = "Can't set mpio fapl.";
         }
     }
-    else {
+    else if (strcmp(vfd_name, "subfiling") == 0) {
+
+        hid_t             ioc_fapl;
+        H5FD_ioc_config_t ioc_config = {{
+                                            /* common */
+                                            /* magic = */ H5FD_IOC_FAPL_T_MAGIC,
+                                            /* version = */ H5FD_CURR_IOC_FAPL_T_VERSION,
+                                            /* stripe_count = */ 0, /* will overwrite */
+                                            /* stripe_depth = */ (INTS_PER_RANK / 2),
+                                            /* ioc_selection = */ SELECT_IOC_ONE_PER_NODE,
+                                            /* ioc_fapl_id = */ H5P_DEFAULT, /* will overwrite? 
*/
+                                            /* context_id = */ 0,            /* will overwrite */
+                                            /* file_dir = */ "",             /* will overwrite */
+                                            /* file_path = */ ""             /* will overwrite */
+                                        },
+                                        /* thread_pool_count = */ H5FD_IOC_THREAD_POOL_SIZE};
+        H5FD_subfiling_config_t subfiling_conf = {
+            {
+                /* common */
+                /* magic = */ H5FD_IOC_FAPL_T_MAGIC,
+                /* version = */ H5FD_CURR_IOC_FAPL_T_VERSION,
+                /* stripe_count = */ 0, /* will overwrite */
+                /* stripe_depth = */ (INTS_PER_RANK / 2),
+                /* ioc_selection = */ SELECT_IOC_ONE_PER_NODE,
+                /* ioc_fapl_id = */ H5P_DEFAULT, /* will overwrite? */
+                /* context_id = */ 0,            /* will overwrite */
+                /* file_dir = */ "",             /* will overwrite */
+                /* file_path = */ "",            /* will overwrite */
+            },
+            /* require_ioc = */ TRUE};
+
+        if ((pass) && ((ioc_fapl = H5Pcreate(H5P_FILE_ACCESS)) < 0)) {
+
+            pass         = FALSE;
+            failure_mssg = "Can't create ioc fapl.";
+        }
+
+#if 1 /* JRM */ /* this is temporary -- rework for programmatic control later */
+        memset(&ioc_config, 0, sizeof(ioc_config));
+        memset(&subfiling_conf, 0, sizeof(subfiling_conf));
+
+        /* Get subfiling VFD defaults */
+        if ((pass) && (H5Pget_fapl_subfiling(fapl_id, &subfiling_conf) == FAIL)) {
+
+            pass         = FALSE;
+            failure_mssg = "Can't get subfiling VFD defaults.";
+        }
+
+        if ((pass) && (subfiling_conf.require_ioc)) {
+
+            /* Get IOC VFD defaults */
+            if ((pass) && ((H5Pget_fapl_ioc(ioc_fapl, &ioc_config) == FAIL))) {
+
+                pass         = FALSE;
+                failure_mssg = "Can't get IOC VFD defaults.";
+            }
+
+            /* Now we can set the IOC fapl. */
+            if ((pass) && ((H5Pset_fapl_ioc(ioc_fapl, &ioc_config) == FAIL))) {
+
+                pass         = FALSE;
+                failure_mssg = "Can't set IOC fapl.";
+            }
+        }
+        else {
+
+            if ((pass) && ((H5Pset_fapl_sec2(ioc_fapl) == FAIL))) {
+
+                pass         = FALSE;
+                failure_mssg = "Can't set sec2 fapl.";
+            }
+        }
+
+        /* Assign the IOC fapl as the underlying VFD */
+        subfiling_conf.common.ioc_fapl_id = ioc_fapl;
+
+        if (pass) { /* setup the paths in the subfiling fapl. */
+
+            HDassert(strlen(filename) < sizeof(subfiling_conf.common.file_dir));
+            strcpy(subfiling_conf.common.file_dir, dirname(filename));
+            strcpy(subfiling_conf.common.file_path, basename(filename));
+#if 0 /* JRM */
+            HDfprintf(stdout, "\nfilename = \"%s\"\nfile_dir = \"%s\"\nfile_path = \"%s\"\n",
+                      filename, subfiling_conf.common.file_dir, subfiling_conf.common.file_path);
+#endif /* JRM */
+        }
+        /* Now we can set the SUBFILING fapl before returning. */
+        if ((pass) && (H5Pset_fapl_subfiling(fapl_id, &subfiling_conf) == FAIL)) {
+
+            pass         = FALSE;
+            failure_mssg = "Can't set subfiling fapl.";
+        }
+
+#endif /* JRM */
+
+        /* set the MPI communicator and info in the FAPL */
+        if (H5Pset_mpi_params(fapl_id, MPI_COMM_WORLD, MPI_INFO_NULL) < 0) {
+
+            pass         = FALSE;
+            failure_mssg = "Can't set MPI communicator and info in subfiling fapl.";
+        }
+    }
+    else {
 
         pass         = FALSE;
         failure_mssg = "un-supported VFD";
     }
@@ -705,11 +834,7 @@ vector_read_test_1(int file_name_id, int mpi_rank, int mpi_size, H5FD_mpio_xfer_
 
     /* 6) Close the test file and delete it (on rank 0 only).
      *    Close FAPL and DXPL. 
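+     *    (takedown_vfd_test_file() is now called unconditionally, so the
+     *    file, FAPL, and DXPL are cleaned up even when the test has
+     *    already failed.)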
*/ - - if (pass) { - - takedown_vfd_test_file(mpi_rank, filename, &lf, &fapl_id, &dxpl_id); - } + takedown_vfd_test_file(mpi_rank, filename, &lf, &fapl_id, &dxpl_id); if (show_progress) HDfprintf(stdout, "%s: cp = %d, pass = %d.\n", fcn_name, cp++, pass); @@ -725,7 +850,7 @@ vector_read_test_1(int file_name_id, int mpi_rank, int mpi_size, H5FD_mpio_xfer_ H5_FAILED(); - if (show_progress) { + if ((disp_failure_mssgs) || (show_progress)) { HDfprintf(stdout, "%s: failure_mssg = \"%s\"\n", fcn_name, failure_mssg); } } @@ -1016,11 +1141,7 @@ vector_read_test_2(int file_name_id, int mpi_rank, int mpi_size, H5FD_mpio_xfer_ /* 10) Close the test file and delete it (on rank 0 only). * Close FAPL and DXPL. */ - - if (pass) { - - takedown_vfd_test_file(mpi_rank, filename, &lf, &fapl_id, &dxpl_id); - } + takedown_vfd_test_file(mpi_rank, filename, &lf, &fapl_id, &dxpl_id); if (show_progress) HDfprintf(stdout, "%s: cp = %d, pass = %d.\n", fcn_name, cp++, pass); @@ -1036,7 +1157,7 @@ vector_read_test_2(int file_name_id, int mpi_rank, int mpi_size, H5FD_mpio_xfer_ H5_FAILED(); - if (show_progress) { + if ((disp_failure_mssgs) || (show_progress)) { HDfprintf(stdout, "%s: failure_mssg = \"%s\"\n", fcn_name, failure_mssg); } } @@ -1361,11 +1482,7 @@ vector_read_test_3(int file_name_id, int mpi_rank, int mpi_size, H5FD_mpio_xfer_ /* 8) Close the test file and delete it (on rank 0 only). * Close FAPL and DXPL. */ - - if (pass) { - - takedown_vfd_test_file(mpi_rank, filename, &lf, &fapl_id, &dxpl_id); - } + takedown_vfd_test_file(mpi_rank, filename, &lf, &fapl_id, &dxpl_id); if (show_progress) HDfprintf(stdout, "%s: cp = %d, pass = %d.\n", fcn_name, cp++, pass); @@ -1381,7 +1498,7 @@ vector_read_test_3(int file_name_id, int mpi_rank, int mpi_size, H5FD_mpio_xfer_ H5_FAILED(); - if (show_progress) { + if ((disp_failure_mssgs) || (show_progress)) { HDfprintf(stdout, "%s: failure_mssg = \"%s\"\n", fcn_name, failure_mssg); } } @@ -1836,11 +1953,7 @@ vector_read_test_4(int file_name_id, int mpi_rank, int mpi_size, H5FD_mpio_xfer_ /* 8) Close the test file and delete it (on rank 0 only). * Close FAPL and DXPL. */ - - if (pass) { - - takedown_vfd_test_file(mpi_rank, filename, &lf, &fapl_id, &dxpl_id); - } + takedown_vfd_test_file(mpi_rank, filename, &lf, &fapl_id, &dxpl_id); if (show_progress) HDfprintf(stdout, "%s: cp = %d, pass = %d.\n", fcn_name, cp++, pass); @@ -1856,7 +1969,7 @@ vector_read_test_4(int file_name_id, int mpi_rank, int mpi_size, H5FD_mpio_xfer_ H5_FAILED(); - if (show_progress) { + if ((disp_failure_mssgs) || (show_progress)) { HDfprintf(stdout, "%s: failure_mssg = \"%s\"\n", fcn_name, failure_mssg); } } @@ -2116,11 +2229,7 @@ vector_read_test_5(int file_name_id, int mpi_rank, int mpi_size, H5FD_mpio_xfer_ /* 8) Close the test file and delete it (on rank 0 only). * Close FAPL and DXPL. 
*/ - - if (pass) { - - takedown_vfd_test_file(mpi_rank, filename, &lf, &fapl_id, &dxpl_id); - } + takedown_vfd_test_file(mpi_rank, filename, &lf, &fapl_id, &dxpl_id); if (show_progress) HDfprintf(stdout, "%s: cp = %d, pass = %d.\n", fcn_name, cp++, pass); @@ -2136,7 +2245,7 @@ vector_read_test_5(int file_name_id, int mpi_rank, int mpi_size, H5FD_mpio_xfer_ H5_FAILED(); - if (show_progress) { + if ((disp_failure_mssgs) || (show_progress)) { HDfprintf(stdout, "%s: failure_mssg = \"%s\"\n", fcn_name, failure_mssg); } } @@ -2198,7 +2307,7 @@ vector_write_test_1(int file_name_id, int mpi_rank, int mpi_size, H5FD_mpio_xfer H5FD_mem_t types[1]; haddr_t addrs[1]; size_t sizes[1]; - void * bufs[1]; + const void *bufs[1]; pass = TRUE; @@ -2268,6 +2377,9 @@ vector_write_test_1(int file_name_id, int mpi_rank, int mpi_size, H5FD_mpio_xfer if (pass) { MPI_Barrier(MPI_COMM_WORLD); +#if 0 /* JRM */ /* test code -- remove before commit */ + sleep(1); +#endif /* JRM */ } if (show_progress) @@ -2305,11 +2417,7 @@ vector_write_test_1(int file_name_id, int mpi_rank, int mpi_size, H5FD_mpio_xfer /* 5) Close the test file and delete it (on rank 0 only). * Close FAPL and DXPL. */ - - if (pass) { - - takedown_vfd_test_file(mpi_rank, filename, &lf, &fapl_id, &dxpl_id); - } + takedown_vfd_test_file(mpi_rank, filename, &lf, &fapl_id, &dxpl_id); if (show_progress) HDfprintf(stdout, "%s: cp = %d, pass = %d.\n", fcn_name, cp++, pass); @@ -2325,7 +2433,7 @@ vector_write_test_1(int file_name_id, int mpi_rank, int mpi_size, H5FD_mpio_xfer H5_FAILED(); - if (show_progress) { + if ((disp_failure_mssgs) || (show_progress)) { HDfprintf(stdout, "%s: failure_mssg = \"%s\"\n", fcn_name, failure_mssg); } } @@ -2397,7 +2505,7 @@ vector_write_test_2(int file_name_id, int mpi_rank, int mpi_size, H5FD_mpio_xfer H5FD_mem_t types[1]; haddr_t addrs[1]; size_t sizes[1]; - void * bufs[1]; + const void *bufs[1]; pass = TRUE; @@ -2514,6 +2622,9 @@ vector_write_test_2(int file_name_id, int mpi_rank, int mpi_size, H5FD_mpio_xfer if (pass) { MPI_Barrier(MPI_COMM_WORLD); +#if 0 /* JRM */ /* test code -- remove before commit */ + sleep(1); +#endif /* JRM */ } if (show_progress) @@ -2569,11 +2680,7 @@ vector_write_test_2(int file_name_id, int mpi_rank, int mpi_size, H5FD_mpio_xfer /* 6) Close the test file and delete it (on rank 0 only). * Close FAPL and DXPL. */ - - if (pass) { - - takedown_vfd_test_file(mpi_rank, filename, &lf, &fapl_id, &dxpl_id); - } + takedown_vfd_test_file(mpi_rank, filename, &lf, &fapl_id, &dxpl_id); if (show_progress) HDfprintf(stdout, "%s: cp = %d, pass = %d.\n", fcn_name, cp++, pass); @@ -2589,7 +2696,7 @@ vector_write_test_2(int file_name_id, int mpi_rank, int mpi_size, H5FD_mpio_xfer H5_FAILED(); - if (show_progress) { + if ((disp_failure_mssgs) || (show_progress)) { HDfprintf(stdout, "%s: failure_mssg = \"%s\"\n", fcn_name, failure_mssg); } } @@ -2662,7 +2769,7 @@ vector_write_test_3(int file_name_id, int mpi_rank, int mpi_size, H5FD_mpio_xfer H5FD_mem_t types[4]; haddr_t addrs[4]; size_t sizes[4]; - void * bufs[4]; + const void *bufs[4]; pass = TRUE; @@ -2765,6 +2872,9 @@ vector_write_test_3(int file_name_id, int mpi_rank, int mpi_size, H5FD_mpio_xfer if (pass) { MPI_Barrier(MPI_COMM_WORLD); +#if 0 /* JRM */ /* test code -- remove before commit */ + sleep(1); +#endif /* JRM */ } if (show_progress) @@ -2845,11 +2955,7 @@ vector_write_test_3(int file_name_id, int mpi_rank, int mpi_size, H5FD_mpio_xfer /* 5) Close the test file and delete it (on rank 0 only). * Close FAPL and DXPL. 
*/ - - if (pass) { - - takedown_vfd_test_file(mpi_rank, filename, &lf, &fapl_id, &dxpl_id); - } + takedown_vfd_test_file(mpi_rank, filename, &lf, &fapl_id, &dxpl_id); if (show_progress) HDfprintf(stdout, "%s: cp = %d, pass = %d.\n", fcn_name, cp++, pass); @@ -2865,7 +2971,7 @@ vector_write_test_3(int file_name_id, int mpi_rank, int mpi_size, H5FD_mpio_xfer H5_FAILED(); - if (show_progress) { + if ((disp_failure_mssgs) || (show_progress)) { HDfprintf(stdout, "%s: failure_mssg = \"%s\"\n", fcn_name, failure_mssg); } } @@ -2944,7 +3050,7 @@ vector_write_test_4(int file_name_id, int mpi_rank, int mpi_size, H5FD_mpio_xfer H5FD_mem_t types[4]; haddr_t addrs[4]; size_t sizes[4]; - void * bufs[4]; + const void *bufs[4]; pass = TRUE; @@ -3047,6 +3153,9 @@ vector_write_test_4(int file_name_id, int mpi_rank, int mpi_size, H5FD_mpio_xfer if (pass) { MPI_Barrier(MPI_COMM_WORLD); +#if 0 /* JRM */ /* test code -- remove before commit */ + sleep(1); +#endif /* JRM */ } if (show_progress) @@ -3127,11 +3236,7 @@ vector_write_test_4(int file_name_id, int mpi_rank, int mpi_size, H5FD_mpio_xfer /* 5) Close the test file and delete it (on rank 0 only). * Close FAPL and DXPL. */ - - if (pass) { - - takedown_vfd_test_file(mpi_rank, filename, &lf, &fapl_id, &dxpl_id); - } + takedown_vfd_test_file(mpi_rank, filename, &lf, &fapl_id, &dxpl_id); if (show_progress) HDfprintf(stdout, "%s: cp = %d, pass = %d.\n", fcn_name, cp++, pass); @@ -3147,7 +3252,7 @@ vector_write_test_4(int file_name_id, int mpi_rank, int mpi_size, H5FD_mpio_xfer H5_FAILED(); - if (show_progress) { + if ((disp_failure_mssgs) || (show_progress)) { HDfprintf(stdout, "%s: failure_mssg = \"%s\"\n", fcn_name, failure_mssg); } } @@ -3262,7 +3367,7 @@ vector_write_test_5(int file_name_id, int mpi_rank, int mpi_size, H5FD_mpio_xfer H5FD_mem_t types[4]; haddr_t addrs[4]; size_t sizes[4]; - void * bufs[4]; + const void *bufs[4]; pass = TRUE; @@ -3451,6 +3556,9 @@ vector_write_test_5(int file_name_id, int mpi_rank, int mpi_size, H5FD_mpio_xfer if (pass) { MPI_Barrier(MPI_COMM_WORLD); +#if 0 /* JRM */ /* test code -- remove before commit */ + sleep(1); +#endif /* JRM */ } if (show_progress) @@ -3490,6 +3598,10 @@ vector_write_test_5(int file_name_id, int mpi_rank, int mpi_size, H5FD_mpio_xfer pass = FALSE; failure_mssg = "unexpected data read from file (1.1)"; +#if 1 /* JRM */ + HDprintf("\nread_fi_buf[%d] = %d, %d expected.\n", j, read_fi_buf[j], + negative_fi_buf[j]); +#endif /* JRM */ } } else if (((INTS_PER_RANK / 4) <= k) && (k < (3 * (INTS_PER_RANK / 8)))) { @@ -3498,6 +3610,10 @@ vector_write_test_5(int file_name_id, int mpi_rank, int mpi_size, H5FD_mpio_xfer pass = FALSE; failure_mssg = "unexpected data read from file (1.2)"; +#if 1 /* JRM */ + HDprintf("\nread_fi_buf[%d] = %d, %d expected.\n", j, read_fi_buf[j], + decreasing_fi_buf[j]); +#endif /* JRM */ } } else if (((INTS_PER_RANK / 16) <= k) && (k < (INTS_PER_RANK / 8))) { @@ -3506,6 +3622,10 @@ vector_write_test_5(int file_name_id, int mpi_rank, int mpi_size, H5FD_mpio_xfer pass = FALSE; failure_mssg = "unexpected data read from file (1.3)"; +#if 1 /* JRM */ + HDprintf("\nread_fi_buf[%d] = %d, %d expected.\n", j, read_fi_buf[j], + increasing_fi_buf[j]); +#endif /* JRM */ } } else { @@ -3525,6 +3645,10 @@ vector_write_test_5(int file_name_id, int mpi_rank, int mpi_size, H5FD_mpio_xfer pass = FALSE; failure_mssg = "unexpected data read from file (2.1)"; +#if 1 /* JRM */ + HDprintf("\nread_fi_buf[%d] = %d, %d expected.\n", j, read_fi_buf[j], + increasing_fi_buf[j]); +#endif /* JRM */ } } else if 
((((INTS_PER_RANK / 2) + 1) <= k) && (k <= (INTS_PER_RANK - 2))) { @@ -3533,6 +3657,10 @@ vector_write_test_5(int file_name_id, int mpi_rank, int mpi_size, H5FD_mpio_xfer pass = FALSE; failure_mssg = "unexpected data read from file (2.2)"; +#if 1 /* JRM */ + HDprintf("\nread_fi_buf[%d] = %d, %d expected.\n", j, read_fi_buf[j], + decreasing_fi_buf[j]); +#endif /* JRM */ } } else { @@ -3552,6 +3680,10 @@ vector_write_test_5(int file_name_id, int mpi_rank, int mpi_size, H5FD_mpio_xfer pass = FALSE; failure_mssg = "unexpected data read from file (3.1)"; +#if 1 /* JRM */ + HDprintf("\nread_fi_buf[%d] = %d, %d expected.\n", j, read_fi_buf[j], + negative_fi_buf[j]); +#endif /* JRM */ } } else { @@ -3586,11 +3718,7 @@ vector_write_test_5(int file_name_id, int mpi_rank, int mpi_size, H5FD_mpio_xfer /* 7) Close the test file and delete it (on rank 0 only). * Close FAPL and DXPL. */ - - if (pass) { - - takedown_vfd_test_file(mpi_rank, filename, &lf, &fapl_id, &dxpl_id); - } + takedown_vfd_test_file(mpi_rank, filename, &lf, &fapl_id, &dxpl_id); if (show_progress) HDfprintf(stdout, "%s: cp = %d, pass = %d.\n", fcn_name, cp++, pass); @@ -3606,7 +3734,7 @@ vector_write_test_5(int file_name_id, int mpi_rank, int mpi_size, H5FD_mpio_xfer H5_FAILED(); - if (show_progress) { + if ((disp_failure_mssgs) || (show_progress)) { HDfprintf(stdout, "%s: failure_mssg = \"%s\"\n", fcn_name, failure_mssg); } } @@ -3694,7 +3822,7 @@ vector_write_test_6(int file_name_id, int mpi_rank, int mpi_size, H5FD_mpio_xfer H5FD_mem_t types[(INTS_PER_RANK / 16) + 1]; haddr_t addrs[(INTS_PER_RANK / 16) + 1]; size_t sizes[2]; - void * bufs[(INTS_PER_RANK / 16) + 1]; + const void *bufs[(INTS_PER_RANK / 16) + 1]; pass = TRUE; @@ -3811,6 +3939,9 @@ vector_write_test_6(int file_name_id, int mpi_rank, int mpi_size, H5FD_mpio_xfer if (pass) { MPI_Barrier(MPI_COMM_WORLD); +#if 0 /* JRM */ /* test code -- remove before commit */ + sleep(1); +#endif /* JRM */ } if (show_progress) @@ -3865,12 +3996,287 @@ vector_write_test_6(int file_name_id, int mpi_rank, int mpi_size, H5FD_mpio_xfer /* 8) Close the test file and delete it (on rank 0 only). * Close FAPL and DXPL. */ + takedown_vfd_test_file(mpi_rank, filename, &lf, &fapl_id, &dxpl_id); + + if (show_progress) + HDfprintf(stdout, "%s: cp = %d, pass = %d.\n", fcn_name, cp++, pass); + + /* report results */ + if (mpi_rank == 0) { + + if (pass) { + + PASSED(); + } + else { + + H5_FAILED(); + + if ((disp_failure_mssgs) || (show_progress)) { + HDfprintf(stdout, "%s: failure_mssg = \"%s\"\n", fcn_name, failure_mssg); + } + } + } + + return (!pass); + +} /* vector_write_test_6() */ + +/*------------------------------------------------------------------------- + * Function: vector_write_test_7() + * + * Purpose: Test vector I/O with larger vectors -- 8 elements in each + * vector for now. + * + * 1) Open the test file with the specified VFD, and set + * the eoa. + * + * 2) Set the test file in a known state by writing zeros + * to all bytes in the test file. Since we have already + * tested this, do this via a vector write of zero_fi_buf. + * + * 3) Barrier + * + * 4) For each rank, define base_index equal to: + * + * mpi_rank * INTS_PER_RANK + * + * and define base_addr equal to + * + * base_index * sizeof(int32_t). + * + * Setup a vector of length 8, with each element of + * length INTS_PER_RANK / 16, and base address + * base_addr + i * (INTS_PER_RANK / 8), where i is + * the index of the entry (starting at zero). Draw + * written data from the equivalent locations in + * increasing_fi_buf. 
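+ *
+ *    With INTS_PER_RANK = 1024, for example, each of the 8 entries
+ *    covers 64 ints (INTS_PER_RANK / 16) and entries start 128 ints
+ *    apart (INTS_PER_RANK / 8), so written and zero-filled regions
+ *    alternate in equal halves of each rank's section.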
+ * + * Write the vector. + * + * 5) Barrier + * + * 6) On each rank, read the entire file into the read_fi_buf, + * and compare against zero_fi_buf, and increasing_fi_buf as + * appropriate. Report failure if any differences are + * detected. + * + * 7) Close the test file. On rank 0, delete the test file. + * + * Return: FALSE on success, TRUE if any errors are detected. + * + * Programmer: John Mainzer + * 10/10/21 + * + * Modifications: + * + * None. + * + *------------------------------------------------------------------------- + */ + +static unsigned +vector_write_test_7(int file_name_id, int mpi_rank, int mpi_size, H5FD_mpio_xfer_t xfer_mode, + H5FD_mpio_collective_opt_t coll_opt_mode, const char *vfd_name) +{ + const char *fcn_name = "vector_write_test_7()"; + char test_title[120]; + char filename[512]; + haddr_t base_addr; + haddr_t addr_increment; + int base_index; + haddr_t eoa; + hbool_t show_progress = FALSE; + hid_t fapl_id = -1; /* file access property list ID */ + hid_t dxpl_id = -1; /* data access property list ID */ + H5FD_t * lf = NULL; /* VFD struct ptr */ + int cp = 0; + int i; + int j; + int k; + uint32_t count; + H5FD_mem_t types[8]; + haddr_t addrs[8]; + size_t sizes[8]; + const void *bufs[8]; + + pass = TRUE; + + if (mpi_rank == 0) { + + if (xfer_mode == H5FD_MPIO_INDEPENDENT) { + + sprintf(test_title, "parallel vector write test 7 -- %s / independent", vfd_name); + } + else if (coll_opt_mode == H5FD_MPIO_INDIVIDUAL_IO) { + + sprintf(test_title, "parallel vector write test 7 -- %s / col op / ind I/O", vfd_name); + } + else { + + HDassert(coll_opt_mode == H5FD_MPIO_COLLECTIVE_IO); + + sprintf(test_title, "parallel vector write test 7 -- %s / col op / col I/O", vfd_name); + } + + TESTING(test_title); + } + + show_progress = ((show_progress) && (mpi_rank == 0)); + + if (show_progress) + HDfprintf(stdout, "\n%s: cp = %d, pass = %d.\n", fcn_name, cp++, pass); + + /* 1) Open the test file with the specified VFD, set the eoa, and setup the dxpl */ + if (pass) { + + eoa = (haddr_t)mpi_size * (haddr_t)INTS_PER_RANK * (haddr_t)(sizeof(int32_t)); + + setup_vfd_test_file(file_name_id, filename, mpi_size, xfer_mode, coll_opt_mode, vfd_name, eoa, &lf, + &fapl_id, &dxpl_id); + } + + if (show_progress) + HDfprintf(stdout, "%s: cp = %d, pass = %d.\n", fcn_name, cp++, pass); + + /* 2) Set the test file in a known state by writing zeros + * to all bytes in the test file. Since we have already + * tested this, do this via a vector write of zero_fi_buf. 
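+     *    (H5FDwrite_vector() takes `count' entries in each of the
+     *     parallel arrays types[], addrs[], sizes[], and bufs[];
+     *     here a single entry covers this rank's full extent.)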
+ */ + if (pass) { + + count = 1; + types[0] = H5FD_MEM_DRAW; + addrs[0] = (haddr_t)mpi_rank * (haddr_t)INTS_PER_RANK * (haddr_t)(sizeof(int32_t)); + sizes[0] = (size_t)INTS_PER_RANK * sizeof(int32_t); + bufs[0] = (void *)(&(zero_fi_buf[mpi_rank * INTS_PER_RANK])); + + if (H5FDwrite_vector(lf, dxpl_id, count, types, addrs, sizes, bufs) < 0) { + + pass = FALSE; + failure_mssg = "H5FDwrite_vector() failed.\n"; + } + } + + if (show_progress) + HDfprintf(stdout, "%s: cp = %d, pass = %d.\n", fcn_name, cp++, pass); + + /* 3) Barrier + */ + + if (pass) { + + MPI_Barrier(MPI_COMM_WORLD); +#if 0 /* JRM */ /* test code -- remove before commit */ + sleep(1); +#endif /* JRM */ + } + + if (show_progress) + HDfprintf(stdout, "%s: cp = %d, pass = %d.\n", fcn_name, cp++, pass); + + if (pass) { + + base_index = mpi_rank * INTS_PER_RANK; + base_addr = (haddr_t)((size_t)base_index * sizeof(int32_t)); + addr_increment = (haddr_t)((INTS_PER_RANK / 8) * sizeof(int32_t)); + + count = 8; + + for (i = 0; i < (int)count; i++) { + + types[i] = H5FD_MEM_DRAW; + addrs[i] = base_addr + ((haddr_t)(i)*addr_increment); + sizes[i] = (size_t)(INTS_PER_RANK / 16) * sizeof(int32_t); + bufs[i] = (void *)(&(increasing_fi_buf[base_index + (i * (INTS_PER_RANK / 8))])); + +#if 0 /* JRM */ /* delete eventually */ + HDfprintf(stderr, "\naddrs[%d] = %lld\n", i, (long long)(addrs[i])); +#endif /* JRM */ + } + + if (H5FDwrite_vector(lf, dxpl_id, count, types, addrs, sizes, bufs) < 0) { + + pass = FALSE; + failure_mssg = "H5FDwrite_vector() failed (1).\n"; + } + } + + if (show_progress) + HDfprintf(stdout, "%s: cp = %d, pass = %d.\n", fcn_name, cp++, pass); + + /* 5) Barrier */ + + if (pass) { + + MPI_Barrier(MPI_COMM_WORLD); +#if 0 /* JRM */ /* test code -- remove before commit */ + sleep(1); +#endif /* JRM */ + } + + if (show_progress) + HDfprintf(stdout, "%s: cp = %d, pass = %d.\n", fcn_name, cp++, pass); + + /* 6) On each rank, read the entire file into the read_fi_buf, + * and compare against increasing_fi_buf, and zero_fi_buf as + * appropriate. Report failure if any differences are + * detected. + */ if (pass) { - takedown_vfd_test_file(mpi_rank, filename, &lf, &fapl_id, &dxpl_id); + size_t image_size = (size_t)mpi_size * (size_t)INTS_PER_RANK * sizeof(int32_t); + + if (H5FDread(lf, H5FD_MEM_DRAW, H5P_DEFAULT, (haddr_t)0, image_size, (void *)read_fi_buf) < 0) { + + pass = FALSE; + failure_mssg = "H5FDread() failed.\n"; + } + + for (i = 0; ((pass) && (i < mpi_size)); i++) { + + base_index = i * INTS_PER_RANK; + + for (j = base_index; j < base_index + INTS_PER_RANK; j++) { + + k = j - base_index; + + if ((k % (INTS_PER_RANK / 8)) < (INTS_PER_RANK / 16)) { + + if (read_fi_buf[j] != increasing_fi_buf[j]) { + + pass = FALSE; + failure_mssg = "unexpected data read from file (1)"; +#if 1 /* JRM */ + HDprintf("\nread_fi_buf[%d] = %d, %d expected.\n", j, read_fi_buf[j], + increasing_fi_buf[j]); +#endif /* JRM */ + } + } + else { + + if (read_fi_buf[j] != 0) { + + pass = FALSE; + failure_mssg = "unexpected data read from file (2)"; +#if 1 /* JRM */ + HDprintf("\nread_fi_buf[%d] = %d, 0 expected.\n", j, read_fi_buf[j]); +#endif /* JRM */ + } + } + } + } } + if (show_progress) + HDfprintf(stdout, "%s: cp = %d, pass = %d.\n", fcn_name, cp++, pass); + + /* 7) Close the test file and delete it (on rank 0 only). + * Close FAPL and DXPL. 
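+     *    (takedown_vfd_test_file() is now called unconditionally, so
+     *     the test file and the FAPL/DXPL IDs are released even when
+     *     an earlier step has already set pass to FALSE.)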
+ */ + takedown_vfd_test_file(mpi_rank, filename, &lf, &fapl_id, &dxpl_id); + if (show_progress) HDfprintf(stdout, "%s: cp = %d, pass = %d.\n", fcn_name, cp++, pass); @@ -3885,7 +4291,7 @@ vector_write_test_6(int file_name_id, int mpi_rank, int mpi_size, H5FD_mpio_xfer H5_FAILED(); - if (show_progress) { + if ((disp_failure_mssgs) || (show_progress)) { HDfprintf(stdout, "%s: failure_mssg = \"%s\"\n", fcn_name, failure_mssg); } } @@ -3893,7 +4299,7 @@ vector_write_test_6(int file_name_id, int mpi_rank, int mpi_size, H5FD_mpio_xfer return (!pass); -} /* vector_write_test_6() */ +} /* vector_write_test_7() */ /*------------------------------------------------------------------------- * Function: main @@ -3915,13 +4321,25 @@ vector_write_test_6(int file_name_id, int mpi_rank, int mpi_size, H5FD_mpio_xfer int main(int argc, char **argv) { - unsigned nerrs = 0; - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Info info = MPI_INFO_NULL; + unsigned nerrs = 0; + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Info info = MPI_INFO_NULL; + int required = MPI_THREAD_MULTIPLE; + int provided = 0; int mpi_size; int mpi_rank; +#if 0 /* JRM */ MPI_Init(&argc, &argv); +#else /* JRM */ + MPI_Init_thread(&argc, &argv, required, &provided); + + if (provided != required) { + + HDprintf(" MPI doesn't support MPI_Init_thread with MPI_THREAD_MULTIPLE. Exiting\n"); + goto finish; + } +#endif /* JRM */ MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); @@ -3955,9 +4373,18 @@ main(int argc, char **argv) HDprintf("\nAllocation and initialize of file image buffers failed. Test aborted.\n"); } +#if 1 /* JRM */ + /* sleep for a bit to allow GDB to attach to the process */ + // sleep(60); +#endif /* JRM */ + MPI_Barrier(MPI_COMM_WORLD); - // sleep(60); +#if 1 /* JRM */ /* skip MPIO VFD tests if desired. 
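+                   To skip them, change this "#if 1" to "#if 0".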
*/ + if (mpi_rank == 0) { + + HDprintf("\n\n --- TESTING MPIO VFD --- \n\n"); + } nerrs += vector_read_test_1(0, mpi_rank, mpi_size, H5FD_MPIO_INDEPENDENT, H5FD_MPIO_INDIVIDUAL_IO, "mpio"); @@ -4026,6 +4453,141 @@ main(int argc, char **argv) nerrs += vector_write_test_6(5, mpi_rank, mpi_size, H5FD_MPIO_COLLECTIVE, H5FD_MPIO_COLLECTIVE_IO, "mpio"); + nerrs += + vector_write_test_7(6, mpi_rank, mpi_size, H5FD_MPIO_INDEPENDENT, H5FD_MPIO_INDIVIDUAL_IO, "mpio"); + nerrs += + vector_write_test_7(6, mpi_rank, mpi_size, H5FD_MPIO_COLLECTIVE, H5FD_MPIO_INDIVIDUAL_IO, "mpio"); + nerrs += + vector_write_test_7(6, mpi_rank, mpi_size, H5FD_MPIO_COLLECTIVE, H5FD_MPIO_COLLECTIVE_IO, "mpio"); +#endif /* JRM */ + + MPI_Barrier(MPI_COMM_WORLD); + + if (mpi_rank == 0) { + + HDprintf("\n\n --- TESTING SUBFILING VFD --- \n\n"); + } + + nerrs += vector_read_test_1(7, mpi_rank, mpi_size, H5FD_MPIO_INDEPENDENT, H5FD_MPIO_INDIVIDUAL_IO, + "subfiling"); + // sleep(1); + nerrs += + vector_read_test_1(7, mpi_rank, mpi_size, H5FD_MPIO_COLLECTIVE, H5FD_MPIO_INDIVIDUAL_IO, "subfiling"); + // sleep(1); + nerrs += + vector_read_test_1(7, mpi_rank, mpi_size, H5FD_MPIO_COLLECTIVE, H5FD_MPIO_COLLECTIVE_IO, "subfiling"); + // sleep(1); + + nerrs += vector_read_test_2(8, mpi_rank, mpi_size, H5FD_MPIO_INDEPENDENT, H5FD_MPIO_INDIVIDUAL_IO, + "subfiling"); + // sleep(1); + nerrs += + vector_read_test_2(8, mpi_rank, mpi_size, H5FD_MPIO_COLLECTIVE, H5FD_MPIO_INDIVIDUAL_IO, "subfiling"); + // sleep(1); + nerrs += + vector_read_test_2(8, mpi_rank, mpi_size, H5FD_MPIO_COLLECTIVE, H5FD_MPIO_COLLECTIVE_IO, "subfiling"); + // sleep(1); + + nerrs += vector_read_test_3(9, mpi_rank, mpi_size, H5FD_MPIO_INDEPENDENT, H5FD_MPIO_INDIVIDUAL_IO, + "subfiling"); + // sleep(1); + nerrs += + vector_read_test_3(9, mpi_rank, mpi_size, H5FD_MPIO_COLLECTIVE, H5FD_MPIO_INDIVIDUAL_IO, "subfiling"); + // sleep(1); + nerrs += + vector_read_test_3(9, mpi_rank, mpi_size, H5FD_MPIO_COLLECTIVE, H5FD_MPIO_COLLECTIVE_IO, "subfiling"); + // sleep(1); + + nerrs += vector_read_test_4(10, mpi_rank, mpi_size, H5FD_MPIO_INDEPENDENT, H5FD_MPIO_INDIVIDUAL_IO, + "subfiling"); + // sleep(1); + nerrs += vector_read_test_4(10, mpi_rank, mpi_size, H5FD_MPIO_COLLECTIVE, H5FD_MPIO_INDIVIDUAL_IO, + "subfiling"); + // sleep(1); + nerrs += vector_read_test_4(10, mpi_rank, mpi_size, H5FD_MPIO_COLLECTIVE, H5FD_MPIO_COLLECTIVE_IO, + "subfiling"); + // sleep(1); + + nerrs += vector_read_test_5(11, mpi_rank, mpi_size, H5FD_MPIO_INDEPENDENT, H5FD_MPIO_INDIVIDUAL_IO, + "subfiling"); + // sleep(1); + nerrs += vector_read_test_5(11, mpi_rank, mpi_size, H5FD_MPIO_COLLECTIVE, H5FD_MPIO_INDIVIDUAL_IO, + "subfiling"); + // sleep(1); + nerrs += vector_read_test_5(11, mpi_rank, mpi_size, H5FD_MPIO_COLLECTIVE, H5FD_MPIO_COLLECTIVE_IO, + "subfiling"); + // sleep(1); + + nerrs += vector_write_test_1(7, mpi_rank, mpi_size, H5FD_MPIO_INDEPENDENT, H5FD_MPIO_INDIVIDUAL_IO, + "subfiling"); + // sleep(1); + nerrs += vector_write_test_1(7, mpi_rank, mpi_size, H5FD_MPIO_COLLECTIVE, H5FD_MPIO_INDIVIDUAL_IO, + "subfiling"); + // sleep(1); + nerrs += vector_write_test_1(7, mpi_rank, mpi_size, H5FD_MPIO_COLLECTIVE, H5FD_MPIO_COLLECTIVE_IO, + "subfiling"); + // sleep(1); + + nerrs += vector_write_test_2(8, mpi_rank, mpi_size, H5FD_MPIO_INDEPENDENT, H5FD_MPIO_INDIVIDUAL_IO, + "subfiling"); + // sleep(1); + nerrs += vector_write_test_2(8, mpi_rank, mpi_size, H5FD_MPIO_COLLECTIVE, H5FD_MPIO_INDIVIDUAL_IO, + "subfiling"); + // sleep(1); + nerrs += vector_write_test_2(8, mpi_rank, mpi_size, H5FD_MPIO_COLLECTIVE, 
H5FD_MPIO_COLLECTIVE_IO, + "subfiling"); + // sleep(1); + + nerrs += vector_write_test_3(9, mpi_rank, mpi_size, H5FD_MPIO_INDEPENDENT, H5FD_MPIO_INDIVIDUAL_IO, + "subfiling"); + // sleep(1); + nerrs += vector_write_test_3(9, mpi_rank, mpi_size, H5FD_MPIO_COLLECTIVE, H5FD_MPIO_INDIVIDUAL_IO, + "subfiling"); + // sleep(1); + nerrs += vector_write_test_3(9, mpi_rank, mpi_size, H5FD_MPIO_COLLECTIVE, H5FD_MPIO_COLLECTIVE_IO, + "subfiling"); + // sleep(1); + + nerrs += vector_write_test_4(10, mpi_rank, mpi_size, H5FD_MPIO_INDEPENDENT, H5FD_MPIO_INDIVIDUAL_IO, + "subfiling"); + // sleep(1); + nerrs += vector_write_test_4(10, mpi_rank, mpi_size, H5FD_MPIO_COLLECTIVE, H5FD_MPIO_INDIVIDUAL_IO, + "subfiling"); + // sleep(1); + nerrs += vector_write_test_4(10, mpi_rank, mpi_size, H5FD_MPIO_COLLECTIVE, H5FD_MPIO_COLLECTIVE_IO, + "subfiling"); + // sleep(1); + + nerrs += vector_write_test_5(11, mpi_rank, mpi_size, H5FD_MPIO_INDEPENDENT, H5FD_MPIO_INDIVIDUAL_IO, + "subfiling"); + // sleep(1); + nerrs += vector_write_test_5(11, mpi_rank, mpi_size, H5FD_MPIO_COLLECTIVE, H5FD_MPIO_INDIVIDUAL_IO, + "subfiling"); + // sleep(1); + nerrs += vector_write_test_5(11, mpi_rank, mpi_size, H5FD_MPIO_COLLECTIVE, H5FD_MPIO_COLLECTIVE_IO, + "subfiling"); + // sleep(1); + + nerrs += vector_write_test_6(12, mpi_rank, mpi_size, H5FD_MPIO_INDEPENDENT, H5FD_MPIO_INDIVIDUAL_IO, + "subfiling"); + // sleep(1); + nerrs += vector_write_test_6(12, mpi_rank, mpi_size, H5FD_MPIO_COLLECTIVE, H5FD_MPIO_INDIVIDUAL_IO, + "subfiling"); + // sleep(1); + nerrs += vector_write_test_6(12, mpi_rank, mpi_size, H5FD_MPIO_COLLECTIVE, H5FD_MPIO_COLLECTIVE_IO, + "subfiling"); + // sleep(1); + + nerrs += vector_write_test_7(13, mpi_rank, mpi_size, H5FD_MPIO_INDEPENDENT, H5FD_MPIO_INDIVIDUAL_IO, + "subfiling"); + // sleep(1); + nerrs += vector_write_test_7(13, mpi_rank, mpi_size, H5FD_MPIO_COLLECTIVE, H5FD_MPIO_INDIVIDUAL_IO, + "subfiling"); + // sleep(1); + nerrs += vector_write_test_7(13, mpi_rank, mpi_size, H5FD_MPIO_COLLECTIVE, H5FD_MPIO_COLLECTIVE_IO, + "subfiling"); + // sleep(1); + finish: /* make sure all processes are finished before final report, cleanup diff --git a/tools/src/h5ls/h5ls.c b/tools/src/h5ls/h5ls.c index 0d345745940..02b41688ce5 100644 --- a/tools/src/h5ls/h5ls.c +++ b/tools/src/h5ls/h5ls.c @@ -2526,7 +2526,7 @@ visit_obj(hid_t file, const char *oname, iter_t *iter) * were borrowed from the GNU less(1). * * Return: Success: Number of columns. - * Failure: Some default number of columms. + * Failure: Some default number of columns. 
 *-------------------------------------------------------------------------
 */
 static int
diff --git a/tools/src/h5stat/h5stat.c b/tools/src/h5stat/h5stat.c
index f1156f02712..d2773569ec6 100644
--- a/tools/src/h5stat/h5stat.c
+++ b/tools/src/h5stat/h5stat.c
@@ -1736,7 +1736,7 @@ main(int argc, char *argv[])
         warn_msg("Unable to retrieve file size\n");
     HDassert(iter.filesize != 0);
 
-    /* Get storge info for file-level structures */
+    /* Get storage info for file-level structures */
     if (H5Fget_info2(fid, &finfo) < 0)
         warn_msg("Unable to retrieve file info\n");
     else {
diff --git a/tools/test/h5copy/h5copygentest.c b/tools/test/h5copy/h5copygentest.c
index c1f8349470b..e415c97f054 100644
--- a/tools/test/h5copy/h5copygentest.c
+++ b/tools/test/h5copy/h5copygentest.c
@@ -923,7 +923,7 @@ gen_extlink_src(hid_t loc_id)
 /*-------------------------------------------------------------------------
  * Function: Test_Extlink_Copy
  *
- * Purpose: gerenate external link files
+ * Purpose: generate external link files
 *
 *------------------------------------------------------------------------*/
 static void
diff --git a/utils/subfiling_vfd/h5fuse.sh b/utils/subfiling_vfd/h5fuse.sh
new file mode 100755
index 00000000000..817058601ca
--- /dev/null
+++ b/utils/subfiling_vfd/h5fuse.sh
@@ -0,0 +1,89 @@
+#!/bin/bash
+#
+# Copyright by The HDF Group.
+# All rights reserved.
+#
+# This file is part of HDF5. The full HDF5 copyright notice, including
+# terms governing use, modification, and redistribution, is contained in
+# the COPYING file, which can be found at the root of the source code
+# distribution tree, or in https://www.hdfgroup.org/licenses.
+# If you do not have access to either file, you may request a copy from
+# help@hdfgroup.org.
+#
+
+# Purpose: Combine subfiles into a single HDF5 file. The subfiling
+# configuration file is taken from the command line if given; otherwise
+# the script searches the current directory for a single *.config file.
+
+GRN='\033[0;32m'
+RED='\033[0;31m'
+PUR='\033[0;35m'
+NC='\033[0m' # No Color
+
+## CONFIG FILE CHECKS ##
+
+if [ $# -eq 0 ]; then
+    nfiles=$(find . -maxdepth 1 -type f -iname "*.config" -printf '.' | wc -m)
+    if [[ "$nfiles" != "1" ]]; then
+        echo -e "$RED Expected exactly one .config file in the current directory, found $nfiles. $NC"
+        exit 1
+    fi
+    file_config=$(find . -maxdepth 1 -type f -iname "*.config")
+else
+    file_config=$1
+fi
+
+if [ ! -f "$file_config" ]; then
+    echo -e "$RED $file_config does not exist. $NC"
$NC" + exit 1 +fi + +stripe_size=$(grep "stripe_size=" $file_config | cut -d "=" -f2) +if test -z "$stripe_size"; then + echo -e "$RED failed to find stripe_size in $file_config $NC" + exit 1 +fi + +subfiles=( $( sed -e '1,/hdf5_file=/d' $file_config ) ) +#for i in "${subfiles[@]}"; do +# echo "$i" +#done +if test -z "$subfiles"; then + echo -e "$RED failed to find subfiles list in $file_config $NC" + exit 1 +fi + +hdf5_file=$(grep "hdf5_file=" $file_config | cut -d "=" -f2) +if test -z "$hdf5_file"; then + echo -e "$RED failed to find hdf5 output file in $file_config $NC" + exit 1 +fi + +rm -f $hdf5_file + +## COMBINE SUBFILES INTO AN HDF5 FILE ## + +skip=0 +status=$nfiles +START="$(date +%s%N)" +while [ $status -gt 0 ]; do + icnt=0 + for i in "${subfiles[@]}"; do + fsize=$(wc -c $i | awk '{print $1}') + if [ $(($skip*$stripe_size)) -le $fsize ]; then + EXEC="dd count=1 bs=$stripe_size if=$i of=$hdf5_file skip=$skip oflag=append conv=notrunc" + echo -e "$GRN $EXEC $NC" + err="$( $EXEC 2>&1 > /dev/null &)" + icnt=$(($icnt+1)) + else + subfiles=("${subfiles[@]:0:$icnt}" "${subfiles[@]:$(($icnt+1))}") + status=${#subfiles[@]} + fi + done; wait + skip=$(($skip+1)) +done +END=$[ $(date +%s%N) - ${START} ] +DURATION_SEC=$(awk -vp=$END -vq=0.000000001 'BEGIN{printf "%.4f" ,p * q}') +echo -e "$PUR COMPLETION TIME = $DURATION_SEC s $NC" + +