From c07bcfb13738b3c22d182b9350d70aeeee7148b9 Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Sat, 24 Jul 2021 00:40:41 +0000 Subject: [PATCH 01/44] Add sycl configure option and memory functionality This is an initial commit that still needs some reworking and debugging. --- src/config/HYPRE_config.h.in | 3 + src/config/Makefile.config.in | 10 +- src/config/configure.in | 117 ++++++++++++++++++++++++ src/configure | 141 ++++++++++++++++++++++++++++- src/test/Makefile | 5 + src/utilities/Makefile | 5 +- src/utilities/_hypre_utilities.h | 10 ++ src/utilities/_hypre_utilities.hpp | 75 +++++++++++++++ src/utilities/general.c | 112 ++++++++++++++++++++++- src/utilities/handle.h | 10 ++ src/utilities/headers | 1 + src/utilities/memory.c | 95 ++++++++++++++++++- 12 files changed, 574 insertions(+), 10 deletions(-) diff --git a/src/config/HYPRE_config.h.in b/src/config/HYPRE_config.h.in index 98425a6545..bcbeef6853 100644 --- a/src/config/HYPRE_config.h.in +++ b/src/config/HYPRE_config.h.in @@ -187,6 +187,9 @@ /* HIP being used */ #undef HYPRE_USING_HIP +/* SYCL being used */ +#undef HYPRE_USING_SYCL + /* Define to 1 if using host memory only */ #undef HYPRE_USING_HOST_MEMORY diff --git a/src/config/Makefile.config.in b/src/config/Makefile.config.in index a99223c346..cf605d1c0f 100644 --- a/src/config/Makefile.config.in +++ b/src/config/Makefile.config.in @@ -87,10 +87,10 @@ AR = @AR@ RANLIB = @RANLIB@ LDFLAGS = @LDFLAGS@ -LIBS = @LIBS@ ${CALIPER_LIBS} ${HYPRE_CUDA_LIBS} ${HYPRE_HIP_LIBS} ${HYPRE_RAJA_LIB_DIR} ${HYPRE_RAJA_LIB} ${HYPRE_KOKKOS_LIB_DIR} ${HYPRE_KOKKOS_LIB} ${HYPRE_UMPIRE_LIB_DIR} ${HYPRE_UMPIRE_LIB} +LIBS = @LIBS@ ${CALIPER_LIBS} ${HYPRE_CUDA_LIBS} ${HYPRE_HIP_LIBS} ${HYPRE_SYCL_LIBS} ${HYPRE_RAJA_LIB_DIR} ${HYPRE_RAJA_LIB} ${HYPRE_KOKKOS_LIB_DIR} ${HYPRE_KOKKOS_LIB} ${HYPRE_UMPIRE_LIB_DIR} ${HYPRE_UMPIRE_LIB} FLIBS = @FLIBS@ -INCLUDES = ${CALIPER_INCLUDE} ${HYPRE_CUDA_INCLUDE} ${HYPRE_HIP_INCLUDE} ${HYPRE_RAJA_INCLUDE} ${HYPRE_KOKKOS_INCLUDE} 
${HYPRE_UMPIRE_INCLUDE} ${HYPRE_NAP_INCLUDE} +INCLUDES = ${CALIPER_INCLUDE} ${HYPRE_CUDA_INCLUDE} ${HYPRE_HIP_INCLUDE} ${HYPRE_SYCL_INCLUDE} ${HYPRE_RAJA_INCLUDE} ${HYPRE_KOKKOS_INCLUDE} ${HYPRE_UMPIRE_INCLUDE} ${HYPRE_NAP_INCLUDE} ################################################################## ## LAPACK Library Flags @@ -131,6 +131,12 @@ CUDA_ARCH = @HYPRE_CUDA_GENCODE@ HYPRE_HIP_INCLUDE = @HYPRE_HIP_INCL@ HYPRE_HIP_LIBS = @HYPRE_HIP_LIBS@ +################################################################## +## SYCL options +################################################################## +HYPRE_SYCL_INCLUDE=@HYPRE_SYCL_INCL@ +HYPRE_SYCL_LIBS=@HYPRE_SYCL_LIBS@ + ################################################################## ## Caliper options ################################################################## diff --git a/src/config/configure.in b/src/config/configure.in index d54ac936d8..3f7acd524f 100644 --- a/src/config/configure.in +++ b/src/config/configure.in @@ -182,6 +182,16 @@ hypre_using_rocrand=no hypre_found_hip=no +dnl ********************************************************************* +dnl * Initialize hypre-SYCL variables +dnl ********************************************************************* +hypre_using_sycl=no +hypre_using_onemklsparse=no +hypre_using_onemklblas=no +hypre_using_onemklrand=no + +hypre_found_sycl=no + dnl ********************************************************************* dnl * Initialize flag-check variables @@ -1137,6 +1147,19 @@ AS_HELP_STRING([--with-hip], [hypre_using_hip=no] ) +dnl ***** SYCL +AC_ARG_WITH(sycl, +AS_HELP_STRING([--with-sycl], + [Use SYCL for Intel GPUs. 
(default is NO).]), +[case "$withval" in + yes) hypre_using_sycl=yes ;; + no) hypre_using_sycl=no ;; + *) hypre_using_sycl=no ;; +esac], +[hypre_using_sycl=no] +) + + AC_ARG_WITH(cuda-home, AS_HELP_STRING([--with-cuda-home=DIR], [User specifies CUDA_HOME in DIR.]), @@ -1977,7 +2000,26 @@ AS_IF([ test x"$hypre_using_hip" == x"yes" ], [AC_MSG_ERROR([unable to find ${HYPRE_ROCM_PREFIX}/include/hip/hip_common.h ... Ensure ROCm is installed and set ROCM_PATH environment variable to ROCm installation path.])] )], []) +dnl ********************************************************************* +dnl * Check for SYCL header +dnl ********************************************************************* +dnl If the user has requested to use SYCL, we first check the environment +dnl for ONEAPI_PATH to point at the oneAPI installation. If that is not found, +dnl then we default to `/opt/intel/oneapi`. +dnl +dnl TODO: Add an ARG_WITH for sycl so the user can control the oneAPI path +dnl through the configure line +AS_IF([ test x"$hypre_using_sycl" == x"yes" ], + [ AS_IF([ test -n "$ONEAPI_PATH"], + [ HYPRE_SYCL_PREFIX=$ONEAPI_PATH ], + [ HYPRE_SYCL_PREFIX=/opt/intel/oneapi ]) + + AC_SUBST(HYPRE_SYCL_PREFIX) + AC_CHECK_HEADERS( ["${HYPRE_SYCL_PREFIX}/compiler/latest/linux/include/sycl/CL/sycl.hpp"], + [hypre_found_sycl=yes], + [AC_MSG_ERROR([unable to find ${HYPRE_SYCL_PREFIX}/compiler/latest/linux/include/sycl/CL/sycl.hpp ... 
Ensure oneAPI SDK is installed and set ONEAPI_PATH environment variable to oneAPI installation path.])] )],
+  [])
 
 dnl *********************************************************************
 dnl * Set raja options
@@ -2241,6 +2283,67 @@ AS_IF([test x"$hypre_using_hip" == x"yes"],
     ]) dnl AS_IF([test x"$hypre_using_hip" == x"yes"]
 
+dnl *********************************************************************
+dnl * Set SYCL options
+dnl *********************************************************************
+AS_IF([test x"$hypre_user_chose_sycl" == x"yes"],
+    [
+     AC_DEFINE(HYPRE_USING_GPU, 1, [Define to 1 if executing on GPU device])
+     AC_DEFINE(HYPRE_USING_SYCL, 1, [SYCL being used])
+
+     dnl The actual invocation of the clang compiler from oneAPI that
+     dnl supports SYCL and all the command line foo needed by the compiler.
+     AC_CHECK_PROGS(CXX, [dpcpp])
+
+     dnl (Ab)Using dpcpp when compiling SYCL
+     LINK_CC=${CXX}
+     LINK_CXX=${CXX}
+
+     dnl The "-x sycl" is necessary to override the detection of .c files which clang
+     dnl interprets as C and therefore invokes the C compiler rather than the SYCL part
+     dnl of clang. Put SYCLCXXFLAGS at the end so the user can override
+     dnl from the configure line.
+     SYCLCXXFLAGS="-fsycl -fsycl-unnamed-lambda -fsycl-device-code-split=per_kernel "
+
+     dnl If not in debug mode, at least -O2, but the user can override
+     dnl with SYCLCXXFLAGS on the configure line. If in debug mode, -O0 -Wall
+     dnl plus flags for debugging symbols
+     AS_IF([test x"$hypre_using_debug" == x"yes"],
+           [SYCLCXXFLAGS="-O0 -Wall -g -gdb ${SYCLCXXFLAGS}"],
+           [SYCLCXXFLAGS="-O2 ${SYCLCXXFLAGS}"],)
+
+     dnl (Ab)Use CXXFLAGS to capture SYCL compilation flags
+     dnl Put SYCLCXXFLAGS at the end so the user can override the optimization level.
+     CXXFLAGS="${SYCLCXXFLAGS}"
+
+     dnl dpl, dpct so we need both for Thrust on Intel GPUs.
+     dnl These are header-only so no linking needed. 
+ HYPRE_SYCL_INCL="-I${ONEAPI_PATH}/dpl/latest/linux/include" + HYPRE_SYCL_INCL="${HYPRE_SYCL_INCL} -I${ONEAPI_PATH}/dpcpp-ct/latest/include" + + dnl SYCL library + HYPRE_SYCL_LIBS="-L${HYPRE_SYCL_PREFIX}/lib -lamdsycl64" + + AS_IF([test x"$hypre_using_onemklsparse" == x"yes"], + [AC_DEFINE(HYPRE_USING_ONEMKLSPARSE, 1, [onemkl::SPARSE being used]) + HYPRE_SYCL_LIBS="${HYPRE_SYCL_LIBS} -lmkl_sycl" + HYPRE_SYCL_INCL="${HYPRE_SYCL_INCL} -I${MKLROOT}/include/mkl/spblas.hpp" + ]) + + AS_IF([test x"$hypre_using_onemklblas" == x"yes"], + [AC_DEFINE(HYPRE_USING_ONEMKLBLAS, 1, [onemkl::BLAS being used]) + HYPRE_SYCL_LIBS="${HYPRE_SYCL_LIBS} -lmkl_sycl" + HYPRE_SYCL_INCL="${HYPRE_SYCL_INCL} -I${MKLROOT}/include/oneapi/mkl/blas.hpp" + ]) + + dnl onemklrand: random number generation on Intel GPUs + AS_IF([test x"$hypre_using_onemklrand" == x"yes"], + [AC_DEFINE(HYPRE_USING_ONEMKLRAND, 1, [onemkl::rng being used]) + HYPRE_SYCL_LIBS="${HYPRE_SYCL_LIBS} -lmkl_sycl" + HYPRE_SYCL_INCL="${HYPRE_SYCL_INCL} -I${MKLROOT}/include/oneapi/mkl/rng.hpp" + ]) + + ]) dnl AS_IF([test x"$hypre_user_chose_sycl" == x"yes"] dnl ********************************************************************* @@ -2265,6 +2368,14 @@ then AC_MSG_NOTICE([Use --enable-unified-memory to compile with unified memory.]) AC_MSG_NOTICE([***********************************************************************]) fi + if test "$hypre_user_chose_sycl" = "yes" + then + AC_MSG_NOTICE([***********************************************************]) + AC_MSG_NOTICE([Configuring with --with-sycl=yes without unified memory.]) + AC_MSG_NOTICE([It only works for struct interface.]) + AC_MSG_NOTICE([Use --enable-unified-memory to compile with unified memory.]) + AC_MSG_NOTICE([***********************************************************]) + fi if test "$hypre_using_device_openmp" = "yes" then AC_MSG_NOTICE([***********************************************************************]) @@ -2512,6 +2623,12 @@ dnl 
********************************************************************* AC_SUBST(HYPRE_HIP_INCL) AC_SUBST(HYPRE_HIP_LIBS) +dnl ********************************************************************* +dnl * SYCL stuff +dnl ********************************************************************* +AC_SUBST(HYPRE_SYCL_INCL) +AC_SUBST(HYPRE_SYCL_LIBS) + dnl ********************************************************************* dnl * Caliper instrumentation dnl ********************************************************************* diff --git a/src/configure b/src/configure index 62b3f3fb26..cc2fee9bd2 100755 --- a/src/configure +++ b/src/configure @@ -633,6 +633,8 @@ SUPERLU_LIBS SUPERLU_INCLUDE CALIPER_LIBS CALIPER_INCLUDE +HYPRE_SYCL_LIBS +HYPRE_SYCL_INCL HYPRE_HIP_LIBS HYPRE_HIP_INCL HYPRE_CUDA_LIBS @@ -830,6 +832,7 @@ with_mli with_MPI with_cuda with_hip +with_sycl with_cuda_home with_gpu_arch with_raja @@ -1635,6 +1638,7 @@ Optional Packages: --with-cuda Use CUDA. Require cuda-8.0 or higher (default is NO). --with-hip Use HIP for AMD GPUs. (default is NO). + --with-sycl Use SYCL for Intel GPUs. (default is NO). --with-cuda-home=DIR User specifies CUDA_HOME in DIR. --with-gpu-arch=ARG User specifies NVIDIA GPU architecture that the CUDA files will be compiled for in ARG, where ARG is a @@ -2750,6 +2754,13 @@ hypre_using_rocrand=no hypre_found_hip=no +hypre_using_sycl=no +hypre_using_onemklsparse=no +hypre_using_onemklblas=no +hypre_using_onemklrand=no + +hypre_found_sycl=no + hypre_blas_lib_old_style=no hypre_blas_lib_dir_old_style=no @@ -3953,6 +3964,22 @@ fi + +# Check whether --with-sycl was given. +if test "${with_sycl+set}" = set; then : + withval=$with_sycl; case "$withval" in +yes) hypre_using_sycl=yes ;; +no) hypre_using_sycl=no ;; +*) hypre_using_sycl=no ;; +esac +else + hypre_using_sycl=no + +fi + + + + # Check whether --with-cuda-home was given. 
if test "${with_cuda_home+set}" = set; then : withval=$with_cuda_home; for cuda_dir in $withval; do @@ -8536,7 +8563,7 @@ fi if test "x$hypre_using_um" = "xyes" then - if test "x$hypre_using_cuda" != "xyes" && test "x$hypre_using_device_openmp" != "xyes" && test "x$hypre_using_hip" != "xyes" + if test "x$hypre_using_cuda" != "xyes" && test "x$hypre_using_device_openmp" != "xyes" && test "x$hypre_using_hip" != "xyes" && test "x$hypre_using_sycl" != "xyes" then as_fn_error $? "Asked for unified memory, but not using CUDA, HIP, or device OpenMP!" "$LINENO" 5 fi @@ -9010,6 +9037,105 @@ fi fi +if test x"$hypre_using_sycl" == x"yes"; then : + +# WM: not setting this with sycl for now since it is giving me problems +$as_echo "#define HYPRE_USING_GPU 1" >>confdefs.h + +$as_echo "#define HYPRE_USING_SYCL 1" >>confdefs.h + + + for ac_prog in dpcpp +do + # Extract the first word of "$ac_prog", so it can be a program name with args. +set dummy $ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_CUCC+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$CUCC"; then + ac_cv_prog_CUCC="$CUCC" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_CUCC="$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +CUCC=$ac_cv_prog_CUCC +if test -n "$CUCC"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CUCC" >&5 +$as_echo "$CUCC" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$CUCC" && break +done + + + LINK_CC=${CUCC} + LINK_CXX=${CUCC} + + SYCLCXXFLAGS="-fsycl -fsycl-unnamed-lambda -fsycl-device-code-split=per_kernel " + + if test x"$hypre_using_debug" == x"yes"; then : + SYCLCXXFLAGS="-O0 -Wall -g -gdbx ${SYCLCXXFLAGS}" +elif SYCLCXXFLAGS="-O2 ${SYCLCXXFLAGS}"; then : + +fi + + CUFLAGS="${SYCLCXXFLAGS}" + + HYPRE_SYCL_INCL="-I${ONEAPI_PATH}/dpl/latest/linux/include" + HYPRE_SYCL_INCL="${HYPRE_SYCL_INCL} -I${ONEAPI_PATH}/dpcpp-ct/latest/include" + + + if test x"$hypre_using_onemklsparse" == x"yes"; then : + +$as_echo "#define HYPRE_USING_ONEMKLSPARSE 1" >>confdefs.h + + HYPRE_SYCL_LIBS="${HYPRE_SYCL_LIBS} -lmkl_sycl" + HYPRE_SYCL_INCL="${HYPRE_SYCL_INCL} -I${MKLROOT}/include/mkl/spblas.hpp" + +fi + + if test x"$hypre_using_onemklblas" == x"yes"; then : + +$as_echo "#define HYPRE_USING_ONEMKLBLAS 1" >>confdefs.h + + HYPRE_SYCL_LIBS="${HYPRE_SYCL_LIBS} -lmkl_sycl" + HYPRE_SYCL_INCL="${HYPRE_SYCL_INCL} -I${MKLROOT}/include/oneapi/mkl/blas.hpp" + +fi + + if test x"$hypre_using_onemklrand" == x"yes"; then : + +$as_echo "#define HYPRE_USING_ONEMKLRAND 1" >>confdefs.h + + HYPRE_SYCL_LIBS="${HYPRE_SYCL_LIBS} -lmkl_sycl" + HYPRE_SYCL_INCL="${HYPRE_SYCL_INCL} -I${MKLROOT}/include/oneapi/mkl/rng.hpp" + +fi + + +fi + + if test "$hypre_using_um" != "yes" then @@ -9038,6 +9164,19 @@ $as_echo "$as_me: It only works for structured solvers and selected unstructured $as_echo "$as_me: Use --enable-unified-memory to compile with unified 
memory." >&6;} { $as_echo "$as_me:${as_lineno-$LINENO}: ***********************************************************************" >&5 $as_echo "$as_me: ***********************************************************************" >&6;} + fi + if test "$hypre_using_sycl" = "yes" + then + { $as_echo "$as_me:${as_lineno-$LINENO}: ***********************************************************" >&5 +$as_echo "$as_me: ***********************************************************" >&6;} + { $as_echo "$as_me:${as_lineno-$LINENO}: Configuring with --with-sycl=yes without unified memory." >&5 +$as_echo "$as_me: Configuring with --with-sycl=yes without unified memory." >&6;} + { $as_echo "$as_me:${as_lineno-$LINENO}: It only works for struct interface." >&5 +$as_echo "$as_me: It only works for struct interface." >&6;} + { $as_echo "$as_me:${as_lineno-$LINENO}: Use --enable-unified-memory to compile with unified memory." >&5 +$as_echo "$as_me: Use --enable-unified-memory to compile with unified memory." >&6;} + { $as_echo "$as_me:${as_lineno-$LINENO}: ***********************************************************" >&5 +$as_echo "$as_me: ***********************************************************" >&6;} fi if test "$hypre_using_device_openmp" = "yes" then diff --git a/src/test/Makefile b/src/test/Makefile index 8922d0ab2d..10c3ac32cf 100644 --- a/src/test/Makefile +++ b/src/test/Makefile @@ -64,6 +64,7 @@ LFLAGS =\ HYPRE_DRIVERS =\ ij.c\ + simple.c\ ij_assembly.c\ sstruct.c\ struct.c\ @@ -143,6 +144,10 @@ ij: ij.${OBJ_SUFFIX} @echo "Building" $@ "... " ${LINK_CC} -o $@ $< ${LFLAGS} +simple: simple.${OBJ_SUFFIX} + @echo "Building" $@ "... " + ${LINK_CC} -o $@ $< ${LFLAGS} + ij_assembly: ij_assembly.${OBJ_SUFFIX} @echo "Building" $@ "... 
" ${LINK_CC} -o $@ $< ${LFLAGS} diff --git a/src/utilities/Makefile b/src/utilities/Makefile index 8281d38f33..4b37e1cd3e 100644 --- a/src/utilities/Makefile +++ b/src/utilities/Makefile @@ -62,8 +62,9 @@ CUFILES=\ general.c\ handle.c\ memory.c\ - omp_device.c \ - nvtx.c + omp_device.c\ + nvtx.c\ + sycl_utils.c COBJS = ${FILES:.c=.o} CUOBJS = ${CUFILES:.c=.obj} diff --git a/src/utilities/_hypre_utilities.h b/src/utilities/_hypre_utilities.h index c2a56322d2..503c13f2d3 100644 --- a/src/utilities/_hypre_utilities.h +++ b/src/utilities/_hypre_utilities.h @@ -1209,6 +1209,9 @@ static char hypre__levelname[16]; struct hypre_CudaData; typedef struct hypre_CudaData hypre_CudaData; +struct hypre_SyclData; +typedef struct hypre_SyclData hypre_SyclData; + typedef struct { HYPRE_Int hypre_error; @@ -1234,6 +1237,9 @@ typedef struct HYPRE_Int own_umpire_pinned_pool; umpire_resourcemanager umpire_rm; #endif +#if defined(HYPRE_USING_SYCL) + hypre_SyclData *sycl_data; +#endif } hypre_Handle; /* accessor macros to hypre_Handle */ @@ -1241,6 +1247,7 @@ typedef struct #define hypre_HandleDefaultExecPolicy(hypre_handle) ((hypre_handle) -> default_exec_policy) #define hypre_HandleStructExecPolicy(hypre_handle) ((hypre_handle) -> struct_exec_policy) #define hypre_HandleCudaData(hypre_handle) ((hypre_handle) -> cuda_data) +#define hypre_HandleSyclData(hypre_handle) ((hypre_handle) -> sycl_data) #define hypre_HandleCurandGenerator(hypre_handle) hypre_CudaDataCurandGenerator(hypre_HandleCudaData(hypre_handle)) #define hypre_HandleCublasHandle(hypre_handle) hypre_CudaDataCublasHandle(hypre_HandleCudaData(hypre_handle)) @@ -1283,6 +1290,9 @@ typedef struct #define hypre_HandleOwnUmpireHostPool(hypre_handle) ((hypre_handle) -> own_umpire_host_pool) #define hypre_HandleOwnUmpirePinnedPool(hypre_handle) ((hypre_handle) -> own_umpire_pinned_pool) +#define hypre_HandleSyclComputeQueue(hypre_handle) hypre_SyclDataSyclComputeQueue(hypre_HandleSyclData(hypre_handle)) +#define 
hypre_HandleSyclDevice(hypre_handle) hypre_SyclDataSyclDevice(hypre_HandleSyclData(hypre_handle)) +#define hypre_HandleSyclComputeQueueNum(hypre_handle) hypre_SyclDataSyclComputeQueueNum(hypre_HandleSyclData(hypre_handle)) #endif /****************************************************************************** * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other diff --git a/src/utilities/_hypre_utilities.hpp b/src/utilities/_hypre_utilities.hpp index 75fe8ecb02..4d4eed5220 100644 --- a/src/utilities/_hypre_utilities.hpp +++ b/src/utilities/_hypre_utilities.hpp @@ -2111,6 +2111,81 @@ struct hypre_cub_CachingDeviceAllocator #endif // #if defined(HYPRE_USING_CUDA) && defined(HYPRE_USING_DEVICE_POOL) #endif // #ifndef HYPRE_CUB_ALLOCATOR_HEADER +/****************************************************************************** + * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other + * HYPRE Project Developers. See the top-level COPYRIGHT file for details. 
+ * + * SPDX-License-Identifier: (Apache-2.0 OR MIT) + ******************************************************************************/ + +#ifndef HYPRE_SYCL_UTILS_HPP +#define HYPRE_SYCL_UTILS_HPP + +#if defined(HYPRE_USING_SYCL) + +/* #include */ +/* #include */ +/* #include */ +/* #include */ + +/* #include // dpct::remove_if, remove_copy_if, copy_if */ + +/* #include */ +/* #include */ +/* #include */ +/* #include */ + +#include +/* #include */ +/* #include */ + +#define HYPRE_SYCL_CALL( EXPR ) \ + try \ + { \ + EXPR; \ + } \ + catch (sycl::exception const &ex) \ + { \ + hypre_printf("SYCL ERROR (code = %s) at %s:%d\n", ex.what(), \ + __FILE__, __LINE__); \ + assert(0); exit(1); \ + } \ + catch(std::runtime_error const& ex) { \ + hypre_printf("STD ERROR (code = %s) at %s:%d\n", ex.what(), \ + __FILE__, __LINE__); \ + assert(0); exit(1); \ + } + +// HYPRE_SUBGROUP_BITSHIFT is just log2 of HYPRE_SUBGROUP_SIZE +#define HYPRE_SUBGROUP_SIZE 32 +#define HYPRE_SUBGROUP_BITSHIFT 5 +#define HYPRE_MAX_NUM_SUBGROUPS (64 * 64 * 32) +#define HYPRE_FLT_LARGE 1e30 +#define HYPRE_1D_BLOCK_SIZE 512 +#define HYPRE_MAX_NUM_QUEUES 10 + +struct hypre_SyclData +{ + sycl::queue* sycl_queues[HYPRE_MAX_NUM_QUEUES] = {}; + sycl::device sycl_device; + + /* by default, hypre puts GPU computations in this queue + * Do not be confused with the default (null) SYCL queue */ + HYPRE_Int sycl_compute_queue_num; +}; + +#define hypre_SyclDataSyclDevice(data) ((data) -> sycl_device) +#define hypre_SyclDataSyclComputeQueueNum(data) ((data) -> sycl_compute_queue_num) + +hypre_SyclData* hypre_SyclDataCreate(); +void hypre_SyclDataDestroy(hypre_SyclData* data); + +sycl::queue *hypre_SyclDataSyclQueue(hypre_SyclData *data, HYPRE_Int i); +sycl::queue *hypre_SyclDataSyclComputeQueue(hypre_SyclData *data); + +#endif // #if defined(HYPRE_USING_SYCL) + +#endif /* #ifndef HYPRE_SYCL_UTILS_HPP */ #ifdef __cplusplus } diff --git a/src/utilities/general.c b/src/utilities/general.c index f52820f5e6..11d747afad 
100644 --- a/src/utilities/general.c +++ b/src/utilities/general.c @@ -54,6 +54,12 @@ hypre_HandleCreate() hypre_HandleCudaData(hypre_handle_) = hypre_CudaDataCreate(); #endif +#if defined(HYPRE_USING_SYCL) + hypre_HandleDefaultExecPolicy(hypre_handle_) = HYPRE_EXEC_HOST; + hypre_HandleStructExecPolicy(hypre_handle_) = HYPRE_EXEC_HOST; + hypre_HandleSyclData(hypre_handle_) = hypre_SyclDataCreate(); +#endif + return hypre_handle_; } @@ -69,6 +75,10 @@ hypre_HandleDestroy(hypre_Handle *hypre_handle_) hypre_CudaDataDestroy(hypre_HandleCudaData(hypre_handle_)); #endif +#if defined(HYPRE_USING_SYCL) + hypre_SyclDataDestroy(hypre_HandleSyclData(hypre_handle_)); +#endif + hypre_TFree(hypre_handle_, HYPRE_MEMORY_HOST); return hypre_error_flag; @@ -98,6 +108,67 @@ hypre_SetDevice(hypre_int device_id, hypre_Handle *hypre_handle_) } #endif +#if defined(HYPRE_USING_SYCL) + // WM: TODO - this ain't it... + hypre_int nDevices=0; + sycl::platform platform(sycl::gpu_selector{}); + auto const& gpu_devices = platform.get_devices(); + for (int i = 0; i < gpu_devices.size(); i++) + { + if (gpu_devices[i].is_gpu()) + { + if(gpu_devices[i].get_info() > 0) + { + auto subDevicesDomainNuma = gpu_devices[i].create_sub_devices( + sycl::info::partition_affinity_domain::numa); + nDevices += subDevicesDomainNuma.size(); + } + else + { + nDevices++; + } + } + } + + if (device_id > nDevices) + { + // WM: debug + hypre_printf("device_id = %d, nDevices = %d\n", device_id, nDevices); + hypre_printf("ERROR: SYCL device-ID exceed the number of devices on-node... 
\n"); + } + + HYPRE_Int local_nDevices=0; + for (int i = 0; i < gpu_devices.size(); i++) + { + if (gpu_devices[i].is_gpu()) + { + // multi-tile GPUs + if (gpu_devices[i].get_info() > 0) + { + auto subDevicesDomainNuma = gpu_devices[i].create_sub_devices( + sycl::info::partition_affinity_domain::numa); + for (const auto &tile : subDevicesDomainNuma) + { + if (local_nDevices == device_id) + { + hypre_HandleSyclDevice(hypre_handle_) = tile; + } + local_nDevices++; + } + } + // single-tile GPUs + else + { + if (local_nDevices == device_id) + { + hypre_HandleSyclDevice(hypre_handle_) = gpu_devices[i]; + } + local_nDevices++; + } + } + } +#endif + return hypre_error_flag; } @@ -119,6 +190,10 @@ hypre_GetDevice(hypre_int *device_id) HYPRE_HIP_CALL( hipGetDevice(device_id) ); #endif +#if defined(HYPRE_USING_SYCL) + // WM: TODO +#endif + return hypre_error_flag; } @@ -137,6 +212,28 @@ hypre_GetDeviceCount(hypre_int *device_count) HYPRE_HIP_CALL( hipGetDeviceCount(device_count) ); #endif +#if defined(HYPRE_USING_SYCL) + // WM: TODO - verify + sycl::platform platform(sycl::gpu_selector{}); + auto const& gpu_devices = platform.get_devices(); + for (int i = 0; i < gpu_devices.size(); i++) + { + if (gpu_devices[i].is_gpu()) + { + if(gpu_devices[i].get_info() > 0) + { + auto subDevicesDomainNuma = gpu_devices[i].create_sub_devices( + sycl::info::partition_affinity_domain::numa); + (*device_count) += subDevicesDomainNuma.size(); + } + else + { + (*device_count)++; + } + } + } +#endif + return hypre_error_flag; } @@ -155,6 +252,10 @@ hypre_GetDeviceLastError() HYPRE_HIP_CALL( hipGetLastError() ); #endif +#if defined(HYPRE_USING_SYCL) + // WM: TODO +#endif + return hypre_error_flag; } @@ -179,7 +280,7 @@ HYPRE_Init() _hypre_handle = hypre_HandleCreate(); } -#if defined(HYPRE_USING_GPU) +#if defined(HYPRE_USING_GPU) || defined(HYPRE_USING_SYCL) hypre_GetDeviceLastError(); /* Notice: the cudaStream created is specific to the device @@ -192,7 +293,12 @@ HYPRE_Init() /* To include the 
cost of creating streams/cudahandles in HYPRE_Init */ /* If not here, will be done at the first use */ +#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) hypre_HandleCudaComputeStream(_hypre_handle); +#endif +#if defined(HYPRE_USING_SYCL) + hypre_HandleSyclComputeQueue(_hypre_handle); +#endif /* A separate stream for prefetching */ //hypre_HandleCudaPrefetchStream(_hypre_handle); @@ -298,6 +404,10 @@ HYPRE_PrintDeviceInfo() hypre_printf("Running on \"%s\", major %d, minor %d, total memory %.2f GB\n", deviceProp.name, deviceProp.major, deviceProp.minor, deviceProp.totalGlobalMem/1e9); #endif +#if defined(HYPRE_USING_SYCL) + // WM: TODO +#endif + return hypre_error_flag; } diff --git a/src/utilities/handle.h b/src/utilities/handle.h index 2055073e8e..2c2bccfcc8 100644 --- a/src/utilities/handle.h +++ b/src/utilities/handle.h @@ -17,6 +17,9 @@ struct hypre_CudaData; typedef struct hypre_CudaData hypre_CudaData; +struct hypre_SyclData; +typedef struct hypre_SyclData hypre_SyclData; + typedef struct { HYPRE_Int hypre_error; @@ -42,6 +45,9 @@ typedef struct HYPRE_Int own_umpire_pinned_pool; umpire_resourcemanager umpire_rm; #endif +#if defined(HYPRE_USING_SYCL) + hypre_SyclData *sycl_data; +#endif } hypre_Handle; /* accessor macros to hypre_Handle */ @@ -49,6 +55,7 @@ typedef struct #define hypre_HandleDefaultExecPolicy(hypre_handle) ((hypre_handle) -> default_exec_policy) #define hypre_HandleStructExecPolicy(hypre_handle) ((hypre_handle) -> struct_exec_policy) #define hypre_HandleCudaData(hypre_handle) ((hypre_handle) -> cuda_data) +#define hypre_HandleSyclData(hypre_handle) ((hypre_handle) -> sycl_data) #define hypre_HandleCurandGenerator(hypre_handle) hypre_CudaDataCurandGenerator(hypre_HandleCudaData(hypre_handle)) #define hypre_HandleCublasHandle(hypre_handle) hypre_CudaDataCublasHandle(hypre_HandleCudaData(hypre_handle)) @@ -91,4 +98,7 @@ typedef struct #define hypre_HandleOwnUmpireHostPool(hypre_handle) ((hypre_handle) -> own_umpire_host_pool) #define 
hypre_HandleOwnUmpirePinnedPool(hypre_handle) ((hypre_handle) -> own_umpire_pinned_pool) +#define hypre_HandleSyclComputeQueue(hypre_handle) hypre_SyclDataSyclComputeQueue(hypre_HandleSyclData(hypre_handle)) +#define hypre_HandleSyclDevice(hypre_handle) hypre_SyclDataSyclDevice(hypre_HandleSyclData(hypre_handle)) +#define hypre_HandleSyclComputeQueueNum(hypre_handle) hypre_SyclDataSyclComputeQueueNum(hypre_HandleSyclData(hypre_handle)) #endif diff --git a/src/utilities/headers b/src/utilities/headers index cf74476aa0..0c96b33fae 100755 --- a/src/utilities/headers +++ b/src/utilities/headers @@ -92,6 +92,7 @@ cat umpire_allocator.h >> $INTERNAL_HEADER cat cuda_utils.h >> $INTERNAL_HEADER cat cuda_reducer.h >> $INTERNAL_HEADER cat cub_allocator.h >> $INTERNAL_HEADER +cat sycl_utils.h >> $INTERNAL_HEADER #=========================================================================== # Include guards diff --git a/src/utilities/memory.c b/src/utilities/memory.c index 41005fe8dd..5dc5af7ea8 100644 --- a/src/utilities/memory.c +++ b/src/utilities/memory.c @@ -76,6 +76,10 @@ hypre_DeviceMemset(void *ptr, HYPRE_Int value, size_t num) #if defined(HYPRE_USING_HIP) HYPRE_HIP_CALL( hipMemset(ptr, value, num) ); #endif + +#if defined(HYPRE_USING_SYCL) + (hypre_HandleSyclComputeQueue(hypre_handle()))->memset(ptr, value, num).wait(); +#endif } static inline void @@ -93,6 +97,10 @@ hypre_UnifiedMemset(void *ptr, HYPRE_Int value, size_t num) #if defined(HYPRE_USING_HIP) HYPRE_HIP_CALL( hipMemset(ptr, value, num) ); #endif + +#if defined(HYPRE_USING_SYCL) + (hypre_HandleSyclComputeQueue(hypre_handle()))->memset(ptr, value, num).wait(); +#endif } /*-------------------------------------------------------------------------- @@ -152,6 +160,10 @@ hypre_UnifiedMemPrefetch(void *ptr, size_t size, hypre_MemoryLocation location) *} */ #endif + +#if defined(HYPRE_USING_SYCL) + // WM: TODO +#endif } /*-------------------------------------------------------------------------- @@ -215,6 +227,10 @@ 
hypre_DeviceMalloc(size_t size, HYPRE_Int zeroinit) HYPRE_HIP_CALL( hipMalloc(&ptr, size) ); #endif +#if defined(HYPRE_USING_SYCL) + ptr = (void *)sycl::malloc_device(size, *(hypre_HandleSyclComputeQueue(hypre_handle()))); +#endif + #endif /* #if defined(HYPRE_USING_UMPIRE_DEVICE) */ if (ptr && zeroinit) @@ -250,6 +266,10 @@ hypre_UnifiedMalloc(size_t size, HYPRE_Int zeroinit) HYPRE_HIP_CALL( hipMallocManaged(&ptr, size, hipMemAttachGlobal) ); #endif +#if defined(HYPRE_USING_SYCL) + ptr = (void *)sycl::malloc_shared(size, *(hypre_HandleSyclComputeQueue(hypre_handle()))); +#endif + #endif /* #if defined(HYPRE_USING_UMPIRE_UM) */ /* prefecth to device */ @@ -287,6 +307,10 @@ hypre_HostPinnedMalloc(size_t size, HYPRE_Int zeroinit) HYPRE_HIP_CALL( hipHostMalloc(&ptr, size) ); #endif +#if defined(HYPRE_USING_SYCL) + ptr = (void *)sycl::malloc_host(size, *(hypre_HandleSyclComputeQueue(hypre_handle()))); +#endif + #endif /* #if defined(HYPRE_USING_UMPIRE_PINNED) */ if (ptr && zeroinit) @@ -380,6 +404,10 @@ hypre_DeviceFree(void *ptr) HYPRE_HIP_CALL( hipFree(ptr) ); #endif +#if defined(HYPRE_USING_SYCL) + sycl::free(ptr, *(hypre_HandleSyclComputeQueue(hypre_handle()))); +#endif + #endif /* #if defined(HYPRE_USING_UMPIRE_DEVICE) */ } @@ -406,6 +434,10 @@ hypre_UnifiedFree(void *ptr) HYPRE_HIP_CALL( hipFree(ptr) ); #endif +#if defined(HYPRE_USING_SYCL) + sycl::free(ptr, *(hypre_HandleSyclComputeQueue(hypre_handle()))); +#endif + #endif /* #if defined(HYPRE_USING_UMPIRE_UM) */ } @@ -428,6 +460,10 @@ hypre_HostPinnedFree(void *ptr) HYPRE_HIP_CALL( hipHostFree(ptr) ); #endif +#if defined(HYPRE_USING_SYCL) + sycl::free(ptr, *(hypre_HandleSyclComputeQueue(hypre_handle()))); +#endif + #endif /* #if defined(HYPRE_USING_UMPIRE_PINNED) */ } @@ -479,6 +515,10 @@ _hypre_Free(void *ptr, hypre_MemoryLocation location) static inline void hypre_Memcpy_core(void *dst, void *src, size_t size, hypre_MemoryLocation loc_dst, hypre_MemoryLocation loc_src) { +#if defined(HYPRE_USING_SYCL) + 
sycl::queue* q = hypre_HandleSyclComputeQueue(hypre_handle()); +#endif + if (dst == NULL || src == NULL) { if (size) @@ -524,6 +564,10 @@ hypre_Memcpy_core(void *dst, void *src, size_t size, hypre_MemoryLocation loc_ds #if defined(HYPRE_USING_HIP) HYPRE_HIP_CALL( hipMemcpy(dst, src, size, hipMemcpyDeviceToDevice) ); #endif + +#if defined(HYPRE_USING_SYCL) + q->memcpy(dst, src, size).wait(); +#endif return; } @@ -542,6 +586,10 @@ hypre_Memcpy_core(void *dst, void *src, size_t size, hypre_MemoryLocation loc_ds #if defined(HYPRE_USING_HIP) HYPRE_HIP_CALL( hipMemcpy(dst, src, size, hipMemcpyHostToDevice) ); #endif + +#if defined(HYPRE_USING_SYCL) + q->memcpy(dst, src, size).wait(); +#endif return; } @@ -560,6 +608,10 @@ hypre_Memcpy_core(void *dst, void *src, size_t size, hypre_MemoryLocation loc_ds #if defined(HYPRE_USING_HIP) HYPRE_HIP_CALL( hipMemcpy(dst, src, size, hipMemcpyDeviceToHost) ); #endif + +#if defined(HYPRE_USING_SYCL) + q->memcpy(dst, src, size).wait(); +#endif return; } @@ -583,6 +635,10 @@ hypre_Memcpy_core(void *dst, void *src, size_t size, hypre_MemoryLocation loc_ds #if defined(HYPRE_USING_HIP) HYPRE_HIP_CALL( hipMemcpy(dst, src, size, hipMemcpyHostToDevice) ); #endif + +#if defined(HYPRE_USING_SYCL) + q->memcpy(dst, src, size).wait(); +#endif return; } @@ -606,6 +662,10 @@ hypre_Memcpy_core(void *dst, void *src, size_t size, hypre_MemoryLocation loc_ds #if defined(HYPRE_USING_HIP) HYPRE_HIP_CALL( hipMemcpy(dst, src, size, hipMemcpyDeviceToHost) ); #endif + +#if defined(HYPRE_USING_SYCL) + q->memcpy(dst, src, size).wait(); +#endif return; } @@ -630,6 +690,10 @@ hypre_Memcpy_core(void *dst, void *src, size_t size, hypre_MemoryLocation loc_ds #if defined(HYPRE_USING_HIP) HYPRE_HIP_CALL( hipMemcpy(dst, src, size, hipMemcpyDeviceToDevice) ); #endif + +#if defined(HYPRE_USING_SYCL) + q->memcpy(dst, src, size).wait(); +#endif return; } @@ -654,7 +718,7 @@ hypre_GetExecPolicy1_core(hypre_MemoryLocation location) exec = HYPRE_EXEC_DEVICE; break; case 
hypre_MEMORY_UNIFIED : -#if defined(HYPRE_USING_GPU) +#if defined(HYPRE_USING_GPU) || defined(HYPRE_USING_SYCL) exec = hypre_HandleDefaultExecPolicy(hypre_handle()); #endif break; @@ -701,7 +765,7 @@ hypre_GetExecPolicy2_core(hypre_MemoryLocation location1, if (location1 == hypre_MEMORY_UNIFIED && location2 == hypre_MEMORY_UNIFIED) { -#if defined(HYPRE_USING_GPU) +#if defined(HYPRE_USING_GPU) || defined(HYPRE_USING_SYCL) exec = hypre_HandleDefaultExecPolicy(hypre_handle()); #endif } @@ -907,7 +971,7 @@ hypre_GetPointerLocation(const void *ptr, hypre_MemoryLocation *memory_location) { HYPRE_Int ierr = 0; -#if defined(HYPRE_USING_GPU) +#if defined(HYPRE_USING_GPU) || defined(HYPRE_USING_SYCL) *memory_location = hypre_MEMORY_UNDEFINED; #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) @@ -1002,7 +1066,30 @@ hypre_GetPointerLocation(const void *ptr, hypre_MemoryLocation *memory_location) } #endif // defined(HYPRE_USING_HIP) -#else /* #if defined(HYPRE_USING_GPU) */ +#if defined(HYPRE_USING_SYCL) + *memory_location = hypre_MEMORY_UNDEFINED; + sycl::usm::alloc allocType; + allocType = sycl::get_pointer_type(ptr, (hypre_HandleSyclComputeQueue(hypre_handle()))->get_context()); + + if (allocType == sycl::usm::alloc::unknown) + { + *memory_location = hypre_MEMORY_HOST; + } + else if (allocType == sycl::usm::alloc::host) + { + *memory_location = hypre_MEMORY_HOST_PINNED; + } + else if (allocType == sycl::usm::alloc::device) + { + *memory_location = hypre_MEMORY_DEVICE; + } + else if (allocType == sycl::usm::alloc::shared) + { + *memory_location = hypre_MEMORY_UNIFIED; + } +#endif //HYPRE_USING_SYCL + +#else /* #if defined(HYPRE_USING_GPU) || defined(HYPRE_USING_SYCL) */ *memory_location = hypre_MEMORY_HOST; #endif From 61d0edbd021044e98f099856db5bb2329e403706 Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Tue, 27 Jul 2021 20:57:38 +0000 Subject: [PATCH 02/44] Change names and fix initialization This does a bunch of name changing of files, data 
structures, and variables from 'cuda' to 'device' in order to reflect which things are generic device functionality vs. tied to a specific language. In addition, this now compiles and runs a simple program that calls HYPRE_Init() and allocates/copies/frees memory on the device and host with unified memory. --- src/seq_mv/csr_matvec_device.c | 3 +- src/struct_ls/HYPRE_struct_int.c | 4 +- src/utilities/CMakeLists.txt | 4 +- src/utilities/Makefile | 5 +- src/utilities/_hypre_utilities.h | 70 ++--- src/utilities/_hypre_utilities.hpp | 256 +++++++++--------- .../{cuda_reducer.h => device_reducer.h} | 6 +- .../{cuda_utils.c => device_utils.c} | 140 ++++++---- .../{cuda_utils.h => device_utils.h} | 175 ++++++++---- src/utilities/general.c | 101 ++----- src/utilities/handle.h | 68 ++--- src/utilities/headers | 5 +- src/utilities/memory.c | 60 ++-- 13 files changed, 464 insertions(+), 433 deletions(-) rename src/utilities/{cuda_reducer.h => device_reducer.h} (96%) rename src/utilities/{cuda_utils.c => device_utils.c} (90%) rename src/utilities/{cuda_utils.h => device_utils.h} (79%) diff --git a/src/seq_mv/csr_matvec_device.c b/src/seq_mv/csr_matvec_device.c index cd273fd938..4751d7c384 100644 --- a/src/seq_mv/csr_matvec_device.c +++ b/src/seq_mv/csr_matvec_device.c @@ -51,7 +51,8 @@ hypre_CSRMatrixMatvecDevice2( HYPRE_Int trans, #elif defined(HYPRE_USING_ROCSPARSE) hypre_CSRMatrixMatvecRocsparse(trans, alpha, A, x, beta, y, offset); #else // #ifdef HYPRE_USING_CUSPARSE -#error HYPRE SPMV TODO +// WM: TODO: commenting this out for now, but put it back after sycl impelentation is done +/* #error HYPRE SPMV TODO */ #endif return hypre_error_flag; diff --git a/src/struct_ls/HYPRE_struct_int.c b/src/struct_ls/HYPRE_struct_int.c index e9048acbf7..abb1869fcd 100644 --- a/src/struct_ls/HYPRE_struct_int.c +++ b/src/struct_ls/HYPRE_struct_int.c @@ -71,7 +71,9 @@ hypre_StructVectorSetRandomValues( hypre_StructVector *vector, hypre_BoxLoop1Begin(hypre_StructVectorNDim(vector), 
loop_size, v_data_box, start, unit_stride, vi); { -#if defined(HYPRE_USING_GPU) +// WM: TODO: temporary fix... remove after sycl implementation is done +#if defined(HYPRE_USING_SYCL) +#elif defined(HYPRE_USING_GPU) vp[vi] = rand_device[idx]; #else vp[vi] = 2.0*hypre_Rand() - 1.0; diff --git a/src/utilities/CMakeLists.txt b/src/utilities/CMakeLists.txt index 33c00cc53a..cd51132a21 100644 --- a/src/utilities/CMakeLists.txt +++ b/src/utilities/CMakeLists.txt @@ -22,7 +22,7 @@ set(SRCS fortran_matrix.c ap.c complex.c - cuda_utils.c + device_utils.c error.c general.c handle.c @@ -52,7 +52,7 @@ target_sources(${PROJECT_NAME} if (HYPRE_USING_CUDA) set(CUDA_SRCS HYPRE_handle.c - cuda_utils.c + device_utils.c handle.c general.c memory.c diff --git a/src/utilities/Makefile b/src/utilities/Makefile index 4b37e1cd3e..07581dd7d4 100644 --- a/src/utilities/Makefile +++ b/src/utilities/Makefile @@ -58,13 +58,12 @@ FILES =\ timing.c CUFILES=\ - cuda_utils.c\ + device_utils.c\ general.c\ handle.c\ memory.c\ omp_device.c\ - nvtx.c\ - sycl_utils.c + nvtx.c COBJS = ${FILES:.c=.o} CUOBJS = ${CUFILES:.c=.obj} diff --git a/src/utilities/_hypre_utilities.h b/src/utilities/_hypre_utilities.h index 503c13f2d3..b2da3c2bca 100644 --- a/src/utilities/_hypre_utilities.h +++ b/src/utilities/_hypre_utilities.h @@ -1206,11 +1206,8 @@ static char hypre__levelname[16]; #ifndef HYPRE_HANDLE_H #define HYPRE_HANDLE_H -struct hypre_CudaData; -typedef struct hypre_CudaData hypre_CudaData; - -struct hypre_SyclData; -typedef struct hypre_SyclData hypre_SyclData; +struct hypre_DeviceData; +typedef struct hypre_DeviceData hypre_DeviceData; typedef struct { @@ -1219,7 +1216,7 @@ typedef struct HYPRE_ExecutionPolicy default_exec_policy; HYPRE_ExecutionPolicy struct_exec_policy; #if defined(HYPRE_USING_GPU) - hypre_CudaData *cuda_data; + hypre_DeviceData *device_data; #endif #if defined(HYPRE_USING_UMPIRE) char umpire_device_pool_name[HYPRE_UMPIRE_POOL_NAME_MAX_LEN]; @@ -1237,43 +1234,39 @@ typedef struct 
HYPRE_Int own_umpire_pinned_pool; umpire_resourcemanager umpire_rm; #endif -#if defined(HYPRE_USING_SYCL) - hypre_SyclData *sycl_data; -#endif } hypre_Handle; /* accessor macros to hypre_Handle */ #define hypre_HandleMemoryLocation(hypre_handle) ((hypre_handle) -> memory_location) #define hypre_HandleDefaultExecPolicy(hypre_handle) ((hypre_handle) -> default_exec_policy) #define hypre_HandleStructExecPolicy(hypre_handle) ((hypre_handle) -> struct_exec_policy) -#define hypre_HandleCudaData(hypre_handle) ((hypre_handle) -> cuda_data) -#define hypre_HandleSyclData(hypre_handle) ((hypre_handle) -> sycl_data) - -#define hypre_HandleCurandGenerator(hypre_handle) hypre_CudaDataCurandGenerator(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleCublasHandle(hypre_handle) hypre_CudaDataCublasHandle(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleCusparseHandle(hypre_handle) hypre_CudaDataCusparseHandle(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleCudaComputeStream(hypre_handle) hypre_CudaDataCudaComputeStream(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleCubBinGrowth(hypre_handle) hypre_CudaDataCubBinGrowth(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleCubMinBin(hypre_handle) hypre_CudaDataCubMinBin(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleCubMaxBin(hypre_handle) hypre_CudaDataCubMaxBin(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleCubMaxCachedBytes(hypre_handle) hypre_CudaDataCubMaxCachedBytes(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleCubDevAllocator(hypre_handle) hypre_CudaDataCubDevAllocator(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleCubUvmAllocator(hypre_handle) hypre_CudaDataCubUvmAllocator(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleCudaDevice(hypre_handle) hypre_CudaDataCudaDevice(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleCudaComputeStreamNum(hypre_handle) hypre_CudaDataCudaComputeStreamNum(hypre_HandleCudaData(hypre_handle)) -#define 
hypre_HandleCudaReduceBuffer(hypre_handle) hypre_CudaDataCudaReduceBuffer(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleStructCommRecvBuffer(hypre_handle) hypre_CudaDataStructCommRecvBuffer(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleStructCommSendBuffer(hypre_handle) hypre_CudaDataStructCommSendBuffer(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleStructCommRecvBufferSize(hypre_handle) hypre_CudaDataStructCommRecvBufferSize(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleStructCommSendBufferSize(hypre_handle) hypre_CudaDataStructCommSendBufferSize(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleSpgemmUseCusparse(hypre_handle) hypre_CudaDataSpgemmUseCusparse(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleSpgemmNumPasses(hypre_handle) hypre_CudaDataSpgemmNumPasses(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleSpgemmRownnzEstimateMethod(hypre_handle) hypre_CudaDataSpgemmRownnzEstimateMethod(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleSpgemmRownnzEstimateNsamples(hypre_handle) hypre_CudaDataSpgemmRownnzEstimateNsamples(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleSpgemmRownnzEstimateMultFactor(hypre_handle) hypre_CudaDataSpgemmRownnzEstimateMultFactor(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleSpgemmHashType(hypre_handle) hypre_CudaDataSpgemmHashType(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleUmpireDeviceAllocator(hypre_handle) hypre_CudaDataUmpireDeviceAllocator(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleUseGpuRand(hypre_handle) hypre_CudaDataUseGpuRand(hypre_HandleCudaData(hypre_handle)) +#define hypre_HandleDeviceData(hypre_handle) ((hypre_handle) -> device_data) + +#define hypre_HandleCurandGenerator(hypre_handle) hypre_DeviceDataCurandGenerator(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleCublasHandle(hypre_handle) hypre_DeviceDataCublasHandle(hypre_HandleDeviceData(hypre_handle)) +#define 
hypre_HandleCusparseHandle(hypre_handle) hypre_DeviceDataCusparseHandle(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleComputeStream(hypre_handle) hypre_DeviceDataComputeStream(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleCubBinGrowth(hypre_handle) hypre_DeviceDataCubBinGrowth(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleCubMinBin(hypre_handle) hypre_DeviceDataCubMinBin(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleCubMaxBin(hypre_handle) hypre_DeviceDataCubMaxBin(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleCubMaxCachedBytes(hypre_handle) hypre_DeviceDataCubMaxCachedBytes(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleCubDevAllocator(hypre_handle) hypre_DeviceDataCubDevAllocator(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleCubUvmAllocator(hypre_handle) hypre_DeviceDataCubUvmAllocator(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleDevice(hypre_handle) hypre_DeviceDataDevice(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleComputeStreamNum(hypre_handle) hypre_DeviceDataComputeStreamNum(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleReduceBuffer(hypre_handle) hypre_DeviceDataReduceBuffer(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleStructCommRecvBuffer(hypre_handle) hypre_DeviceDataStructCommRecvBuffer(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleStructCommSendBuffer(hypre_handle) hypre_DeviceDataStructCommSendBuffer(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleStructCommRecvBufferSize(hypre_handle) hypre_DeviceDataStructCommRecvBufferSize(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleStructCommSendBufferSize(hypre_handle) hypre_DeviceDataStructCommSendBufferSize(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleSpgemmUseCusparse(hypre_handle) hypre_DeviceDataSpgemmUseCusparse(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleSpgemmNumPasses(hypre_handle) 
hypre_DeviceDataSpgemmNumPasses(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleSpgemmRownnzEstimateMethod(hypre_handle) hypre_DeviceDataSpgemmRownnzEstimateMethod(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleSpgemmRownnzEstimateNsamples(hypre_handle) hypre_DeviceDataSpgemmRownnzEstimateNsamples(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleSpgemmRownnzEstimateMultFactor(hypre_handle) hypre_DeviceDataSpgemmRownnzEstimateMultFactor(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleSpgemmHashType(hypre_handle) hypre_DeviceDataSpgemmHashType(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleUmpireDeviceAllocator(hypre_handle) hypre_DeviceDataUmpireDeviceAllocator(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleUseGpuRand(hypre_handle) hypre_DeviceDataUseGpuRand(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleUmpireResourceMan(hypre_handle) ((hypre_handle) -> umpire_rm) #define hypre_HandleUmpireDevicePoolSize(hypre_handle) ((hypre_handle) -> umpire_device_pool_size) @@ -1290,9 +1283,6 @@ typedef struct #define hypre_HandleOwnUmpireHostPool(hypre_handle) ((hypre_handle) -> own_umpire_host_pool) #define hypre_HandleOwnUmpirePinnedPool(hypre_handle) ((hypre_handle) -> own_umpire_pinned_pool) -#define hypre_HandleSyclComputeQueue(hypre_handle) hypre_SyclDataSyclComputeQueue(hypre_HandleSyclData(hypre_handle)) -#define hypre_HandleSyclDevice(hypre_handle) hypre_SyclDataSyclDevice(hypre_HandleSyclData(hypre_handle)) -#define hypre_HandleSyclComputeQueueNum(hypre_handle) hypre_SyclDataSyclComputeQueueNum(hypre_HandleSyclData(hypre_handle)) #endif /****************************************************************************** * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other diff --git a/src/utilities/_hypre_utilities.hpp b/src/utilities/_hypre_utilities.hpp index 4d4eed5220..c95eba5773 100644 --- a/src/utilities/_hypre_utilities.hpp +++ b/src/utilities/_hypre_utilities.hpp @@ 
-75,6 +75,10 @@ struct hypre_umpire_device_allocator #if defined(HYPRE_USING_GPU) +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + * cuda includes + * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) #include #include @@ -100,16 +104,45 @@ struct hypre_umpire_device_allocator #define CUSPARSE_NEWAPI_VERSION 11000 +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + * hip includes + * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + #elif defined(HYPRE_USING_HIP) #include +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + * sycl includes + * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +#elif defined(HYPRE_USING_SYCL) + +#include +/* #include */ +/* #include */ +/* #include */ +/* #include */ + +/* #include // dpct::remove_if, remove_copy_if, copy_if */ + +/* #include */ +/* #include */ +/* #include */ +/* #include */ + +/* #include */ +/* #include */ + #endif // defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) #if defined(HYPRE_USING_ROCSPARSE) #include #endif +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + * macros for wrapping cuda/hip/sycl calls for error reporting + * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) #define HYPRE_CUDA_CALL(call) do { \ @@ -129,6 +162,25 @@ struct hypre_umpire_device_allocator hypre_assert(0); exit(1); \ } } while(0) +#elif defined(HYPRE_USING_SYCL) +#define HYPRE_SYCL_CALL(call) \ + try \ + { \ + call; \ + } \ + catch (sycl::exception const &ex) \ + { \ + hypre_printf("SYCL ERROR (code = %s) at %s:%d\n", ex.what(), \ + __FILE__, __LINE__); \ + assert(0); exit(1); \ + } \ + catch(std::runtime_error const& ex) \ + { \ + hypre_printf("STD ERROR (code = %s) at 
%s:%d\n", ex.what(), \ + __FILE__, __LINE__); \ + assert(0); exit(1); \ + } + #endif // defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) #define HYPRE_CUBLAS_CALL(call) do { \ @@ -163,11 +215,12 @@ struct hypre_umpire_device_allocator hypre_assert(0); exit(1); \ } } while(0) -struct hypre_cub_CachingDeviceAllocator; -typedef struct hypre_cub_CachingDeviceAllocator hypre_cub_CachingDeviceAllocator; +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + * device defined values + * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ // HYPRE_WARP_BITSHIFT is just log2 of HYPRE_WARP_SIZE -#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) +#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) || defined(HYPRE_USING_SYCL) #define HYPRE_WARP_SIZE 32 #define HYPRE_WARP_BITSHIFT 5 #elif defined(HYPRE_USING_HIP) @@ -181,7 +234,14 @@ typedef struct hypre_cub_CachingDeviceAllocator hypre_cub_CachingDeviceAllocator #define HYPRE_1D_BLOCK_SIZE 512 #define HYPRE_MAX_NUM_STREAMS 10 -struct hypre_CudaData +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + * device info data structures + * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +struct hypre_cub_CachingDeviceAllocator; +typedef struct hypre_cub_CachingDeviceAllocator hypre_cub_CachingDeviceAllocator; + +struct hypre_DeviceData { #if defined(HYPRE_USING_CURAND) curandGenerator_t curand_generator; @@ -200,9 +260,11 @@ struct hypre_CudaData #endif #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) - cudaStream_t cuda_streams[HYPRE_MAX_NUM_STREAMS]; + cudaStream_t streams[HYPRE_MAX_NUM_STREAMS]; #elif defined(HYPRE_USING_HIP) - hipStream_t cuda_streams[HYPRE_MAX_NUM_STREAMS]; + hipStream_t streams[HYPRE_MAX_NUM_STREAMS]; +#elif defined(HYPRE_USING_SYCL) + sycl::queue* streams[HYPRE_MAX_NUM_STREAMS] = {NULL}; #endif #ifdef HYPRE_USING_DEVICE_POOL @@ -216,12 +278,16 
@@ struct hypre_CudaData #ifdef HYPRE_USING_UMPIRE_DEVICE hypre_umpire_device_allocator umpire_device_allocator; #endif - HYPRE_Int cuda_device; +#if defined(HYPRE_USING_SYCL) + sycl::device device; +#else + HYPRE_Int device; +#endif /* by default, hypre puts GPU computations in this stream - * Do not be confused with the default (null) CUDA stream */ - HYPRE_Int cuda_compute_stream_num; - /* work space for hypre's CUDA reducer */ - void *cuda_reduce_buffer; + * Do not be confused with the default (null) stream */ + HYPRE_Int compute_stream_num; + /* work space for hypre's device reducer */ + void *reduce_buffer; /* the device buffers needed to do MPI communication for struct comm */ HYPRE_Complex* struct_comm_recv_buffer; HYPRE_Complex* struct_comm_send_buffer; @@ -238,53 +304,56 @@ struct hypre_CudaData HYPRE_Int use_gpu_rand; }; -#define hypre_CudaDataCubBinGrowth(data) ((data) -> cub_bin_growth) -#define hypre_CudaDataCubMinBin(data) ((data) -> cub_min_bin) -#define hypre_CudaDataCubMaxBin(data) ((data) -> cub_max_bin) -#define hypre_CudaDataCubMaxCachedBytes(data) ((data) -> cub_max_cached_bytes) -#define hypre_CudaDataCubDevAllocator(data) ((data) -> cub_dev_allocator) -#define hypre_CudaDataCubUvmAllocator(data) ((data) -> cub_uvm_allocator) -#define hypre_CudaDataCudaDevice(data) ((data) -> cuda_device) -#define hypre_CudaDataCudaComputeStreamNum(data) ((data) -> cuda_compute_stream_num) -#define hypre_CudaDataCudaReduceBuffer(data) ((data) -> cuda_reduce_buffer) -#define hypre_CudaDataStructCommRecvBuffer(data) ((data) -> struct_comm_recv_buffer) -#define hypre_CudaDataStructCommSendBuffer(data) ((data) -> struct_comm_send_buffer) -#define hypre_CudaDataStructCommRecvBufferSize(data) ((data) -> struct_comm_recv_buffer_size) -#define hypre_CudaDataStructCommSendBufferSize(data) ((data) -> struct_comm_send_buffer_size) -#define hypre_CudaDataSpgemmUseCusparse(data) ((data) -> spgemm_use_cusparse) -#define hypre_CudaDataSpgemmNumPasses(data) ((data) -> 
spgemm_num_passes) -#define hypre_CudaDataSpgemmRownnzEstimateMethod(data) ((data) -> spgemm_rownnz_estimate_method) -#define hypre_CudaDataSpgemmRownnzEstimateNsamples(data) ((data) -> spgemm_rownnz_estimate_nsamples) -#define hypre_CudaDataSpgemmRownnzEstimateMultFactor(data) ((data) -> spgemm_rownnz_estimate_mult_factor) -#define hypre_CudaDataSpgemmHashType(data) ((data) -> spgemm_hash_type) -#define hypre_CudaDataUmpireDeviceAllocator(data) ((data) -> umpire_device_allocator) -#define hypre_CudaDataUseGpuRand(data) ((data) -> use_gpu_rand) - -hypre_CudaData* hypre_CudaDataCreate(); -void hypre_CudaDataDestroy(hypre_CudaData* data); +#define hypre_DeviceDataCubBinGrowth(data) ((data) -> cub_bin_growth) +#define hypre_DeviceDataCubMinBin(data) ((data) -> cub_min_bin) +#define hypre_DeviceDataCubMaxBin(data) ((data) -> cub_max_bin) +#define hypre_DeviceDataCubMaxCachedBytes(data) ((data) -> cub_max_cached_bytes) +#define hypre_DeviceDataCubDevAllocator(data) ((data) -> cub_dev_allocator) +#define hypre_DeviceDataCubUvmAllocator(data) ((data) -> cub_uvm_allocator) +#define hypre_DeviceDataDevice(data) ((data) -> device) +#define hypre_DeviceDataComputeStreamNum(data) ((data) -> compute_stream_num) +#define hypre_DeviceDataReduceBuffer(data) ((data) -> reduce_buffer) +#define hypre_DeviceDataStructCommRecvBuffer(data) ((data) -> struct_comm_recv_buffer) +#define hypre_DeviceDataStructCommSendBuffer(data) ((data) -> struct_comm_send_buffer) +#define hypre_DeviceDataStructCommRecvBufferSize(data) ((data) -> struct_comm_recv_buffer_size) +#define hypre_DeviceDataStructCommSendBufferSize(data) ((data) -> struct_comm_send_buffer_size) +#define hypre_DeviceDataSpgemmUseCusparse(data) ((data) -> spgemm_use_cusparse) +#define hypre_DeviceDataSpgemmNumPasses(data) ((data) -> spgemm_num_passes) +#define hypre_DeviceDataSpgemmRownnzEstimateMethod(data) ((data) -> spgemm_rownnz_estimate_method) +#define hypre_DeviceDataSpgemmRownnzEstimateNsamples(data) ((data) -> 
spgemm_rownnz_estimate_nsamples) +#define hypre_DeviceDataSpgemmRownnzEstimateMultFactor(data) ((data) -> spgemm_rownnz_estimate_mult_factor) +#define hypre_DeviceDataSpgemmHashType(data) ((data) -> spgemm_hash_type) +#define hypre_DeviceDataUmpireDeviceAllocator(data) ((data) -> umpire_device_allocator) +#define hypre_DeviceDataUseGpuRand(data) ((data) -> use_gpu_rand) + +hypre_DeviceData* hypre_DeviceDataCreate(); +void hypre_DeviceDataDestroy(hypre_DeviceData* data); #if defined(HYPRE_USING_CURAND) -curandGenerator_t hypre_CudaDataCurandGenerator(hypre_CudaData *data); +curandGenerator_t hypre_DeviceDataCurandGenerator(hypre_DeviceData *data); #endif #if defined(HYPRE_USING_CUBLAS) -cublasHandle_t hypre_CudaDataCublasHandle(hypre_CudaData *data); +cublasHandle_t hypre_DeviceDataCublasHandle(hypre_DeviceData *data); #endif #if defined(HYPRE_USING_CUSPARSE) -cusparseHandle_t hypre_CudaDataCusparseHandle(hypre_CudaData *data); +cusparseHandle_t hypre_DeviceDataCusparseHandle(hypre_DeviceData *data); #endif #if defined(HYPRE_USING_ROCSPARSE) -rocsparse_handle hypre_CudaDataCusparseHandle(hypre_CudaData *data); +rocsparse_handle hypre_DeviceDataCusparseHandle(hypre_DeviceData *data); #endif #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) -cudaStream_t hypre_CudaDataCudaStream(hypre_CudaData *data, HYPRE_Int i); -cudaStream_t hypre_CudaDataCudaComputeStream(hypre_CudaData *data); +cudaStream_t hypre_DeviceDataStream(hypre_DeviceData *data, HYPRE_Int i); +cudaStream_t hypre_DeviceDataComputeStream(hypre_DeviceData *data); #elif defined(HYPRE_USING_HIP) -hipStream_t hypre_CudaDataCudaStream(hypre_CudaData *data, HYPRE_Int i); -hipStream_t hypre_CudaDataCudaComputeStream(hypre_CudaData *data); +hipStream_t hypre_DeviceDataStream(hypre_DeviceData *data, HYPRE_Int i); +hipStream_t hypre_DeviceDataComputeStream(hypre_DeviceData *data); +#elif defined(HYPRE_USING_SYCL) +sycl::queue* hypre_DeviceDataStream(hypre_DeviceData *data, HYPRE_Int i); 
+sycl::queue* hypre_DeviceDataComputeStream(hypre_DeviceData *data); #endif // Data structure and accessor routines for Cuda Sparse Triangular Matrices @@ -368,7 +437,7 @@ using namespace thrust::placeholders; } \ else \ { \ - (kernel_name) <<< (gridsize), (blocksize), 0, hypre_HandleCudaComputeStream(hypre_handle()) >>> (__VA_ARGS__); \ + (kernel_name) <<< (gridsize), (blocksize), 0, hypre_HandleComputeStream(hypre_handle()) >>> (__VA_ARGS__); \ } \ hypre_SyncCudaComputeStream(hypre_handle()); \ HYPRE_CUDA_CALL( cudaGetLastError() ); \ @@ -385,7 +454,7 @@ using namespace thrust::placeholders; } \ else \ { \ - (kernel_name) <<< (gridsize), (blocksize), 0, hypre_HandleCudaComputeStream(hypre_handle()) >>> (__VA_ARGS__); \ + (kernel_name) <<< (gridsize), (blocksize), 0, hypre_HandleComputeStream(hypre_handle()) >>> (__VA_ARGS__); \ } \ hypre_SyncCudaComputeStream(hypre_handle()); \ HYPRE_HIP_CALL( hipGetLastError() ); \ @@ -405,7 +474,7 @@ using namespace thrust::placeholders; } \ else \ { \ - (kernel_name) <<< (gridsize), (blocksize), 0, hypre_HandleCudaComputeStream(hypre_handle()) >>> (__VA_ARGS__); \ + (kernel_name) <<< (gridsize), (blocksize), 0, hypre_HandleComputeStream(hypre_handle()) >>> (__VA_ARGS__); \ } \ } @@ -418,26 +487,26 @@ using namespace thrust::placeholders; #if defined(HYPRE_USING_CUDA) #define HYPRE_THRUST_CALL(func_name, ...) \ - thrust::func_name(thrust::cuda::par(hypre_HandleUmpireDeviceAllocator(hypre_handle())).on(hypre_HandleCudaComputeStream(hypre_handle())), __VA_ARGS__); + thrust::func_name(thrust::cuda::par(hypre_HandleUmpireDeviceAllocator(hypre_handle())).on(hypre_HandleComputeStream(hypre_handle())), __VA_ARGS__); #elif defined(HYPRE_USING_HIP) #define HYPRE_THRUST_CALL(func_name, ...) 
\ - thrust::func_name(thrust::hip::par(hypre_HandleUmpireDeviceAllocator(hypre_handle())).on(hypre_HandleCudaComputeStream(hypre_handle())), __VA_ARGS__); + thrust::func_name(thrust::hip::par(hypre_HandleUmpireDeviceAllocator(hypre_handle())).on(hypre_HandleComputeStream(hypre_handle())), __VA_ARGS__); #endif // HYPRE_USING_CUDA #elif HYPRE_USING_DEVICE_POOL #if defined(HYPRE_USING_CUDA) #define HYPRE_THRUST_CALL(func_name, ...) \ - thrust::func_name(thrust::cuda::par(*(hypre_HandleCubDevAllocator(hypre_handle()))).on(hypre_HandleCudaComputeStream(hypre_handle())), __VA_ARGS__); + thrust::func_name(thrust::cuda::par(*(hypre_HandleCubDevAllocator(hypre_handle()))).on(hypre_HandleComputeStream(hypre_handle())), __VA_ARGS__); #endif #else #if defined(HYPRE_USING_CUDA) #define HYPRE_THRUST_CALL(func_name, ...) \ - thrust::func_name(thrust::cuda::par.on(hypre_HandleCudaComputeStream(hypre_handle())), __VA_ARGS__); + thrust::func_name(thrust::cuda::par.on(hypre_HandleComputeStream(hypre_handle())), __VA_ARGS__); #elif defined(HYPRE_USING_HIP) #define HYPRE_THRUST_CALL(func_name, ...) 
\ - thrust::func_name(thrust::hip::par.on(hypre_HandleCudaComputeStream(hypre_handle())), __VA_ARGS__); + thrust::func_name(thrust::hip::par.on(hypre_HandleComputeStream(hypre_handle())), __VA_ARGS__); #endif // HYPRE_USING_CUDA #endif // HYPRE_USING_UMPIRE_DEVICE @@ -972,9 +1041,9 @@ cudaError_t hypre_CachingFreeDevice(void *ptr); cudaError_t hypre_CachingFreeManaged(void *ptr); #endif -hypre_cub_CachingDeviceAllocator * hypre_CudaDataCubCachingAllocatorCreate(hypre_uint bin_growth, hypre_uint min_bin, hypre_uint max_bin, size_t max_cached_bytes, bool skip_cleanup, bool debug, bool use_managed_memory); +hypre_cub_CachingDeviceAllocator * hypre_DeviceDataCubCachingAllocatorCreate(hypre_uint bin_growth, hypre_uint min_bin, hypre_uint max_bin, size_t max_cached_bytes, bool skip_cleanup, bool debug, bool use_managed_memory); -void hypre_CudaDataCubCachingAllocatorDestroy(hypre_CudaData *data); +void hypre_DeviceDataCubCachingAllocatorDestroy(hypre_DeviceData *data); #endif // #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) @@ -1200,13 +1269,13 @@ struct ReduceSum __thread_sum = 0.0; nblocks = -1; - if (hypre_HandleCudaReduceBuffer(hypre_handle()) == NULL) + if (hypre_HandleReduceBuffer(hypre_handle()) == NULL) { /* allocate for the max size for reducing double6 type */ - hypre_HandleCudaReduceBuffer(hypre_handle()) = hypre_TAlloc(HYPRE_double6, 1024, HYPRE_MEMORY_DEVICE); + hypre_HandleReduceBuffer(hypre_handle()) = hypre_TAlloc(HYPRE_double6, 1024, HYPRE_MEMORY_DEVICE); } - d_buf = (T*) hypre_HandleCudaReduceBuffer(hypre_handle()); + d_buf = (T*) hypre_HandleReduceBuffer(hypre_handle()); } /* copy constructor */ @@ -2111,81 +2180,6 @@ struct hypre_cub_CachingDeviceAllocator #endif // #if defined(HYPRE_USING_CUDA) && defined(HYPRE_USING_DEVICE_POOL) #endif // #ifndef HYPRE_CUB_ALLOCATOR_HEADER -/****************************************************************************** - * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other - * HYPRE 
Project Developers. See the top-level COPYRIGHT file for details. - * - * SPDX-License-Identifier: (Apache-2.0 OR MIT) - ******************************************************************************/ - -#ifndef HYPRE_SYCL_UTILS_HPP -#define HYPRE_SYCL_UTILS_HPP - -#if defined(HYPRE_USING_SYCL) - -/* #include */ -/* #include */ -/* #include */ -/* #include */ - -/* #include // dpct::remove_if, remove_copy_if, copy_if */ - -/* #include */ -/* #include */ -/* #include */ -/* #include */ - -#include -/* #include */ -/* #include */ - -#define HYPRE_SYCL_CALL( EXPR ) \ - try \ - { \ - EXPR; \ - } \ - catch (sycl::exception const &ex) \ - { \ - hypre_printf("SYCL ERROR (code = %s) at %s:%d\n", ex.what(), \ - __FILE__, __LINE__); \ - assert(0); exit(1); \ - } \ - catch(std::runtime_error const& ex) { \ - hypre_printf("STD ERROR (code = %s) at %s:%d\n", ex.what(), \ - __FILE__, __LINE__); \ - assert(0); exit(1); \ - } - -// HYPRE_SUBGROUP_BITSHIFT is just log2 of HYPRE_SUBGROUP_SIZE -#define HYPRE_SUBGROUP_SIZE 32 -#define HYPRE_SUBGROUP_BITSHIFT 5 -#define HYPRE_MAX_NUM_SUBGROUPS (64 * 64 * 32) -#define HYPRE_FLT_LARGE 1e30 -#define HYPRE_1D_BLOCK_SIZE 512 -#define HYPRE_MAX_NUM_QUEUES 10 - -struct hypre_SyclData -{ - sycl::queue* sycl_queues[HYPRE_MAX_NUM_QUEUES] = {}; - sycl::device sycl_device; - - /* by default, hypre puts GPU computations in this queue - * Do not be confused with the default (null) SYCL queue */ - HYPRE_Int sycl_compute_queue_num; -}; - -#define hypre_SyclDataSyclDevice(data) ((data) -> sycl_device) -#define hypre_SyclDataSyclComputeQueueNum(data) ((data) -> sycl_compute_queue_num) - -hypre_SyclData* hypre_SyclDataCreate(); -void hypre_SyclDataDestroy(hypre_SyclData* data); - -sycl::queue *hypre_SyclDataSyclQueue(hypre_SyclData *data, HYPRE_Int i); -sycl::queue *hypre_SyclDataSyclComputeQueue(hypre_SyclData *data); - -#endif // #if defined(HYPRE_USING_SYCL) - -#endif /* #ifndef HYPRE_SYCL_UTILS_HPP */ #ifdef __cplusplus } diff --git 
a/src/utilities/cuda_reducer.h b/src/utilities/device_reducer.h similarity index 96% rename from src/utilities/cuda_reducer.h rename to src/utilities/device_reducer.h index d489bb589e..729bbce535 100644 --- a/src/utilities/cuda_reducer.h +++ b/src/utilities/device_reducer.h @@ -211,13 +211,13 @@ struct ReduceSum __thread_sum = 0.0; nblocks = -1; - if (hypre_HandleCudaReduceBuffer(hypre_handle()) == NULL) + if (hypre_HandleReduceBuffer(hypre_handle()) == NULL) { /* allocate for the max size for reducing double6 type */ - hypre_HandleCudaReduceBuffer(hypre_handle()) = hypre_TAlloc(HYPRE_double6, 1024, HYPRE_MEMORY_DEVICE); + hypre_HandleReduceBuffer(hypre_handle()) = hypre_TAlloc(HYPRE_double6, 1024, HYPRE_MEMORY_DEVICE); } - d_buf = (T*) hypre_HandleCudaReduceBuffer(hypre_handle()); + d_buf = (T*) hypre_HandleReduceBuffer(hypre_handle()); } /* copy constructor */ diff --git a/src/utilities/cuda_utils.c b/src/utilities/device_utils.c similarity index 90% rename from src/utilities/cuda_utils.c rename to src/utilities/device_utils.c index 4fd055d90e..f9043d0e35 100644 --- a/src/utilities/cuda_utils.c +++ b/src/utilities/device_utils.c @@ -35,7 +35,7 @@ void hypre_CudaCompileFlagCheck() // This is really only defined for CUDA and not for HIP #if defined(HYPRE_USING_CUDA) - HYPRE_Int device = hypre_HandleCudaDevice(hypre_handle()); + HYPRE_Int device = hypre_HandleDevice(hypre_handle()); struct cudaDeviceProp props; cudaGetDeviceProperties(&props, device); @@ -852,8 +852,10 @@ cudaStream_t cudaStream_t #elif defined(HYPRE_USING_HIP) hipStream_t +#elif defined(HYPRE_USING_SYCL) +sycl::queue* #endif -hypre_CudaDataCudaStream(hypre_CudaData *data, HYPRE_Int i) +hypre_DeviceDataStream(hypre_DeviceData *data, HYPRE_Int i) { #if defined(HYPRE_USING_DEVICE_OPENMP) cudaStream_t stream = 0; @@ -861,6 +863,41 @@ hypre_CudaDataCudaStream(hypre_CudaData *data, HYPRE_Int i) cudaStream_t stream = 0; #elif defined(HYPRE_USING_HIP) hipStream_t stream = 0; +#elif 
defined(HYPRE_USING_SYCL) + sycl::queue *stream = NULL; + if (i >= HYPRE_MAX_NUM_STREAMS) + { + hypre_printf("SYCL queue %d exceeds the max number %d\n", + i, HYPRE_MAX_NUM_STREAMS); + return NULL; + } + if (data->streams[i]) + { + return data->streams[i]; + } + else + { + auto sycl_asynchandler = [] (sycl::exception_list exceptions) + { + for (std::exception_ptr const& e : exceptions) + { + try + { + std::rethrow_exception(e); + } + catch (sycl::exception const& ex) + { + std::cout << "Caught asynchronous SYCL exception:" << std::endl + << ex.what() << ", OpenCL code: " << ex.get_cl_code() << std::endl; + } + } + }; + + sycl::device syclDev = data->device; + sycl::context syclctxt = sycl::context(syclDev, sycl_asynchandler); + stream = new sycl::queue(syclctxt, syclDev, sycl::property_list{sycl::property::queue::in_order{}}); + data->streams[i] = stream; + } #endif #if defined(HYPRE_USING_CUDA_STREAMS) @@ -874,9 +911,9 @@ hypre_CudaDataCudaStream(hypre_CudaData *data, HYPRE_Int i) return NULL; } - if (data->cuda_streams[i]) + if (data->streams[i]) { - return data->cuda_streams[i]; + return data->streams[i]; } #if defined(HYPRE_USING_DEVICE_OPENMP) @@ -888,7 +925,7 @@ hypre_CudaDataCudaStream(hypre_CudaData *data, HYPRE_Int i) HYPRE_HIP_CALL(hipStreamCreateWithFlags(&stream, hipStreamDefault)); #endif - data->cuda_streams[i] = stream; + data->streams[i] = stream; #endif return stream; @@ -900,16 +937,18 @@ cudaStream_t cudaStream_t #elif defined(HYPRE_USING_HIP) hipStream_t +#elif defined(HYPRE_USING_SYCL) +sycl::queue* #endif -hypre_CudaDataCudaComputeStream(hypre_CudaData *data) +hypre_DeviceDataComputeStream(hypre_DeviceData *data) { - return hypre_CudaDataCudaStream(data, - hypre_CudaDataCudaComputeStreamNum(data)); + return hypre_DeviceDataStream(data, + hypre_DeviceDataComputeStreamNum(data)); } #if defined(HYPRE_USING_CURAND) curandGenerator_t -hypre_CudaDataCurandGenerator(hypre_CudaData *data) +hypre_DeviceDataCurandGenerator(hypre_DeviceData *data) { if 
(data->curand_generator) { @@ -919,7 +958,7 @@ hypre_CudaDataCurandGenerator(hypre_CudaData *data) curandGenerator_t gen; HYPRE_CURAND_CALL( curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT) ); HYPRE_CURAND_CALL( curandSetPseudoRandomGeneratorSeed(gen, 1234ULL) ); - HYPRE_CURAND_CALL( curandSetStream(gen, hypre_CudaDataCudaComputeStream(data)) ); + HYPRE_CURAND_CALL( curandSetStream(gen, hypre_DeviceDataComputeStream(data)) ); data->curand_generator = gen; @@ -1005,7 +1044,7 @@ hypre_CurandUniformSingle( HYPRE_Int n, #if defined(HYPRE_USING_CUBLAS) cublasHandle_t -hypre_CudaDataCublasHandle(hypre_CudaData *data) +hypre_DeviceDataCublasHandle(hypre_DeviceData *data) { if (data->cublas_handle) { @@ -1015,7 +1054,7 @@ hypre_CudaDataCublasHandle(hypre_CudaData *data) cublasHandle_t handle; HYPRE_CUBLAS_CALL( cublasCreate(&handle) ); - HYPRE_CUBLAS_CALL( cublasSetStream(handle, hypre_CudaDataCudaComputeStream(data)) ); + HYPRE_CUBLAS_CALL( cublasSetStream(handle, hypre_DeviceDataComputeStream(data)) ); data->cublas_handle = handle; @@ -1025,7 +1064,7 @@ hypre_CudaDataCublasHandle(hypre_CudaData *data) #if defined(HYPRE_USING_CUSPARSE) cusparseHandle_t -hypre_CudaDataCusparseHandle(hypre_CudaData *data) +hypre_DeviceDataCusparseHandle(hypre_DeviceData *data) { if (data->cusparse_handle) { @@ -1035,7 +1074,7 @@ hypre_CudaDataCusparseHandle(hypre_CudaData *data) cusparseHandle_t handle; HYPRE_CUSPARSE_CALL( cusparseCreate(&handle) ); - HYPRE_CUSPARSE_CALL( cusparseSetStream(handle, hypre_CudaDataCudaComputeStream(data)) ); + HYPRE_CUSPARSE_CALL( cusparseSetStream(handle, hypre_DeviceDataComputeStream(data)) ); data->cusparse_handle = handle; @@ -1046,7 +1085,7 @@ hypre_CudaDataCusparseHandle(hypre_CudaData *data) #if defined(HYPRE_USING_ROCSPARSE) rocsparse_handle -hypre_CudaDataCusparseHandle(hypre_CudaData *data) +hypre_DeviceDataCusparseHandle(hypre_DeviceData *data) { if (data->cusparse_handle) { @@ -1056,7 +1095,7 @@ hypre_CudaDataCusparseHandle(hypre_CudaData 
*data) rocsparse_handle handle; HYPRE_ROCSPARSE_CALL( rocsparse_create_handle(&handle) ); - HYPRE_ROCSPARSE_CALL( rocsparse_set_stream(handle, hypre_CudaDataCudaComputeStream(data)) ); + HYPRE_ROCSPARSE_CALL( rocsparse_set_stream(handle, hypre_DeviceDataComputeStream(data)) ); data->cusparse_handle = handle; @@ -1066,58 +1105,62 @@ hypre_CudaDataCusparseHandle(hypre_CudaData *data) -hypre_CudaData* -hypre_CudaDataCreate() +hypre_DeviceData* +hypre_DeviceDataCreate() { - hypre_CudaData *data = hypre_CTAlloc(hypre_CudaData, 1, HYPRE_MEMORY_HOST); + hypre_DeviceData *data = hypre_CTAlloc(hypre_DeviceData, 1, HYPRE_MEMORY_HOST); - hypre_CudaDataCudaDevice(data) = 0; - hypre_CudaDataCudaComputeStreamNum(data) = 0; +#if defined(HYPRE_USING_SYCL) + hypre_DeviceDataDevice(data) = sycl::device(sycl::gpu_selector{}); +#else + hypre_DeviceDataDevice(data) = 0; +#endif + hypre_DeviceDataComputeStreamNum(data) = 0; /* SpGeMM */ #if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) - hypre_CudaDataSpgemmUseCusparse(data) = 1; + hypre_DeviceDataSpgemmUseCusparse(data) = 1; #else - hypre_CudaDataSpgemmUseCusparse(data) = 0; + hypre_DeviceDataSpgemmUseCusparse(data) = 0; #endif - hypre_CudaDataSpgemmNumPasses(data) = 3; + hypre_DeviceDataSpgemmNumPasses(data) = 3; /* 1: naive overestimate, 2: naive underestimate, 3: Cohen's algorithm */ - hypre_CudaDataSpgemmRownnzEstimateMethod(data) = 3; - hypre_CudaDataSpgemmRownnzEstimateNsamples(data) = 32; - hypre_CudaDataSpgemmRownnzEstimateMultFactor(data) = 1.5; - hypre_CudaDataSpgemmHashType(data) = 'L'; + hypre_DeviceDataSpgemmRownnzEstimateMethod(data) = 3; + hypre_DeviceDataSpgemmRownnzEstimateNsamples(data) = 32; + hypre_DeviceDataSpgemmRownnzEstimateMultFactor(data) = 1.5; + hypre_DeviceDataSpgemmHashType(data) = 'L'; /* pmis */ #ifdef HYPRE_USING_CURAND - hypre_CudaDataUseGpuRand(data) = 1; + hypre_DeviceDataUseGpuRand(data) = 1; #else - hypre_CudaDataUseGpuRand(data) = 0; + hypre_DeviceDataUseGpuRand(data) = 0; #endif 
/* device pool */ #ifdef HYPRE_USING_DEVICE_POOL - hypre_CudaDataCubBinGrowth(data) = 8u; - hypre_CudaDataCubMinBin(data) = 1u; - hypre_CudaDataCubMaxBin(data) = (hypre_uint) -1; - hypre_CudaDataCubMaxCachedBytes(data) = (size_t) -1; - hypre_CudaDataCubDevAllocator(data) = NULL; - hypre_CudaDataCubUvmAllocator(data) = NULL; + hypre_DeviceDataCubBinGrowth(data) = 8u; + hypre_DeviceDataCubMinBin(data) = 1u; + hypre_DeviceDataCubMaxBin(data) = (hypre_uint) -1; + hypre_DeviceDataCubMaxCachedBytes(data) = (size_t) -1; + hypre_DeviceDataCubDevAllocator(data) = NULL; + hypre_DeviceDataCubUvmAllocator(data) = NULL; #endif return data; } void -hypre_CudaDataDestroy(hypre_CudaData *data) +hypre_DeviceDataDestroy(hypre_DeviceData *data) { if (!data) { return; } - hypre_TFree(hypre_CudaDataCudaReduceBuffer(data), HYPRE_MEMORY_DEVICE); - hypre_TFree(hypre_CudaDataStructCommRecvBuffer(data), HYPRE_MEMORY_DEVICE); - hypre_TFree(hypre_CudaDataStructCommSendBuffer(data), HYPRE_MEMORY_DEVICE); + hypre_TFree(hypre_DeviceDataReduceBuffer(data), HYPRE_MEMORY_DEVICE); + hypre_TFree(hypre_DeviceDataStructCommRecvBuffer(data), HYPRE_MEMORY_DEVICE); + hypre_TFree(hypre_DeviceDataStructCommSendBuffer(data), HYPRE_MEMORY_DEVICE); #if defined(HYPRE_USING_CURAND) if (data->curand_generator) @@ -1146,20 +1189,23 @@ hypre_CudaDataDestroy(hypre_CudaData *data) for (HYPRE_Int i = 0; i < HYPRE_MAX_NUM_STREAMS; i++) { - if (data->cuda_streams[i]) + if (data->streams[i]) { #if defined(HYPRE_USING_DEVICE_OPENMP) - HYPRE_CUDA_CALL( cudaStreamDestroy(data->cuda_streams[i]) ); + HYPRE_CUDA_CALL( cudaStreamDestroy(data->streams[i]) ); #elif defined(HYPRE_USING_CUDA) - HYPRE_CUDA_CALL( cudaStreamDestroy(data->cuda_streams[i]) ); + HYPRE_CUDA_CALL( cudaStreamDestroy(data->streams[i]) ); #elif defined(HYPRE_USING_HIP) - HYPRE_HIP_CALL( hipStreamDestroy(data->cuda_streams[i]) ); + HYPRE_HIP_CALL( hipStreamDestroy(data->streams[i]) ); +#elif defined(HYPRE_USING_SYCL) + delete data->streams[i]; + 
data->streams[i] = nullptr; #endif } } #ifdef HYPRE_USING_DEVICE_POOL - hypre_CudaDataCubCachingAllocatorDestroy(data); + hypre_DeviceDataCubCachingAllocatorDestroy(data); #endif hypre_TFree(data, HYPRE_MEMORY_HOST); @@ -1222,9 +1268,9 @@ hypre_SyncCudaComputeStream_core(HYPRE_Int action, if (cuda_compute_stream_sync) { #if defined(HYPRE_USING_CUDA) - HYPRE_CUDA_CALL( cudaStreamSynchronize(hypre_HandleCudaComputeStream(hypre_handle)) ); + HYPRE_CUDA_CALL( cudaStreamSynchronize(hypre_HandleComputeStream(hypre_handle)) ); #elif defined(HYPRE_USING_HIP) - HYPRE_HIP_CALL( hipStreamSynchronize(hypre_HandleCudaComputeStream(hypre_handle)) ); + HYPRE_HIP_CALL( hipStreamSynchronize(hypre_HandleComputeStream(hypre_handle)) ); #endif } #endif diff --git a/src/utilities/cuda_utils.h b/src/utilities/device_utils.h similarity index 79% rename from src/utilities/cuda_utils.h rename to src/utilities/device_utils.h index 4394a892aa..7d4030cd5e 100644 --- a/src/utilities/cuda_utils.h +++ b/src/utilities/device_utils.h @@ -10,6 +10,10 @@ #if defined(HYPRE_USING_GPU) +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + * cuda includes + * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) #include #include @@ -35,16 +39,45 @@ #define CUSPARSE_NEWAPI_VERSION 11000 +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + * hip includes + * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + #elif defined(HYPRE_USING_HIP) #include +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + * sycl includes + * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +#elif defined(HYPRE_USING_SYCL) + +#include +/* #include */ +/* #include */ +/* #include */ +/* #include */ + +/* #include // dpct::remove_if, remove_copy_if, copy_if */ + +/* #include */ +/* #include */ +/* #include */ +/* 
#include */ + +/* #include */ +/* #include */ + #endif // defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) #if defined(HYPRE_USING_ROCSPARSE) #include #endif +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + * macros for wrapping cuda/hip/sycl calls for error reporting + * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) #define HYPRE_CUDA_CALL(call) do { \ @@ -64,6 +97,25 @@ hypre_assert(0); exit(1); \ } } while(0) +#elif defined(HYPRE_USING_SYCL) +#define HYPRE_SYCL_CALL(call) \ + try \ + { \ + call; \ + } \ + catch (sycl::exception const &ex) \ + { \ + hypre_printf("SYCL ERROR (code = %s) at %s:%d\n", ex.what(), \ + __FILE__, __LINE__); \ + assert(0); exit(1); \ + } \ + catch(std::runtime_error const& ex) \ + { \ + hypre_printf("STD ERROR (code = %s) at %s:%d\n", ex.what(), \ + __FILE__, __LINE__); \ + assert(0); exit(1); \ + } + #endif // defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) #define HYPRE_CUBLAS_CALL(call) do { \ @@ -98,11 +150,12 @@ hypre_assert(0); exit(1); \ } } while(0) -struct hypre_cub_CachingDeviceAllocator; -typedef struct hypre_cub_CachingDeviceAllocator hypre_cub_CachingDeviceAllocator; +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + * device defined values + * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ // HYPRE_WARP_BITSHIFT is just log2 of HYPRE_WARP_SIZE -#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) +#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) || defined(HYPRE_USING_SYCL) #define HYPRE_WARP_SIZE 32 #define HYPRE_WARP_BITSHIFT 5 #elif defined(HYPRE_USING_HIP) @@ -116,7 +169,14 @@ typedef struct hypre_cub_CachingDeviceAllocator hypre_cub_CachingDeviceAllocator #define HYPRE_1D_BLOCK_SIZE 512 #define HYPRE_MAX_NUM_STREAMS 10 -struct hypre_CudaData +/* - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - + * device info data structures + * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +struct hypre_cub_CachingDeviceAllocator; +typedef struct hypre_cub_CachingDeviceAllocator hypre_cub_CachingDeviceAllocator; + +struct hypre_DeviceData { #if defined(HYPRE_USING_CURAND) curandGenerator_t curand_generator; @@ -135,9 +195,11 @@ struct hypre_CudaData #endif #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) - cudaStream_t cuda_streams[HYPRE_MAX_NUM_STREAMS]; + cudaStream_t streams[HYPRE_MAX_NUM_STREAMS]; #elif defined(HYPRE_USING_HIP) - hipStream_t cuda_streams[HYPRE_MAX_NUM_STREAMS]; + hipStream_t streams[HYPRE_MAX_NUM_STREAMS]; +#elif defined(HYPRE_USING_SYCL) + sycl::queue* streams[HYPRE_MAX_NUM_STREAMS] = {NULL}; #endif #ifdef HYPRE_USING_DEVICE_POOL @@ -151,12 +213,16 @@ struct hypre_CudaData #ifdef HYPRE_USING_UMPIRE_DEVICE hypre_umpire_device_allocator umpire_device_allocator; #endif - HYPRE_Int cuda_device; +#if defined(HYPRE_USING_SYCL) + sycl::device device; +#else + HYPRE_Int device; +#endif /* by default, hypre puts GPU computations in this stream - * Do not be confused with the default (null) CUDA stream */ - HYPRE_Int cuda_compute_stream_num; - /* work space for hypre's CUDA reducer */ - void *cuda_reduce_buffer; + * Do not be confused with the default (null) stream */ + HYPRE_Int compute_stream_num; + /* work space for hypre's device reducer */ + void *reduce_buffer; /* the device buffers needed to do MPI communication for struct comm */ HYPRE_Complex* struct_comm_recv_buffer; HYPRE_Complex* struct_comm_send_buffer; @@ -173,53 +239,56 @@ struct hypre_CudaData HYPRE_Int use_gpu_rand; }; -#define hypre_CudaDataCubBinGrowth(data) ((data) -> cub_bin_growth) -#define hypre_CudaDataCubMinBin(data) ((data) -> cub_min_bin) -#define hypre_CudaDataCubMaxBin(data) ((data) -> cub_max_bin) -#define hypre_CudaDataCubMaxCachedBytes(data) ((data) -> cub_max_cached_bytes) -#define 
hypre_CudaDataCubDevAllocator(data) ((data) -> cub_dev_allocator) -#define hypre_CudaDataCubUvmAllocator(data) ((data) -> cub_uvm_allocator) -#define hypre_CudaDataCudaDevice(data) ((data) -> cuda_device) -#define hypre_CudaDataCudaComputeStreamNum(data) ((data) -> cuda_compute_stream_num) -#define hypre_CudaDataCudaReduceBuffer(data) ((data) -> cuda_reduce_buffer) -#define hypre_CudaDataStructCommRecvBuffer(data) ((data) -> struct_comm_recv_buffer) -#define hypre_CudaDataStructCommSendBuffer(data) ((data) -> struct_comm_send_buffer) -#define hypre_CudaDataStructCommRecvBufferSize(data) ((data) -> struct_comm_recv_buffer_size) -#define hypre_CudaDataStructCommSendBufferSize(data) ((data) -> struct_comm_send_buffer_size) -#define hypre_CudaDataSpgemmUseCusparse(data) ((data) -> spgemm_use_cusparse) -#define hypre_CudaDataSpgemmNumPasses(data) ((data) -> spgemm_num_passes) -#define hypre_CudaDataSpgemmRownnzEstimateMethod(data) ((data) -> spgemm_rownnz_estimate_method) -#define hypre_CudaDataSpgemmRownnzEstimateNsamples(data) ((data) -> spgemm_rownnz_estimate_nsamples) -#define hypre_CudaDataSpgemmRownnzEstimateMultFactor(data) ((data) -> spgemm_rownnz_estimate_mult_factor) -#define hypre_CudaDataSpgemmHashType(data) ((data) -> spgemm_hash_type) -#define hypre_CudaDataUmpireDeviceAllocator(data) ((data) -> umpire_device_allocator) -#define hypre_CudaDataUseGpuRand(data) ((data) -> use_gpu_rand) - -hypre_CudaData* hypre_CudaDataCreate(); -void hypre_CudaDataDestroy(hypre_CudaData* data); +#define hypre_DeviceDataCubBinGrowth(data) ((data) -> cub_bin_growth) +#define hypre_DeviceDataCubMinBin(data) ((data) -> cub_min_bin) +#define hypre_DeviceDataCubMaxBin(data) ((data) -> cub_max_bin) +#define hypre_DeviceDataCubMaxCachedBytes(data) ((data) -> cub_max_cached_bytes) +#define hypre_DeviceDataCubDevAllocator(data) ((data) -> cub_dev_allocator) +#define hypre_DeviceDataCubUvmAllocator(data) ((data) -> cub_uvm_allocator) +#define hypre_DeviceDataDevice(data) ((data) -> 
device) +#define hypre_DeviceDataComputeStreamNum(data) ((data) -> compute_stream_num) +#define hypre_DeviceDataReduceBuffer(data) ((data) -> reduce_buffer) +#define hypre_DeviceDataStructCommRecvBuffer(data) ((data) -> struct_comm_recv_buffer) +#define hypre_DeviceDataStructCommSendBuffer(data) ((data) -> struct_comm_send_buffer) +#define hypre_DeviceDataStructCommRecvBufferSize(data) ((data) -> struct_comm_recv_buffer_size) +#define hypre_DeviceDataStructCommSendBufferSize(data) ((data) -> struct_comm_send_buffer_size) +#define hypre_DeviceDataSpgemmUseCusparse(data) ((data) -> spgemm_use_cusparse) +#define hypre_DeviceDataSpgemmNumPasses(data) ((data) -> spgemm_num_passes) +#define hypre_DeviceDataSpgemmRownnzEstimateMethod(data) ((data) -> spgemm_rownnz_estimate_method) +#define hypre_DeviceDataSpgemmRownnzEstimateNsamples(data) ((data) -> spgemm_rownnz_estimate_nsamples) +#define hypre_DeviceDataSpgemmRownnzEstimateMultFactor(data) ((data) -> spgemm_rownnz_estimate_mult_factor) +#define hypre_DeviceDataSpgemmHashType(data) ((data) -> spgemm_hash_type) +#define hypre_DeviceDataUmpireDeviceAllocator(data) ((data) -> umpire_device_allocator) +#define hypre_DeviceDataUseGpuRand(data) ((data) -> use_gpu_rand) + +hypre_DeviceData* hypre_DeviceDataCreate(); +void hypre_DeviceDataDestroy(hypre_DeviceData* data); #if defined(HYPRE_USING_CURAND) -curandGenerator_t hypre_CudaDataCurandGenerator(hypre_CudaData *data); +curandGenerator_t hypre_DeviceDataCurandGenerator(hypre_DeviceData *data); #endif #if defined(HYPRE_USING_CUBLAS) -cublasHandle_t hypre_CudaDataCublasHandle(hypre_CudaData *data); +cublasHandle_t hypre_DeviceDataCublasHandle(hypre_DeviceData *data); #endif #if defined(HYPRE_USING_CUSPARSE) -cusparseHandle_t hypre_CudaDataCusparseHandle(hypre_CudaData *data); +cusparseHandle_t hypre_DeviceDataCusparseHandle(hypre_DeviceData *data); #endif #if defined(HYPRE_USING_ROCSPARSE) -rocsparse_handle hypre_CudaDataCusparseHandle(hypre_CudaData *data); 
+rocsparse_handle hypre_DeviceDataCusparseHandle(hypre_DeviceData *data); #endif #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) -cudaStream_t hypre_CudaDataCudaStream(hypre_CudaData *data, HYPRE_Int i); -cudaStream_t hypre_CudaDataCudaComputeStream(hypre_CudaData *data); +cudaStream_t hypre_DeviceDataStream(hypre_DeviceData *data, HYPRE_Int i); +cudaStream_t hypre_DeviceDataComputeStream(hypre_DeviceData *data); #elif defined(HYPRE_USING_HIP) -hipStream_t hypre_CudaDataCudaStream(hypre_CudaData *data, HYPRE_Int i); -hipStream_t hypre_CudaDataCudaComputeStream(hypre_CudaData *data); +hipStream_t hypre_DeviceDataStream(hypre_DeviceData *data, HYPRE_Int i); +hipStream_t hypre_DeviceDataComputeStream(hypre_DeviceData *data); +#elif defined(HYPRE_USING_SYCL) +sycl::queue* hypre_DeviceDataStream(hypre_DeviceData *data, HYPRE_Int i); +sycl::queue* hypre_DeviceDataComputeStream(hypre_DeviceData *data); #endif // Data structure and accessor routines for Cuda Sparse Triangular Matrices @@ -303,7 +372,7 @@ using namespace thrust::placeholders; } \ else \ { \ - (kernel_name) <<< (gridsize), (blocksize), 0, hypre_HandleCudaComputeStream(hypre_handle()) >>> (__VA_ARGS__); \ + (kernel_name) <<< (gridsize), (blocksize), 0, hypre_HandleComputeStream(hypre_handle()) >>> (__VA_ARGS__); \ } \ hypre_SyncCudaComputeStream(hypre_handle()); \ HYPRE_CUDA_CALL( cudaGetLastError() ); \ @@ -320,7 +389,7 @@ using namespace thrust::placeholders; } \ else \ { \ - (kernel_name) <<< (gridsize), (blocksize), 0, hypre_HandleCudaComputeStream(hypre_handle()) >>> (__VA_ARGS__); \ + (kernel_name) <<< (gridsize), (blocksize), 0, hypre_HandleComputeStream(hypre_handle()) >>> (__VA_ARGS__); \ } \ hypre_SyncCudaComputeStream(hypre_handle()); \ HYPRE_HIP_CALL( hipGetLastError() ); \ @@ -340,7 +409,7 @@ using namespace thrust::placeholders; } \ else \ { \ - (kernel_name) <<< (gridsize), (blocksize), 0, hypre_HandleCudaComputeStream(hypre_handle()) >>> (__VA_ARGS__); \ + (kernel_name) <<< 
(gridsize), (blocksize), 0, hypre_HandleComputeStream(hypre_handle()) >>> (__VA_ARGS__); \ } \ } @@ -353,26 +422,26 @@ using namespace thrust::placeholders; #if defined(HYPRE_USING_CUDA) #define HYPRE_THRUST_CALL(func_name, ...) \ - thrust::func_name(thrust::cuda::par(hypre_HandleUmpireDeviceAllocator(hypre_handle())).on(hypre_HandleCudaComputeStream(hypre_handle())), __VA_ARGS__); + thrust::func_name(thrust::cuda::par(hypre_HandleUmpireDeviceAllocator(hypre_handle())).on(hypre_HandleComputeStream(hypre_handle())), __VA_ARGS__); #elif defined(HYPRE_USING_HIP) #define HYPRE_THRUST_CALL(func_name, ...) \ - thrust::func_name(thrust::hip::par(hypre_HandleUmpireDeviceAllocator(hypre_handle())).on(hypre_HandleCudaComputeStream(hypre_handle())), __VA_ARGS__); + thrust::func_name(thrust::hip::par(hypre_HandleUmpireDeviceAllocator(hypre_handle())).on(hypre_HandleComputeStream(hypre_handle())), __VA_ARGS__); #endif // HYPRE_USING_CUDA #elif HYPRE_USING_DEVICE_POOL #if defined(HYPRE_USING_CUDA) #define HYPRE_THRUST_CALL(func_name, ...) \ - thrust::func_name(thrust::cuda::par(*(hypre_HandleCubDevAllocator(hypre_handle()))).on(hypre_HandleCudaComputeStream(hypre_handle())), __VA_ARGS__); + thrust::func_name(thrust::cuda::par(*(hypre_HandleCubDevAllocator(hypre_handle()))).on(hypre_HandleComputeStream(hypre_handle())), __VA_ARGS__); #endif #else #if defined(HYPRE_USING_CUDA) #define HYPRE_THRUST_CALL(func_name, ...) \ - thrust::func_name(thrust::cuda::par.on(hypre_HandleCudaComputeStream(hypre_handle())), __VA_ARGS__); + thrust::func_name(thrust::cuda::par.on(hypre_HandleComputeStream(hypre_handle())), __VA_ARGS__); #elif defined(HYPRE_USING_HIP) #define HYPRE_THRUST_CALL(func_name, ...) 
\ - thrust::func_name(thrust::hip::par.on(hypre_HandleCudaComputeStream(hypre_handle())), __VA_ARGS__); + thrust::func_name(thrust::hip::par.on(hypre_HandleComputeStream(hypre_handle())), __VA_ARGS__); #endif // HYPRE_USING_CUDA #endif // HYPRE_USING_UMPIRE_DEVICE @@ -907,9 +976,9 @@ cudaError_t hypre_CachingFreeDevice(void *ptr); cudaError_t hypre_CachingFreeManaged(void *ptr); #endif -hypre_cub_CachingDeviceAllocator * hypre_CudaDataCubCachingAllocatorCreate(hypre_uint bin_growth, hypre_uint min_bin, hypre_uint max_bin, size_t max_cached_bytes, bool skip_cleanup, bool debug, bool use_managed_memory); +hypre_cub_CachingDeviceAllocator * hypre_DeviceDataCubCachingAllocatorCreate(hypre_uint bin_growth, hypre_uint min_bin, hypre_uint max_bin, size_t max_cached_bytes, bool skip_cleanup, bool debug, bool use_managed_memory); -void hypre_CudaDataCubCachingAllocatorDestroy(hypre_CudaData *data); +void hypre_DeviceDataCubCachingAllocatorDestroy(hypre_DeviceData *data); #endif // #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) diff --git a/src/utilities/general.c b/src/utilities/general.c index 11d747afad..9bafe21224 100644 --- a/src/utilities/general.c +++ b/src/utilities/general.c @@ -51,13 +51,14 @@ hypre_HandleCreate() #if defined(HYPRE_USING_GPU) hypre_HandleDefaultExecPolicy(hypre_handle_) = HYPRE_EXEC_DEVICE; hypre_HandleStructExecPolicy(hypre_handle_) = HYPRE_EXEC_DEVICE; - hypre_HandleCudaData(hypre_handle_) = hypre_CudaDataCreate(); + hypre_HandleDeviceData(hypre_handle_) = hypre_DeviceDataCreate(); #endif +// WM: temporarily set the default exec policy to host for sycl until more functionality is available #if defined(HYPRE_USING_SYCL) hypre_HandleDefaultExecPolicy(hypre_handle_) = HYPRE_EXEC_HOST; hypre_HandleStructExecPolicy(hypre_handle_) = HYPRE_EXEC_HOST; - hypre_HandleSyclData(hypre_handle_) = hypre_SyclDataCreate(); + hypre_HandleDeviceData(hypre_handle_) = hypre_DeviceDataCreate(); #endif return hypre_handle_; @@ -72,11 +73,7 @@ 
hypre_HandleDestroy(hypre_Handle *hypre_handle_) } #if defined(HYPRE_USING_GPU) - hypre_CudaDataDestroy(hypre_HandleCudaData(hypre_handle_)); -#endif - -#if defined(HYPRE_USING_SYCL) - hypre_SyclDataDestroy(hypre_HandleSyclData(hypre_handle_)); + hypre_DeviceDataDestroy(hypre_HandleDeviceData(hypre_handle_)); #endif hypre_TFree(hypre_handle_, HYPRE_MEMORY_HOST); @@ -101,78 +98,19 @@ hypre_SetDevice(hypre_int device_id, hypre_Handle *hypre_handle_) HYPRE_HIP_CALL( hipSetDevice(device_id) ); #endif -#if defined(HYPRE_USING_GPU) - if (hypre_handle_) - { - hypre_HandleCudaDevice(hypre_handle_) = device_id; - } -#endif - #if defined(HYPRE_USING_SYCL) - // WM: TODO - this ain't it... - hypre_int nDevices=0; - sycl::platform platform(sycl::gpu_selector{}); - auto const& gpu_devices = platform.get_devices(); - for (int i = 0; i < gpu_devices.size(); i++) - { - if (gpu_devices[i].is_gpu()) - { - if(gpu_devices[i].get_info() > 0) - { - auto subDevicesDomainNuma = gpu_devices[i].create_sub_devices( - sycl::info::partition_affinity_domain::numa); - nDevices += subDevicesDomainNuma.size(); - } - else - { - nDevices++; - } - } - } - - if (device_id > nDevices) + /* sycl device set at construction of hypre_DeviceData object */ +#elif defined(HYPRE_USING_GPU) + if (hypre_handle_) { - // WM: debug - hypre_printf("device_id = %d, nDevices = %d\n", device_id, nDevices); - hypre_printf("ERROR: SYCL device-ID exceed the number of devices on-node... 
\n"); - } - - HYPRE_Int local_nDevices=0; - for (int i = 0; i < gpu_devices.size(); i++) - { - if (gpu_devices[i].is_gpu()) - { - // multi-tile GPUs - if (gpu_devices[i].get_info() > 0) - { - auto subDevicesDomainNuma = gpu_devices[i].create_sub_devices( - sycl::info::partition_affinity_domain::numa); - for (const auto &tile : subDevicesDomainNuma) - { - if (local_nDevices == device_id) - { - hypre_HandleSyclDevice(hypre_handle_) = tile; - } - local_nDevices++; - } - } - // single-tile GPUs - else - { - if (local_nDevices == device_id) - { - hypre_HandleSyclDevice(hypre_handle_) = gpu_devices[i]; - } - local_nDevices++; - } - } + hypre_HandleDevice(hypre_handle_) = device_id; } #endif return hypre_error_flag; } -/* Note: it doesn't return device_id in hypre_Handle->hypre_CudaData, +/* Note: it doesn't return device_id in hypre_Handle->hypre_DeviceData, * calls API instead. But these two should match at all times */ HYPRE_Int @@ -191,7 +129,7 @@ hypre_GetDevice(hypre_int *device_id) #endif #if defined(HYPRE_USING_SYCL) - // WM: TODO + /* sycl device set at construction of hypre_DeviceData object */ #endif return hypre_error_flag; @@ -253,7 +191,15 @@ hypre_GetDeviceLastError() #endif #if defined(HYPRE_USING_SYCL) - // WM: TODO + try + { + hypre_HandleComputeStream(hypre_handle())->wait_and_throw(); + } + catch (sycl::exception const& e) + { + std::cout << "Caught synchronous SYCL exception:\n" + << e.what() << std::endl; + } #endif return hypre_error_flag; @@ -280,7 +226,7 @@ HYPRE_Init() _hypre_handle = hypre_HandleCreate(); } -#if defined(HYPRE_USING_GPU) || defined(HYPRE_USING_SYCL) +#if defined(HYPRE_USING_GPU) hypre_GetDeviceLastError(); /* Notice: the cudaStream created is specific to the device @@ -293,12 +239,7 @@ HYPRE_Init() /* To include the cost of creating streams/cudahandles in HYPRE_Init */ /* If not here, will be done at the first use */ -#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) - hypre_HandleCudaComputeStream(_hypre_handle); -#endif 
-#if defined(HYPRE_USING_SYCL) - hypre_HandleSyclComputeQueue(_hypre_handle); -#endif + hypre_HandleComputeStream(_hypre_handle); /* A separate stream for prefetching */ //hypre_HandleCudaPrefetchStream(_hypre_handle); diff --git a/src/utilities/handle.h b/src/utilities/handle.h index 2c2bccfcc8..3e4915fc01 100644 --- a/src/utilities/handle.h +++ b/src/utilities/handle.h @@ -14,11 +14,8 @@ #ifndef HYPRE_HANDLE_H #define HYPRE_HANDLE_H -struct hypre_CudaData; -typedef struct hypre_CudaData hypre_CudaData; - -struct hypre_SyclData; -typedef struct hypre_SyclData hypre_SyclData; +struct hypre_DeviceData; +typedef struct hypre_DeviceData hypre_DeviceData; typedef struct { @@ -27,7 +24,7 @@ typedef struct HYPRE_ExecutionPolicy default_exec_policy; HYPRE_ExecutionPolicy struct_exec_policy; #if defined(HYPRE_USING_GPU) - hypre_CudaData *cuda_data; + hypre_DeviceData *device_data; #endif #if defined(HYPRE_USING_UMPIRE) char umpire_device_pool_name[HYPRE_UMPIRE_POOL_NAME_MAX_LEN]; @@ -45,43 +42,39 @@ typedef struct HYPRE_Int own_umpire_pinned_pool; umpire_resourcemanager umpire_rm; #endif -#if defined(HYPRE_USING_SYCL) - hypre_SyclData *sycl_data; -#endif } hypre_Handle; /* accessor macros to hypre_Handle */ #define hypre_HandleMemoryLocation(hypre_handle) ((hypre_handle) -> memory_location) #define hypre_HandleDefaultExecPolicy(hypre_handle) ((hypre_handle) -> default_exec_policy) #define hypre_HandleStructExecPolicy(hypre_handle) ((hypre_handle) -> struct_exec_policy) -#define hypre_HandleCudaData(hypre_handle) ((hypre_handle) -> cuda_data) -#define hypre_HandleSyclData(hypre_handle) ((hypre_handle) -> sycl_data) +#define hypre_HandleDeviceData(hypre_handle) ((hypre_handle) -> device_data) -#define hypre_HandleCurandGenerator(hypre_handle) hypre_CudaDataCurandGenerator(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleCublasHandle(hypre_handle) hypre_CudaDataCublasHandle(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleCusparseHandle(hypre_handle) 
hypre_CudaDataCusparseHandle(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleCudaComputeStream(hypre_handle) hypre_CudaDataCudaComputeStream(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleCubBinGrowth(hypre_handle) hypre_CudaDataCubBinGrowth(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleCubMinBin(hypre_handle) hypre_CudaDataCubMinBin(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleCubMaxBin(hypre_handle) hypre_CudaDataCubMaxBin(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleCubMaxCachedBytes(hypre_handle) hypre_CudaDataCubMaxCachedBytes(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleCubDevAllocator(hypre_handle) hypre_CudaDataCubDevAllocator(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleCubUvmAllocator(hypre_handle) hypre_CudaDataCubUvmAllocator(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleCudaDevice(hypre_handle) hypre_CudaDataCudaDevice(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleCudaComputeStreamNum(hypre_handle) hypre_CudaDataCudaComputeStreamNum(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleCudaReduceBuffer(hypre_handle) hypre_CudaDataCudaReduceBuffer(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleStructCommRecvBuffer(hypre_handle) hypre_CudaDataStructCommRecvBuffer(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleStructCommSendBuffer(hypre_handle) hypre_CudaDataStructCommSendBuffer(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleStructCommRecvBufferSize(hypre_handle) hypre_CudaDataStructCommRecvBufferSize(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleStructCommSendBufferSize(hypre_handle) hypre_CudaDataStructCommSendBufferSize(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleSpgemmUseCusparse(hypre_handle) hypre_CudaDataSpgemmUseCusparse(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleSpgemmNumPasses(hypre_handle) hypre_CudaDataSpgemmNumPasses(hypre_HandleCudaData(hypre_handle)) -#define 
hypre_HandleSpgemmRownnzEstimateMethod(hypre_handle) hypre_CudaDataSpgemmRownnzEstimateMethod(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleSpgemmRownnzEstimateNsamples(hypre_handle) hypre_CudaDataSpgemmRownnzEstimateNsamples(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleSpgemmRownnzEstimateMultFactor(hypre_handle) hypre_CudaDataSpgemmRownnzEstimateMultFactor(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleSpgemmHashType(hypre_handle) hypre_CudaDataSpgemmHashType(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleUmpireDeviceAllocator(hypre_handle) hypre_CudaDataUmpireDeviceAllocator(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleUseGpuRand(hypre_handle) hypre_CudaDataUseGpuRand(hypre_HandleCudaData(hypre_handle)) +#define hypre_HandleCurandGenerator(hypre_handle) hypre_DeviceDataCurandGenerator(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleCublasHandle(hypre_handle) hypre_DeviceDataCublasHandle(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleCusparseHandle(hypre_handle) hypre_DeviceDataCusparseHandle(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleComputeStream(hypre_handle) hypre_DeviceDataComputeStream(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleCubBinGrowth(hypre_handle) hypre_DeviceDataCubBinGrowth(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleCubMinBin(hypre_handle) hypre_DeviceDataCubMinBin(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleCubMaxBin(hypre_handle) hypre_DeviceDataCubMaxBin(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleCubMaxCachedBytes(hypre_handle) hypre_DeviceDataCubMaxCachedBytes(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleCubDevAllocator(hypre_handle) hypre_DeviceDataCubDevAllocator(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleCubUvmAllocator(hypre_handle) hypre_DeviceDataCubUvmAllocator(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleDevice(hypre_handle) 
hypre_DeviceDataDevice(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleComputeStreamNum(hypre_handle) hypre_DeviceDataComputeStreamNum(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleReduceBuffer(hypre_handle) hypre_DeviceDataReduceBuffer(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleStructCommRecvBuffer(hypre_handle) hypre_DeviceDataStructCommRecvBuffer(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleStructCommSendBuffer(hypre_handle) hypre_DeviceDataStructCommSendBuffer(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleStructCommRecvBufferSize(hypre_handle) hypre_DeviceDataStructCommRecvBufferSize(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleStructCommSendBufferSize(hypre_handle) hypre_DeviceDataStructCommSendBufferSize(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleSpgemmUseCusparse(hypre_handle) hypre_DeviceDataSpgemmUseCusparse(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleSpgemmNumPasses(hypre_handle) hypre_DeviceDataSpgemmNumPasses(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleSpgemmRownnzEstimateMethod(hypre_handle) hypre_DeviceDataSpgemmRownnzEstimateMethod(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleSpgemmRownnzEstimateNsamples(hypre_handle) hypre_DeviceDataSpgemmRownnzEstimateNsamples(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleSpgemmRownnzEstimateMultFactor(hypre_handle) hypre_DeviceDataSpgemmRownnzEstimateMultFactor(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleSpgemmHashType(hypre_handle) hypre_DeviceDataSpgemmHashType(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleUmpireDeviceAllocator(hypre_handle) hypre_DeviceDataUmpireDeviceAllocator(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleUseGpuRand(hypre_handle) hypre_DeviceDataUseGpuRand(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleUmpireResourceMan(hypre_handle) ((hypre_handle) -> umpire_rm) #define 
hypre_HandleUmpireDevicePoolSize(hypre_handle) ((hypre_handle) -> umpire_device_pool_size) @@ -98,7 +91,4 @@ typedef struct #define hypre_HandleOwnUmpireHostPool(hypre_handle) ((hypre_handle) -> own_umpire_host_pool) #define hypre_HandleOwnUmpirePinnedPool(hypre_handle) ((hypre_handle) -> own_umpire_pinned_pool) -#define hypre_HandleSyclComputeQueue(hypre_handle) hypre_SyclDataSyclComputeQueue(hypre_HandleSyclData(hypre_handle)) -#define hypre_HandleSyclDevice(hypre_handle) hypre_SyclDataSyclDevice(hypre_HandleSyclData(hypre_handle)) -#define hypre_HandleSyclComputeQueueNum(hypre_handle) hypre_SyclDataSyclComputeQueueNum(hypre_HandleSyclData(hypre_handle)) #endif diff --git a/src/utilities/headers b/src/utilities/headers index 0c96b33fae..81f1301471 100755 --- a/src/utilities/headers +++ b/src/utilities/headers @@ -89,10 +89,9 @@ extern "C++" { #=========================================================================== cat umpire_allocator.h >> $INTERNAL_HEADER -cat cuda_utils.h >> $INTERNAL_HEADER -cat cuda_reducer.h >> $INTERNAL_HEADER +cat device_utils.h >> $INTERNAL_HEADER +cat device_reducer.h >> $INTERNAL_HEADER cat cub_allocator.h >> $INTERNAL_HEADER -cat sycl_utils.h >> $INTERNAL_HEADER #=========================================================================== # Include guards diff --git a/src/utilities/memory.c b/src/utilities/memory.c index 5dc5af7ea8..dfb8a2e939 100644 --- a/src/utilities/memory.c +++ b/src/utilities/memory.c @@ -78,7 +78,7 @@ hypre_DeviceMemset(void *ptr, HYPRE_Int value, size_t num) #endif #if defined(HYPRE_USING_SYCL) - (hypre_HandleSyclComputeQueue(hypre_handle()))->memset(ptr, value, num).wait(); + (hypre_HandleComputeStream(hypre_handle()))->memset(ptr, value, num).wait(); #endif } @@ -99,7 +99,7 @@ hypre_UnifiedMemset(void *ptr, HYPRE_Int value, size_t num) #endif #if defined(HYPRE_USING_SYCL) - (hypre_HandleSyclComputeQueue(hypre_handle()))->memset(ptr, value, num).wait(); + 
(hypre_HandleComputeStream(hypre_handle()))->memset(ptr, value, num).wait(); #endif } @@ -122,26 +122,26 @@ hypre_UnifiedMemPrefetch(void *ptr, size_t size, hypre_MemoryLocation location) #if defined(HYPRE_USING_DEVICE_OPENMP) if (location == hypre_MEMORY_DEVICE) { - HYPRE_CUDA_CALL( cudaMemPrefetchAsync(ptr, size, hypre_HandleCudaDevice(hypre_handle()), - hypre_HandleCudaComputeStream(hypre_handle())) ); + HYPRE_CUDA_CALL( cudaMemPrefetchAsync(ptr, size, hypre_HandleDevice(hypre_handle()), + hypre_HandleComputeStream(hypre_handle())) ); } else if (location == hypre_MEMORY_HOST) { HYPRE_CUDA_CALL( cudaMemPrefetchAsync(ptr, size, cudaCpuDeviceId, - hypre_HandleCudaComputeStream(hypre_handle())) ); + hypre_HandleComputeStream(hypre_handle())) ); } #endif #if defined(HYPRE_USING_CUDA) if (location == hypre_MEMORY_DEVICE) { - HYPRE_CUDA_CALL( cudaMemPrefetchAsync(ptr, size, hypre_HandleCudaDevice(hypre_handle()), - hypre_HandleCudaComputeStream(hypre_handle())) ); + HYPRE_CUDA_CALL( cudaMemPrefetchAsync(ptr, size, hypre_HandleDevice(hypre_handle()), + hypre_HandleComputeStream(hypre_handle())) ); } else if (location == hypre_MEMORY_HOST) { HYPRE_CUDA_CALL( cudaMemPrefetchAsync(ptr, size, cudaCpuDeviceId, - hypre_HandleCudaComputeStream(hypre_handle())) ); + hypre_HandleComputeStream(hypre_handle())) ); } #endif @@ -150,13 +150,13 @@ hypre_UnifiedMemPrefetch(void *ptr, size_t size, hypre_MemoryLocation location) /* *if (location == hypre_MEMORY_DEVICE) *{ - * HYPRE_HIP_CALL( hipMemPrefetchAsync(ptr, size, hypre_HandleCudaDevice(hypre_handle()), - * hypre_HandleCudaComputeStream(hypre_handle())) ); + * HYPRE_HIP_CALL( hipMemPrefetchAsync(ptr, size, hypre_HandleDevice(hypre_handle()), + * hypre_HandleComputeStream(hypre_handle())) ); *} *else if (location == hypre_MEMORY_HOST) *{ * HYPRE_CUDA_CALL( hipMemPrefetchAsync(ptr, size, cudaCpuDeviceId, - * hypre_HandleCudaComputeStream(hypre_handle())) ); + * hypre_HandleComputeStream(hypre_handle())) ); *} */ #endif @@ -228,7 
+228,7 @@ hypre_DeviceMalloc(size_t size, HYPRE_Int zeroinit) #endif #if defined(HYPRE_USING_SYCL) - ptr = (void *)sycl::malloc_device(size, *(hypre_HandleSyclComputeQueue(hypre_handle()))); + ptr = (void *)sycl::malloc_device(size, *(hypre_HandleComputeStream(hypre_handle()))); #endif #endif /* #if defined(HYPRE_USING_UMPIRE_DEVICE) */ @@ -267,7 +267,7 @@ hypre_UnifiedMalloc(size_t size, HYPRE_Int zeroinit) #endif #if defined(HYPRE_USING_SYCL) - ptr = (void *)sycl::malloc_shared(size, *(hypre_HandleSyclComputeQueue(hypre_handle()))); + ptr = (void *)sycl::malloc_shared(size, *(hypre_HandleComputeStream(hypre_handle()))); #endif #endif /* #if defined(HYPRE_USING_UMPIRE_UM) */ @@ -308,7 +308,7 @@ hypre_HostPinnedMalloc(size_t size, HYPRE_Int zeroinit) #endif #if defined(HYPRE_USING_SYCL) - ptr = (void *)sycl::malloc_host(size, *(hypre_HandleSyclComputeQueue(hypre_handle()))); + ptr = (void *)sycl::malloc_host(size, *(hypre_HandleComputeStream(hypre_handle()))); #endif #endif /* #if defined(HYPRE_USING_UMPIRE_PINNED) */ @@ -405,7 +405,7 @@ hypre_DeviceFree(void *ptr) #endif #if defined(HYPRE_USING_SYCL) - sycl::free(ptr, *(hypre_HandleSyclComputeQueue(hypre_handle()))); + sycl::free(ptr, *(hypre_HandleComputeStream(hypre_handle()))); #endif #endif /* #if defined(HYPRE_USING_UMPIRE_DEVICE) */ @@ -435,7 +435,7 @@ hypre_UnifiedFree(void *ptr) #endif #if defined(HYPRE_USING_SYCL) - sycl::free(ptr, *(hypre_HandleSyclComputeQueue(hypre_handle()))); + sycl::free(ptr, *(hypre_HandleComputeStream(hypre_handle()))); #endif #endif /* #if defined(HYPRE_USING_UMPIRE_UM) */ @@ -461,7 +461,7 @@ hypre_HostPinnedFree(void *ptr) #endif #if defined(HYPRE_USING_SYCL) - sycl::free(ptr, *(hypre_HandleSyclComputeQueue(hypre_handle()))); + sycl::free(ptr, *(hypre_HandleComputeStream(hypre_handle()))); #endif #endif /* #if defined(HYPRE_USING_UMPIRE_PINNED) */ @@ -516,7 +516,7 @@ static inline void hypre_Memcpy_core(void *dst, void *src, size_t size, hypre_MemoryLocation loc_dst, 
hypre_MemoryLocation loc_src) { #if defined(HYPRE_USING_SYCL) - sycl::queue* q = hypre_HandleSyclComputeQueue(hypre_handle()); + sycl::queue* q = hypre_HandleComputeStream(hypre_handle()); #endif if (dst == NULL || src == NULL) @@ -718,7 +718,7 @@ hypre_GetExecPolicy1_core(hypre_MemoryLocation location) exec = HYPRE_EXEC_DEVICE; break; case hypre_MEMORY_UNIFIED : -#if defined(HYPRE_USING_GPU) || defined(HYPRE_USING_SYCL) +#if defined(HYPRE_USING_GPU) exec = hypre_HandleDefaultExecPolicy(hypre_handle()); #endif break; @@ -765,7 +765,7 @@ hypre_GetExecPolicy2_core(hypre_MemoryLocation location1, if (location1 == hypre_MEMORY_UNIFIED && location2 == hypre_MEMORY_UNIFIED) { -#if defined(HYPRE_USING_GPU) || defined(HYPRE_USING_SYCL) +#if defined(HYPRE_USING_GPU) exec = hypre_HandleDefaultExecPolicy(hypre_handle()); #endif } @@ -971,7 +971,7 @@ hypre_GetPointerLocation(const void *ptr, hypre_MemoryLocation *memory_location) { HYPRE_Int ierr = 0; -#if defined(HYPRE_USING_GPU) || defined(HYPRE_USING_SYCL) +#if defined(HYPRE_USING_GPU) *memory_location = hypre_MEMORY_UNDEFINED; #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) @@ -1069,7 +1069,7 @@ hypre_GetPointerLocation(const void *ptr, hypre_MemoryLocation *memory_location) #if defined(HYPRE_USING_SYCL) *memory_location = hypre_MEMORY_UNDEFINED; sycl::usm::alloc allocType; - allocType = sycl::get_pointer_type(ptr, (hypre_HandleSyclComputeQueue(hypre_handle()))->get_context()); + allocType = sycl::get_pointer_type(ptr, (hypre_HandleComputeStream(hypre_handle()))->get_context()); if (allocType == sycl::usm::alloc::unknown) { @@ -1089,7 +1089,7 @@ hypre_GetPointerLocation(const void *ptr, hypre_MemoryLocation *memory_location) } #endif //HYPRE_USING_SYCL -#else /* #if defined(HYPRE_USING_GPU) || defined(HYPRE_USING_SYCL) */ +#else /* #if defined(HYPRE_USING_GPU) */ *memory_location = hypre_MEMORY_HOST; #endif @@ -1390,7 +1390,7 @@ hypre_CachingMallocDevice(void **ptr, size_t nbytes) if 
(!hypre_HandleCubDevAllocator(hypre_handle())) { hypre_HandleCubDevAllocator(hypre_handle()) = - hypre_CudaDataCubCachingAllocatorCreate( hypre_HandleCubBinGrowth(hypre_handle()), + hypre_DeviceDataCubCachingAllocatorCreate( hypre_HandleCubBinGrowth(hypre_handle()), hypre_HandleCubMinBin(hypre_handle()), hypre_HandleCubMaxBin(hypre_handle()), hypre_HandleCubMaxCachedBytes(hypre_handle()), @@ -1414,7 +1414,7 @@ hypre_CachingMallocManaged(void **ptr, size_t nbytes) if (!hypre_HandleCubUvmAllocator(hypre_handle())) { hypre_HandleCubUvmAllocator(hypre_handle()) = - hypre_CudaDataCubCachingAllocatorCreate( hypre_HandleCubBinGrowth(hypre_handle()), + hypre_DeviceDataCubCachingAllocatorCreate( hypre_HandleCubBinGrowth(hypre_handle()), hypre_HandleCubMinBin(hypre_handle()), hypre_HandleCubMaxBin(hypre_handle()), hypre_HandleCubMaxCachedBytes(hypre_handle()), @@ -1433,7 +1433,7 @@ hypre_CachingFreeManaged(void *ptr) } hypre_cub_CachingDeviceAllocator * -hypre_CudaDataCubCachingAllocatorCreate(hypre_uint bin_growth, +hypre_DeviceDataCubCachingAllocatorCreate(hypre_uint bin_growth, hypre_uint min_bin, hypre_uint max_bin, size_t max_cached_bytes, @@ -1454,10 +1454,10 @@ hypre_CudaDataCubCachingAllocatorCreate(hypre_uint bin_growth, } void -hypre_CudaDataCubCachingAllocatorDestroy(hypre_CudaData *data) +hypre_DeviceDataCubCachingAllocatorDestroy(hypre_DeviceData *data) { - delete hypre_CudaDataCubDevAllocator(data); - delete hypre_CudaDataCubUvmAllocator(data); + delete hypre_DeviceDataCubDevAllocator(data); + delete hypre_DeviceDataCubUvmAllocator(data); } #endif // #ifdef HYPRE_USING_DEVICE_POOL @@ -1532,7 +1532,7 @@ HYPRE_Int hypre_umpire_device_pooled_allocate(void **ptr, size_t nbytes) { hypre_Handle *handle = hypre_handle(); - const hypre_int device_id = hypre_HandleCudaDevice(handle); + const hypre_int device_id = hypre_HandleDevice(handle); char resource_name[16]; const char *pool_name = hypre_HandleUmpireDevicePoolName(handle); From 
bafa6c2890f746695ebb45940f3544051ba50368 Mon Sep 17 00:00:00 2001 From: Wayne Bradford Mitchell Date: Tue, 27 Jul 2021 14:38:14 -0700 Subject: [PATCH 03/44] Fix cuda compilation Quick fix for compilation --with-cuda. Ran some tests on lassen and quartz as well to make sure I didn't break the cuda or cpu versions. --- src/seq_mv/csr_spgemm_device.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/seq_mv/csr_spgemm_device.c b/src/seq_mv/csr_spgemm_device.c index ca7dff4256..60d871a5ac 100644 --- a/src/seq_mv/csr_spgemm_device.c +++ b/src/seq_mv/csr_spgemm_device.c @@ -116,7 +116,7 @@ hypre_CSRMatrixDeviceSpGemmSetRownnzEstimateMethod( HYPRE_Int value ) { if (value == 1 || value == 2 || value == 3) { - hypre_HandleCudaData(hypre_handle())->spgemm_rownnz_estimate_method = value; + hypre_HandleDeviceData(hypre_handle())->spgemm_rownnz_estimate_method = value; } else { @@ -129,7 +129,7 @@ hypre_CSRMatrixDeviceSpGemmSetRownnzEstimateMethod( HYPRE_Int value ) HYPRE_Int hypre_CSRMatrixDeviceSpGemmSetRownnzEstimateNSamples( HYPRE_Int value ) { - hypre_HandleCudaData(hypre_handle())->spgemm_rownnz_estimate_nsamples = value; + hypre_HandleDeviceData(hypre_handle())->spgemm_rownnz_estimate_nsamples = value; return 0; } @@ -139,7 +139,7 @@ hypre_CSRMatrixDeviceSpGemmSetRownnzEstimateMultFactor( HYPRE_Real value ) { if (value > 0.0) { - hypre_HandleCudaData(hypre_handle())->spgemm_rownnz_estimate_mult_factor = value; + hypre_HandleDeviceData(hypre_handle())->spgemm_rownnz_estimate_mult_factor = value; } else { @@ -154,7 +154,7 @@ hypre_CSRMatrixDeviceSpGemmSetHashType( char value ) { if (value == 'L' || value == 'Q' || value == 'D') { - hypre_HandleCudaData(hypre_handle())->spgemm_hash_type = value; + hypre_HandleDeviceData(hypre_handle())->spgemm_hash_type = value; } else { From c16315d685307f5e0908c19a0944cfcf68e2922d Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Wed, 28 Jul 2021 23:39:07 +0000 Subject: [PATCH 04/44] Choose default exec policy 
for matvec Modified csr matvec to choose the default execution policy instead of hard-coded device policy. This now passes tests and seems to run as expected using sycl unified memory and using host execution for everything. --- src/seq_mv/csr_matvec.c | 8 +- src/test/Makefile | 1 + src/test/TEST_ij/solvers.jobs | 165 +++++++++++++++++----------------- src/utilities/general.c | 13 ++- src/utilities/memory.c | 26 +++--- 5 files changed, 105 insertions(+), 108 deletions(-) diff --git a/src/seq_mv/csr_matvec.c b/src/seq_mv/csr_matvec.c index 38d2f1d244..90f57d44da 100644 --- a/src/seq_mv/csr_matvec.c +++ b/src/seq_mv/csr_matvec.c @@ -711,9 +711,7 @@ hypre_CSRMatrixMatvecOutOfPlace( HYPRE_Complex alpha, HYPRE_Int ierr = 0; #if defined(HYPRE_USING_GPU) - //HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) ); - //RL: TODO back to hypre_GetExecPolicy1 later - HYPRE_ExecutionPolicy exec = HYPRE_EXEC_DEVICE; + HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_CSRMatrixMemoryLocation(A) ); if (exec == HYPRE_EXEC_DEVICE) { ierr = hypre_CSRMatrixMatvecDevice(0, alpha, A, x, beta, b, y, offset); @@ -981,9 +979,7 @@ hypre_CSRMatrixMatvecT( HYPRE_Complex alpha, HYPRE_Int ierr = 0; #if defined(HYPRE_USING_GPU) - //HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) ); - //RL: TODO back to hypre_GetExecPolicy1 later - HYPRE_ExecutionPolicy exec = HYPRE_EXEC_DEVICE; + HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_CSRMatrixMemoryLocation(A) ); if (exec == HYPRE_EXEC_DEVICE) { ierr = hypre_CSRMatrixMatvecDevice(1, alpha, A, x, beta, y, y, 0 ); diff --git a/src/test/Makefile b/src/test/Makefile index 10c3ac32cf..5a4c606193 100644 --- a/src/test/Makefile +++ b/src/test/Makefile @@ -144,6 +144,7 @@ ij: ij.${OBJ_SUFFIX} @echo "Building" $@ "... " ${LINK_CC} -o $@ $< ${LFLAGS} +# WM: TODO: remove simple: simple.${OBJ_SUFFIX} @echo "Building" $@ "... 
" ${LINK_CC} -o $@ $< ${LFLAGS} diff --git a/src/test/TEST_ij/solvers.jobs b/src/test/TEST_ij/solvers.jobs index 11dfba0f37..1f1c68c56c 100755 --- a/src/test/TEST_ij/solvers.jobs +++ b/src/test/TEST_ij/solvers.jobs @@ -29,63 +29,64 @@ # 60: DS_FlexGMRES # #============================================================================= +# WM: TODO remove -exec_host -mpirun -np 2 ./ij -solver 1 -rhsrand > solvers.out.0 -mpirun -np 2 ./ij -solver 2 -rhsrand > solvers.out.1 -mpirun -np 2 ./ij -solver 3 -rhsrand > solvers.out.2 -mpirun -np 2 ./ij -solver 4 -rhsrand > solvers.out.3 -mpirun -np 2 ./ij -solver 5 -rhsrand -w 0.67 -ns 2 > solvers.out.4 -mpirun -np 2 ./ij -solver 6 -rhsrand > solvers.out.5 -#mpirun -np 2 ./ij -solver 7 -rhsrand > solvers.out.6 -#mpirun -np 2 ./ij -solver 8 -rhsrand > solvers.out.7 -mpirun -np 2 ./ij -solver 20 -rhsrand > solvers.out.8 -mpirun -np 2 ./ij -solver 20 -cf 0.5 -rhsrand > solvers.out.9 -mpirun -np 2 ./ij -solver 20 -cf 0.5 -rhsrand -solver_type 2 > solvers.out.10 -mpirun -np 2 ./ij -solver 20 -cf 0.5 -rhsrand -solver_type 3 > solvers.out.11 -mpirun -np 2 ./ij -solver 16 -rhsrand > solvers.out.12 -mpirun -np 2 ./ij -solver 17 -rhsrand > solvers.out.13 -mpirun -np 2 ./ij -solver 17 -rhsrand -cgs 2 > solvers.out.14 -mpirun -np 2 ./ij -solver 17 -rhsrand -cgs 2 -unroll 8 > solvers.out.15 -mpirun -np 2 ./ij -solver 17 -rhsrand -unroll 4 > solvers.out.16 -mpirun -np 2 ./ij -solver 3 -rhsrand -check_residual > solvers.out.17 -mpirun -np 2 ./ij -solver 4 -rhsrand -check_residual > solvers.out.18 +mpirun -np 2 ./ij -exec_host -solver 1 -rhsrand > solvers.out.0 +mpirun -np 2 ./ij -exec_host -solver 2 -rhsrand > solvers.out.1 +mpirun -np 2 ./ij -exec_host -solver 3 -rhsrand > solvers.out.2 +mpirun -np 2 ./ij -exec_host -solver 4 -rhsrand > solvers.out.3 +mpirun -np 2 ./ij -exec_host -solver 5 -rhsrand -w 0.67 -ns 2 > solvers.out.4 +mpirun -np 2 ./ij -exec_host -solver 6 -rhsrand > solvers.out.5 +#mpirun -np 2 ./ij -exec_host -solver 7 -rhsrand 
> solvers.out.6 +#mpirun -np 2 ./ij -exec_host -solver 8 -rhsrand > solvers.out.7 +mpirun -np 2 ./ij -exec_host -solver 20 -rhsrand > solvers.out.8 +mpirun -np 2 ./ij -exec_host -solver 20 -cf 0.5 -rhsrand > solvers.out.9 +mpirun -np 2 ./ij -exec_host -solver 20 -cf 0.5 -rhsrand -solver_type 2 > solvers.out.10 +mpirun -np 2 ./ij -exec_host -solver 20 -cf 0.5 -rhsrand -solver_type 3 > solvers.out.11 +mpirun -np 2 ./ij -exec_host -solver 16 -rhsrand > solvers.out.12 +mpirun -np 2 ./ij -exec_host -solver 17 -rhsrand > solvers.out.13 +mpirun -np 2 ./ij -exec_host -solver 17 -rhsrand -cgs 2 > solvers.out.14 +mpirun -np 2 ./ij -exec_host -solver 17 -rhsrand -cgs 2 -unroll 8 > solvers.out.15 +mpirun -np 2 ./ij -exec_host -solver 17 -rhsrand -unroll 4 > solvers.out.16 +mpirun -np 2 ./ij -exec_host -solver 3 -rhsrand -check_residual > solvers.out.17 +mpirun -np 2 ./ij -exec_host -solver 4 -rhsrand -check_residual > solvers.out.18 #systems AMG run ...unknown approach, hybrid approach, nodal approach -mpirun -np 2 ./ij -n 20 20 20 -sysL 2 -nf 2 > solvers.out.sysu -mpirun -np 2 ./ij -n 20 20 20 -sysL 2 -nf 2 -nodal 1 -smtype 6 -smlv 10 -dom 1 -ov 0 > solvers.out.sysh -mpirun -np 2 ./ij -n 20 20 20 -sysL 2 -nf 2 -interptype 10 -Pmx 6 > solvers.out.sysn +mpirun -np 2 ./ij -exec_host -n 20 20 20 -sysL 2 -nf 2 > solvers.out.sysu +mpirun -np 2 ./ij -exec_host -n 20 20 20 -sysL 2 -nf 2 -nodal 1 -smtype 6 -smlv 10 -dom 1 -ov 0 > solvers.out.sysh +mpirun -np 2 ./ij -exec_host -n 20 20 20 -sysL 2 -nf 2 -interptype 10 -Pmx 6 > solvers.out.sysn #LGMRS and FlexGMRES -mpirun -np 2 ./ij -solver 50 -rhsrand > solvers.out.101 -mpirun -np 2 ./ij -solver 51 -rhsrand > solvers.out.102 -mpirun -np 2 ./ij -solver 60 -rhsrand > solvers.out.103 -mpirun -np 2 ./ij -solver 61 -rhsrand > solvers.out.104 +mpirun -np 2 ./ij -exec_host -solver 50 -rhsrand > solvers.out.101 +mpirun -np 2 ./ij -exec_host -solver 51 -rhsrand > solvers.out.102 +mpirun -np 2 ./ij -exec_host -solver 60 -rhsrand > 
solvers.out.103 +mpirun -np 2 ./ij -exec_host -solver 61 -rhsrand > solvers.out.104 #agglomerated coarse grid solve -mpirun -np 8 ./ij -n 80 80 80 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 > solvers.out.105 -mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -sysL 3 -nf 3 > solvers.out.107 +mpirun -np 8 ./ij -exec_host -n 80 80 80 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 > solvers.out.105 +mpirun -np 8 ./ij -exec_host -n 40 40 40 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -sysL 3 -nf 3 > solvers.out.107 #redundant coarse grid solve -mpirun -np 8 ./ij -n 80 80 80 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -red 1 > solvers.out.106 -mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -sysL 3 -nf 3 -red 1 > solvers.out.108 +mpirun -np 8 ./ij -exec_host -n 80 80 80 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -red 1 > solvers.out.106 +mpirun -np 8 ./ij -exec_host -n 40 40 40 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -sysL 3 -nf 3 -red 1 > solvers.out.108 #additive cycles -mpirun -np 2 ./ij -n 20 20 20 -P 2 1 1 -agg_nl 1 -solver 1 -CF 0 -rlx 0 -w 0.7 -rlx_coarse 0 -ns_coarse 2 > solvers.out.109 -mpirun -np 2 ./ij -n 20 20 20 -P 2 1 1 -agg_nl 1 -solver 1 -CF 0 -add_rlx 0 -add_w 0.7 -mult_add 0 > solvers.out.110 -mpirun -np 4 ./ij -n 20 20 20 -P 2 2 1 -agg_nl 1 -solver 1 -simple 0 > solvers.out.111 -mpirun -np 8 ./ij -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -additive 1 > solvers.out.112 -mpirun -np 8 ./ij -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -mult_add 0 -add_Pmx 5 > solvers.out.113 -mpirun -np 8 ./ij -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -mult_add 0 -add_Pmx 5 -add_end 2 > solvers.out.118 -mpirun -np 8 ./ij -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -mult_add 0 ns 2 > solvers.out.119 -mpirun -np 8 ./ij -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -rlx 18 -ns 2 -rlx_coarse 18 -ns_coarse 2 > solvers.out.120 +mpirun -np 2 ./ij -exec_host -n 20 20 20 -P 2 1 1 -agg_nl 1 -solver 1 -CF 0 -rlx 0 
-w 0.7 -rlx_coarse 0 -ns_coarse 2 > solvers.out.109 +mpirun -np 2 ./ij -exec_host -n 20 20 20 -P 2 1 1 -agg_nl 1 -solver 1 -CF 0 -add_rlx 0 -add_w 0.7 -mult_add 0 > solvers.out.110 +mpirun -np 4 ./ij -exec_host -n 20 20 20 -P 2 2 1 -agg_nl 1 -solver 1 -simple 0 > solvers.out.111 +mpirun -np 8 ./ij -exec_host -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -additive 1 > solvers.out.112 +mpirun -np 8 ./ij -exec_host -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -mult_add 0 -add_Pmx 5 > solvers.out.113 +mpirun -np 8 ./ij -exec_host -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -mult_add 0 -add_Pmx 5 -add_end 2 > solvers.out.118 +mpirun -np 8 ./ij -exec_host -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -mult_add 0 ns 2 > solvers.out.119 +mpirun -np 8 ./ij -exec_host -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -rlx 18 -ns 2 -rlx_coarse 18 -ns_coarse 2 > solvers.out.120 #nonGalerkin version -mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -nongalerk_tol 1 0.03 > solvers.out.114 -mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -nongalerk_tol 3 0.0 0.01 0.05 > solvers.out.115 +mpirun -np 8 ./ij -exec_host -n 40 40 40 -P 2 2 2 -nongalerk_tol 1 0.03 > solvers.out.114 +mpirun -np 8 ./ij -exec_host -n 40 40 40 -P 2 2 2 -nongalerk_tol 3 0.0 0.01 0.05 > solvers.out.115 #RAP options -mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -solver 3 -rap 0 > solvers.out.116 -mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -solver 3 -rap 1 > solvers.out.117 +mpirun -np 8 ./ij -exec_host -n 40 40 40 -P 2 2 2 -solver 3 -rap 0 > solvers.out.116 +mpirun -np 8 ./ij -exec_host -n 40 40 40 -P 2 2 2 -solver 3 -rap 1 > solvers.out.117 # # MGR and MGR-PCG @@ -93,26 +94,26 @@ mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -solver 3 -rap 1 > solvers.out.117 # coarse grid solver checks (1-level MGR == AMG (or coarse grid solver)) # Also checks for keeping coarse nodes to coarsest level # coarse grid size in output should be ~ mgr_num_reserved_nodes -mpirun -np 2 ./ij -solver 70 -mgr_nlevels 0 -mgr_bsize 2 -mgr_num_reserved_nodes 0 > solvers.out.200 -mpirun -np 2 ./ij 
-solver 70 -mgr_nlevels 0 -mgr_bsize 2 -mgr_num_reserved_nodes 100 > solvers.out.201 -mpirun -np 2 ./ij -solver 70 -mgr_nlevels 1 -mgr_bsize 1 -mgr_num_reserved_nodes 0 > solvers.out.202 -mpirun -np 2 ./ij -solver 70 -mgr_nlevels 1 -mgr_bsize 1 -mgr_num_reserved_nodes 100 > solvers.out.203 +mpirun -np 2 ./ij -exec_host -solver 70 -mgr_nlevels 0 -mgr_bsize 2 -mgr_num_reserved_nodes 0 > solvers.out.200 +mpirun -np 2 ./ij -exec_host -solver 70 -mgr_nlevels 0 -mgr_bsize 2 -mgr_num_reserved_nodes 100 > solvers.out.201 +mpirun -np 2 ./ij -exec_host -solver 70 -mgr_nlevels 1 -mgr_bsize 1 -mgr_num_reserved_nodes 0 > solvers.out.202 +mpirun -np 2 ./ij -exec_host -solver 70 -mgr_nlevels 1 -mgr_bsize 1 -mgr_num_reserved_nodes 100 > solvers.out.203 # multi level MGR tests with different coarse grid type strategies # Fix non C points to F points with different F-relaxation methods (single/multilevel F-relaxation) # with/ without reserved coarse nodes -mpirun -np 2 ./ij -solver 70 -mgr_nlevels 1 -mgr_bsize 2 -mgr_non_c_to_f 1 -mgr_frelax_method 0 -mgr_num_reserved_nodes 0 > solvers.out.204 -mpirun -np 2 ./ij -solver 70 -mgr_nlevels 1 -mgr_bsize 2 -mgr_non_c_to_f 1 -mgr_frelax_method 0 -mgr_num_reserved_nodes 100 > solvers.out.205 -mpirun -np 2 ./ij -solver 70 -mgr_nlevels 1 -mgr_bsize 2 -mgr_non_c_to_f 1 -mgr_frelax_method 1 -mgr_num_reserved_nodes 0 > solvers.out.206 -mpirun -np 2 ./ij -solver 70 -mgr_nlevels 1 -mgr_bsize 2 -mgr_non_c_to_f 1 -mgr_frelax_method 1 -mgr_num_reserved_nodes 100 > solvers.out.207 +mpirun -np 2 ./ij -exec_host -solver 70 -mgr_nlevels 1 -mgr_bsize 2 -mgr_non_c_to_f 1 -mgr_frelax_method 0 -mgr_num_reserved_nodes 0 > solvers.out.204 +mpirun -np 2 ./ij -exec_host -solver 70 -mgr_nlevels 1 -mgr_bsize 2 -mgr_non_c_to_f 1 -mgr_frelax_method 0 -mgr_num_reserved_nodes 100 > solvers.out.205 +mpirun -np 2 ./ij -exec_host -solver 70 -mgr_nlevels 1 -mgr_bsize 2 -mgr_non_c_to_f 1 -mgr_frelax_method 1 -mgr_num_reserved_nodes 0 > solvers.out.206 +mpirun -np 2 ./ij 
-exec_host -solver 70 -mgr_nlevels 1 -mgr_bsize 2 -mgr_non_c_to_f 1 -mgr_frelax_method 1 -mgr_num_reserved_nodes 100 > solvers.out.207 # Not fixed non C points to F points with different F-relaxation methods (single/multilevel F-relaxation) # with/ without reserved coarse nodes -mpirun -np 2 ./ij -solver 70 -mgr_nlevels 5 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_frelax_method 0 -mgr_num_reserved_nodes 0 > solvers.out.208 -mpirun -np 2 ./ij -solver 70 -mgr_nlevels 5 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_frelax_method 0 -mgr_num_reserved_nodes 100 > solvers.out.209 -mpirun -np 2 ./ij -solver 70 -mgr_nlevels 5 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_frelax_method 1 -mgr_num_reserved_nodes 0 > solvers.out.210 -mpirun -np 2 ./ij -solver 70 -mgr_nlevels 5 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_frelax_method 1 -mgr_num_reserved_nodes 100 > solvers.out.211 +mpirun -np 2 ./ij -exec_host -solver 70 -mgr_nlevels 5 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_frelax_method 0 -mgr_num_reserved_nodes 0 > solvers.out.208 +mpirun -np 2 ./ij -exec_host -solver 70 -mgr_nlevels 5 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_frelax_method 0 -mgr_num_reserved_nodes 100 > solvers.out.209 +mpirun -np 2 ./ij -exec_host -solver 70 -mgr_nlevels 5 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_frelax_method 1 -mgr_num_reserved_nodes 0 > solvers.out.210 +mpirun -np 2 ./ij -exec_host -solver 70 -mgr_nlevels 5 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_frelax_method 1 -mgr_num_reserved_nodes 100 > solvers.out.211 # MGR-PCG tests -mpirun -np 2 ./ij -solver 71 -mgr_nlevels 0 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_frelax_method 0 -mgr_num_reserved_nodes 0 > solvers.out.212 -mpirun -np 2 ./ij -solver 71 -mgr_nlevels 1 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_frelax_method 0 -mgr_num_reserved_nodes 0 > solvers.out.213 +mpirun -np 2 ./ij -exec_host -solver 71 -mgr_nlevels 0 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_frelax_method 0 -mgr_num_reserved_nodes 0 > solvers.out.212 +mpirun -np 2 ./ij -exec_host -solver 71 -mgr_nlevels 1 -mgr_bsize 2 -mgr_non_c_to_f 0 
-mgr_frelax_method 0 -mgr_num_reserved_nodes 0 > solvers.out.213 # # hypre_ILU tests @@ -121,39 +122,39 @@ mpirun -np 2 ./ij -solver 71 -mgr_nlevels 1 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_ # Tests ILU-(Flex)GMRES # Test AMG with ILU as a complex smoother # -mpirun -np 1 ./ij -solver 80 -ilu_type 0 -ilu_lfil 0 > solvers.out.300 -mpirun -np 1 ./ij -solver 80 -ilu_type 0 -ilu_lfil 1 > solvers.out.301 -mpirun -np 1 ./ij -solver 80 -ilu_type 1 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 > solvers.out.302 +mpirun -np 1 ./ij -exec_host -solver 80 -ilu_type 0 -ilu_lfil 0 > solvers.out.300 +mpirun -np 1 ./ij -exec_host -solver 80 -ilu_type 0 -ilu_lfil 1 > solvers.out.301 +mpirun -np 1 ./ij -exec_host -solver 80 -ilu_type 1 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 > solvers.out.302 # parallel ILU # BJ -mpirun -np 2 ./ij -solver 80 -ilu_type 0 -ilu_lfil 1 > solvers.out.303 -mpirun -np 2 ./ij -solver 80 -ilu_type 1 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 > solvers.out.304 +mpirun -np 2 ./ij -exec_host -solver 80 -ilu_type 0 -ilu_lfil 1 > solvers.out.303 +mpirun -np 2 ./ij -exec_host -solver 80 -ilu_type 1 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 > solvers.out.304 # GMRES+ILU -mpirun -np 2 ./ij -solver 80 -ilu_type 10 -ilu_lfil 1 -ilu_schur_max_iter 5 > solvers.out.305 -mpirun -np 2 ./ij -solver 80 -ilu_type 11 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.306 +mpirun -np 2 ./ij -exec_host -solver 80 -ilu_type 10 -ilu_lfil 1 -ilu_schur_max_iter 5 > solvers.out.305 +mpirun -np 2 ./ij -exec_host -solver 80 -ilu_type 11 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.306 # NSH+ILU -mpirun -np 2 ./ij -solver 80 -ilu_type 20 -ilu_lfil 1 -ilu_schur_max_iter 5 > solvers.out.307 -mpirun -np 2 ./ij -solver 80 -ilu_type 21 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.308 +mpirun -np 2 ./ij -exec_host -solver 80 -ilu_type 20 -ilu_lfil 1 -ilu_schur_max_iter 5 > solvers.out.307 +mpirun -np 2 ./ij 
-exec_host -solver 80 -ilu_type 21 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.308 # RAS+ILU -mpirun -np 2 ./ij -solver 80 -ilu_type 30 -ilu_lfil 1 > solvers.out.309 -mpirun -np 2 ./ij -solver 80 -ilu_type 31 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 > solvers.out.310 +mpirun -np 2 ./ij -exec_host -solver 80 -ilu_type 30 -ilu_lfil 1 > solvers.out.309 +mpirun -np 2 ./ij -exec_host -solver 80 -ilu_type 31 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 > solvers.out.310 # ddPQ-GMRES+ILU -mpirun -np 2 ./ij -solver 80 -ilu_type 40 -ilu_lfil 1 -ilu_schur_max_iter 5 > solvers.out.311 -mpirun -np 2 ./ij -solver 80 -ilu_type 41 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.312 +mpirun -np 2 ./ij -exec_host -solver 80 -ilu_type 40 -ilu_lfil 1 -ilu_schur_max_iter 5 > solvers.out.311 +mpirun -np 2 ./ij -exec_host -solver 80 -ilu_type 41 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.312 ## ILU-GMRES -mpirun -np 2 ./ij -solver 81 -ilu_type 0 -ilu_lfil 0 > solvers.out.313 -mpirun -np 2 ./ij -solver 81 -ilu_type 1 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 > solvers.out.314 -mpirun -np 2 ./ij -solver 81 -ilu_type 30 -ilu_lfil 0 > solvers.out.315 -mpirun -np 2 ./ij -solver 81 -ilu_type 31 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 > solvers.out.316 +mpirun -np 2 ./ij -exec_host -solver 81 -ilu_type 0 -ilu_lfil 0 > solvers.out.313 +mpirun -np 2 ./ij -exec_host -solver 81 -ilu_type 1 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 > solvers.out.314 +mpirun -np 2 ./ij -exec_host -solver 81 -ilu_type 30 -ilu_lfil 0 > solvers.out.315 +mpirun -np 2 ./ij -exec_host -solver 81 -ilu_type 31 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 > solvers.out.316 ## ILU-FlexGMRES -mpirun -np 2 ./ij -solver 82 -ilu_type 10 -ilu_lfil 0 -ilu_schur_max_iter 5 > solvers.out.317 -mpirun -np 2 ./ij -solver 82 -ilu_type 11 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.318 -mpirun -np 2 ./ij 
-solver 82 -ilu_type 20 -ilu_lfil 0 -ilu_schur_max_iter 5 > solvers.out.319 -mpirun -np 2 ./ij -solver 82 -ilu_type 21 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.320 -mpirun -np 2 ./ij -solver 82 -ilu_type 40 -ilu_lfil 0 -ilu_schur_max_iter 5 > solvers.out.321 -mpirun -np 2 ./ij -solver 82 -ilu_type 41 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.322 +mpirun -np 2 ./ij -exec_host -solver 82 -ilu_type 10 -ilu_lfil 0 -ilu_schur_max_iter 5 > solvers.out.317 +mpirun -np 2 ./ij -exec_host -solver 82 -ilu_type 11 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.318 +mpirun -np 2 ./ij -exec_host -solver 82 -ilu_type 20 -ilu_lfil 0 -ilu_schur_max_iter 5 > solvers.out.319 +mpirun -np 2 ./ij -exec_host -solver 82 -ilu_type 21 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.320 +mpirun -np 2 ./ij -exec_host -solver 82 -ilu_type 40 -ilu_lfil 0 -ilu_schur_max_iter 5 > solvers.out.321 +mpirun -np 2 ./ij -exec_host -solver 82 -ilu_type 41 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.322 ## RAP-ILU -mpirun -np 2 ./ij -solver 82 -ilu_type 50 -ilu_lfil 0 > solvers.out.323 +mpirun -np 2 ./ij -exec_host -solver 82 -ilu_type 50 -ilu_lfil 0 > solvers.out.323 ## ILU smoother for AMG -mpirun -np 2 ./ij -solver 0 -smtype 5 -smlv 1 -ilu_type 30 > solvers.out.324 -mpirun -np 2 ./ij -solver 0 -smtype 15 -smlv 1 -ilu_type 30 > solvers.out.325 +mpirun -np 2 ./ij -exec_host -solver 0 -smtype 5 -smlv 1 -ilu_type 30 > solvers.out.324 +mpirun -np 2 ./ij -exec_host -solver 0 -smtype 15 -smlv 1 -ilu_type 30 > solvers.out.325 diff --git a/src/utilities/general.c b/src/utilities/general.c index 9bafe21224..ef413f3395 100644 --- a/src/utilities/general.c +++ b/src/utilities/general.c @@ -54,13 +54,6 @@ hypre_HandleCreate() hypre_HandleDeviceData(hypre_handle_) = hypre_DeviceDataCreate(); #endif -// WM: temporarily set the default exec policy to 
host for sycl until more functionality is available -#if defined(HYPRE_USING_SYCL) - hypre_HandleDefaultExecPolicy(hypre_handle_) = HYPRE_EXEC_HOST; - hypre_HandleStructExecPolicy(hypre_handle_) = HYPRE_EXEC_HOST; - hypre_HandleDeviceData(hypre_handle_) = hypre_DeviceDataCreate(); -#endif - return hypre_handle_; } @@ -76,7 +69,13 @@ hypre_HandleDestroy(hypre_Handle *hypre_handle_) hypre_DeviceDataDestroy(hypre_HandleDeviceData(hypre_handle_)); #endif +// WM: in debug mode, hypre_TFree() checks the pointer location, which requires the +// hypre_handle_'s compute queue if using sycl. But this was just destroyed above. +#if defined(HYPRE_DEBUG) && defined(HYPRE_USING_SYCL) + free(hypre_handle_); +#else hypre_TFree(hypre_handle_, HYPRE_MEMORY_HOST); +#endif return hypre_error_flag; } diff --git a/src/utilities/memory.c b/src/utilities/memory.c index dfb8a2e939..ece79f5d68 100644 --- a/src/utilities/memory.c +++ b/src/utilities/memory.c @@ -78,7 +78,7 @@ hypre_DeviceMemset(void *ptr, HYPRE_Int value, size_t num) #endif #if defined(HYPRE_USING_SYCL) - (hypre_HandleComputeStream(hypre_handle()))->memset(ptr, value, num).wait(); + HYPRE_SYCL_CALL( (hypre_HandleComputeStream(hypre_handle()))->memset(ptr, value, num).wait() ); #endif } @@ -99,7 +99,7 @@ hypre_UnifiedMemset(void *ptr, HYPRE_Int value, size_t num) #endif #if defined(HYPRE_USING_SYCL) - (hypre_HandleComputeStream(hypre_handle()))->memset(ptr, value, num).wait(); + HYPRE_SYCL_CALL( (hypre_HandleComputeStream(hypre_handle()))->memset(ptr, value, num).wait() ); #endif } @@ -267,7 +267,7 @@ hypre_UnifiedMalloc(size_t size, HYPRE_Int zeroinit) #endif #if defined(HYPRE_USING_SYCL) - ptr = (void *)sycl::malloc_shared(size, *(hypre_HandleComputeStream(hypre_handle()))); + HYPRE_SYCL_CALL( ptr = (void *)sycl::malloc_shared(size, *(hypre_HandleComputeStream(hypre_handle()))) ); #endif #endif /* #if defined(HYPRE_USING_UMPIRE_UM) */ @@ -308,7 +308,7 @@ hypre_HostPinnedMalloc(size_t size, HYPRE_Int zeroinit) #endif #if 
defined(HYPRE_USING_SYCL) - ptr = (void *)sycl::malloc_host(size, *(hypre_HandleComputeStream(hypre_handle()))); + HYPRE_SYCL_CALL( ptr = (void *)sycl::malloc_host(size, *(hypre_HandleComputeStream(hypre_handle()))) ); #endif #endif /* #if defined(HYPRE_USING_UMPIRE_PINNED) */ @@ -405,7 +405,7 @@ hypre_DeviceFree(void *ptr) #endif #if defined(HYPRE_USING_SYCL) - sycl::free(ptr, *(hypre_HandleComputeStream(hypre_handle()))); + HYPRE_SYCL_CALL( sycl::free(ptr, *(hypre_HandleComputeStream(hypre_handle()))) ); #endif #endif /* #if defined(HYPRE_USING_UMPIRE_DEVICE) */ @@ -435,7 +435,7 @@ hypre_UnifiedFree(void *ptr) #endif #if defined(HYPRE_USING_SYCL) - sycl::free(ptr, *(hypre_HandleComputeStream(hypre_handle()))); + HYPRE_SYCL_CALL( sycl::free(ptr, *(hypre_HandleComputeStream(hypre_handle()))) ); #endif #endif /* #if defined(HYPRE_USING_UMPIRE_UM) */ @@ -461,7 +461,7 @@ hypre_HostPinnedFree(void *ptr) #endif #if defined(HYPRE_USING_SYCL) - sycl::free(ptr, *(hypre_HandleComputeStream(hypre_handle()))); + HYPRE_SYCL_CALL( sycl::free(ptr, *(hypre_HandleComputeStream(hypre_handle()))) ); #endif #endif /* #if defined(HYPRE_USING_UMPIRE_PINNED) */ @@ -566,7 +566,7 @@ hypre_Memcpy_core(void *dst, void *src, size_t size, hypre_MemoryLocation loc_ds #endif #if defined(HYPRE_USING_SYCL) - q->memcpy(dst, src, size).wait(); + HYPRE_SYCL_CALL( q->memcpy(dst, src, size).wait() ); #endif return; } @@ -588,7 +588,7 @@ hypre_Memcpy_core(void *dst, void *src, size_t size, hypre_MemoryLocation loc_ds #endif #if defined(HYPRE_USING_SYCL) - q->memcpy(dst, src, size).wait(); + HYPRE_SYCL_CALL( q->memcpy(dst, src, size).wait() ); #endif return; } @@ -610,7 +610,7 @@ hypre_Memcpy_core(void *dst, void *src, size_t size, hypre_MemoryLocation loc_ds #endif #if defined(HYPRE_USING_SYCL) - q->memcpy(dst, src, size).wait(); + HYPRE_SYCL_CALL( q->memcpy(dst, src, size).wait() ); #endif return; } @@ -637,7 +637,7 @@ hypre_Memcpy_core(void *dst, void *src, size_t size, hypre_MemoryLocation loc_ds 
#endif #if defined(HYPRE_USING_SYCL) - q->memcpy(dst, src, size).wait(); + HYPRE_SYCL_CALL( q->memcpy(dst, src, size).wait() ); #endif return; } @@ -664,7 +664,7 @@ hypre_Memcpy_core(void *dst, void *src, size_t size, hypre_MemoryLocation loc_ds #endif #if defined(HYPRE_USING_SYCL) - q->memcpy(dst, src, size).wait(); + HYPRE_SYCL_CALL( q->memcpy(dst, src, size).wait() ); #endif return; } @@ -692,7 +692,7 @@ hypre_Memcpy_core(void *dst, void *src, size_t size, hypre_MemoryLocation loc_ds #endif #if defined(HYPRE_USING_SYCL) - q->memcpy(dst, src, size).wait(); + HYPRE_SYCL_CALL( q->memcpy(dst, src, size).wait() ); #endif return; } From c58f9445b88189e67f4c3056e2f2363a42c08a8e Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Tue, 3 Aug 2021 01:25:19 +0000 Subject: [PATCH 05/44] Start boxloop implementation Starting to put in boxloop sycl code. This compiles, but crashes. --- src/struct_mv/_hypre_struct_mv.h | 2 +- src/struct_mv/_hypre_struct_mv.hpp | 484 +++++++++++++++++++++++++++++ src/struct_mv/boxloop_sycl.h | 482 ++++++++++++++++++++++++++++ src/struct_mv/headers | 9 +- src/utilities/_hypre_utilities.hpp | 126 +++++++- src/utilities/device_utils.c | 41 +++ src/utilities/device_utils.h | 126 +++++++- 7 files changed, 1260 insertions(+), 10 deletions(-) create mode 100644 src/struct_mv/boxloop_sycl.h diff --git a/src/struct_mv/_hypre_struct_mv.h b/src/struct_mv/_hypre_struct_mv.h index f95c4a74a0..70dbdf9f41 100644 --- a/src/struct_mv/_hypre_struct_mv.h +++ b/src/struct_mv/_hypre_struct_mv.h @@ -2484,7 +2484,7 @@ hypre__J = hypre__thread; i1 = i2 = 0; \ #endif -#elif !defined(HYPRE_USING_RAJA) && !defined(HYPRE_USING_KOKKOS) && !defined(HYPRE_USING_CUDA) && !defined(HYPRE_USING_HIP) +#elif !defined(HYPRE_USING_RAJA) && !defined(HYPRE_USING_KOKKOS) && !defined(HYPRE_USING_CUDA) && !defined(HYPRE_USING_HIP) && !defined(HYPRE_USING_SYCL) /****************************************************************************** * Copyright 1998-2019 Lawrence Livermore National 
Security, LLC and other * HYPRE Project Developers. See the top-level COPYRIGHT file for details. diff --git a/src/struct_mv/_hypre_struct_mv.hpp b/src/struct_mv/_hypre_struct_mv.hpp index bf411411e6..d0674b2b0e 100644 --- a/src/struct_mv/_hypre_struct_mv.hpp +++ b/src/struct_mv/_hypre_struct_mv.hpp @@ -1149,6 +1149,490 @@ else \ #endif /* #ifndef HYPRE_BOXLOOP_CUDA_HEADER */ +#elif defined(HYPRE_USING_SYCL) +/****************************************************************************** + * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other + * HYPRE Project Developers. See the top-level COPYRIGHT file for details. + * + * SPDX-License-Identifier: (Apache-2.0 OR MIT) + ******************************************************************************/ + +/****************************************************************************** + * + * Header info for the BoxLoop + * + *****************************************************************************/ + +/*-------------------------------------------------------------------------- + * BoxLoop macros: + *--------------------------------------------------------------------------*/ + +#ifndef HYPRE_BOXLOOP_SYCL_HEADER +#define HYPRE_BOXLOOP_SYCL_HEADER + +typedef struct hypre_Boxloop_struct +{ + HYPRE_Int lsize0,lsize1,lsize2; + HYPRE_Int strides0,strides1,strides2; + HYPRE_Int bstart0,bstart1,bstart2; + HYPRE_Int bsize0,bsize1,bsize2; +} hypre_Boxloop; + + + + +/********************************************************************* + * put this in _hypre_utilities.hpp ? + *********************************************************************/ +#define HYPRE_SYCL_1D_LAUNCH(kernel_name, gridsize, blocksize, ...) 
\ +{ \ + if ( gridsize[0] == 0 || blocksize[0] == 0 ) \ + { \ + hypre_printf("Error %s %d: Invalid SYCL 1D launch parameters grid/block (%d) (%d)\n", \ + __FILE__, __LINE__, \ + gridsize[0], blocksize[0]); \ + assert(0); exit(1); \ + } \ + else \ + { \ + hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) \ + { \ + cgh.parallel_for(sycl::nd_range<1>(gridsize*blocksize, blocksize), [=] (sycl::nd_item<1> item) \ + { (kernel_name)(item, __VA_ARGS__); } ); \ + }).wait_and_throw(); \ + } \ +} + + + +#ifdef __cplusplus +extern "C++" { +#endif + +/********************************************************************* + * forall function and kernel + *********************************************************************/ + +template +void +forall_kernel( sycl::nd_item<1> item, + LOOP_BODY loop_body, + HYPRE_Int length ) +{ + const HYPRE_Int idx = hypre_cuda_get_grid_thread_id<1>(item); + + if (idx < length) + { + loop_body(idx); + } +} + +template +void +BoxLoopforall( LOOP_BODY loop_body, + HYPRE_Int length ) +{ + /* HYPRE_ExecutionPolicy exec_policy = hypre_HandleStructExecPolicy(hypre_handle()); */ + /* WM: TODO: uncomment above and remove below */ + HYPRE_ExecutionPolicy exec_policy = HYPRE_EXEC_DEVICE; + + if (exec_policy == HYPRE_EXEC_HOST) + { +#ifdef HYPRE_USING_OPENMP +#pragma omp parallel for HYPRE_SMP_SCHEDULE +#endif + for (HYPRE_Int idx = 0; idx < length; idx++) + { + loop_body(idx); + } + } + else if (exec_policy == HYPRE_EXEC_DEVICE) + { + const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); + const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); + + /* HYPRE_CUDA_LAUNCH( forall_kernel, gDim, bDim, loop_body, length ); */ + HYPRE_SYCL_1D_LAUNCH(forall_kernel, gDim, bDim, loop_body, length); + } +} + +#ifdef __cplusplus +} +#endif + +/********************************************************************* + * Init/Declare/IncK etc. 
+ *********************************************************************/ + +/* Get 1-D length of the loop, in hypre__tot */ +#define hypre_newBoxLoopInit(ndim, loop_size) \ + HYPRE_Int hypre__tot = 1; \ + for (HYPRE_Int hypre_d = 0; hypre_d < ndim; hypre_d ++) \ + { \ + hypre__tot *= loop_size[hypre_d]; \ + } + +/* Initialize struct for box-k */ +#define hypre_BoxLoopDataDeclareK(k, ndim, loop_size, dbox, start, stride) \ + hypre_Boxloop databox##k; \ + /* dim 0 */ \ + databox##k.lsize0 = loop_size[0]; \ + databox##k.strides0 = stride[0]; \ + databox##k.bstart0 = start[0] - dbox->imin[0]; \ + databox##k.bsize0 = dbox->imax[0] - dbox->imin[0]; \ + /* dim 1 */ \ + if (ndim > 1) \ + { \ + databox##k.lsize1 = loop_size[1]; \ + databox##k.strides1 = stride[1]; \ + databox##k.bstart1 = start[1] - dbox->imin[1]; \ + databox##k.bsize1 = dbox->imax[1] - dbox->imin[1]; \ + } \ + else \ + { \ + databox##k.lsize1 = 1; \ + databox##k.strides1 = 0; \ + databox##k.bstart1 = 0; \ + databox##k.bsize1 = 0; \ + } \ + /* dim 2 */ \ + if (ndim == 3) \ + { \ + databox##k.lsize2 = loop_size[2]; \ + databox##k.strides2 = stride[2]; \ + databox##k.bstart2 = start[2] - dbox->imin[2]; \ + databox##k.bsize2 = dbox->imax[2] - dbox->imin[2]; \ + } \ + else \ + { \ + databox##k.lsize2 = 1; \ + databox##k.strides2 = 0; \ + databox##k.bstart2 = 0; \ + databox##k.bsize2 = 0; \ + } + +/* Given input 1-D 'idx' in box, get 3-D 'local_idx' in loop_size */ +#define hypre_newBoxLoopDeclare(box) \ + hypre_Index local_idx; \ + HYPRE_Int idx_local = idx; \ + hypre_IndexD(local_idx, 0) = idx_local % box.lsize0; \ + idx_local = idx_local / box.lsize0; \ + hypre_IndexD(local_idx, 1) = idx_local % box.lsize1; \ + idx_local = idx_local / box.lsize1; \ + hypre_IndexD(local_idx, 2) = idx_local % box.lsize2; \ + +/* Given input 3-D 'local_idx', get 1-D 'hypre__i' in 'box' */ +#define hypre_BoxLoopIncK(k, box, hypre__i) \ + HYPRE_Int hypre_boxD##k = 1; \ + HYPRE_Int hypre__i = 0; \ + hypre__i += 
(hypre_IndexD(local_idx, 0) * box.strides0 + box.bstart0) * hypre_boxD##k; \ + hypre_boxD##k *= hypre_max(0, box.bsize0 + 1); \ + hypre__i += (hypre_IndexD(local_idx, 1) * box.strides1 + box.bstart1) * hypre_boxD##k; \ + hypre_boxD##k *= hypre_max(0, box.bsize1 + 1); \ + hypre__i += (hypre_IndexD(local_idx, 2) * box.strides2 + box.bstart2) * hypre_boxD##k; \ + hypre_boxD##k *= hypre_max(0, box.bsize2 + 1); + +/* /1* get 3-D local_idx into 'index' *1/ */ +/* #define hypre_BoxLoopGetIndex(index) \ */ +/* index[0] = hypre_IndexD(local_idx, 0); \ */ +/* index[1] = hypre_IndexD(local_idx, 1); \ */ +/* index[2] = hypre_IndexD(local_idx, 2); */ + + + +/* BoxLoop 1 */ +#define hypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ + BoxLoopforall( [=] (HYPRE_Int idx) \ + { \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1); + +#define hypre_newBoxLoop1End(i1) \ + }, hypre__tot); \ +} + + + + + + + + + + + + + + + + + + + +/********************************************************************* + * HOST IMPLEMENTATION + *********************************************************************/ + +#ifdef HYPRE_USING_OPENMP +#define HYPRE_BOX_REDUCTION +#if defined(WIN32) && defined(_MSC_VER) +#define Pragma(x) __pragma(HYPRE_XSTR(x)) +#else +#define Pragma(x) _Pragma(HYPRE_XSTR(x)) +#endif +#define OMP0 Pragma(omp parallel for HYPRE_BOX_REDUCTION HYPRE_SMP_SCHEDULE) +#define OMP1 Pragma(omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_BOX_REDUCTION HYPRE_SMP_SCHEDULE) +#else /* #ifdef HYPRE_USING_OPENMP */ +#define OMP0 +#define OMP1 +#endif /* #ifdef HYPRE_USING_OPENMP */ + +#define zypre_newBoxLoop0Begin(ndim, loop_size) \ +{ \ + zypre_BoxLoopDeclare(); \ + zypre_BoxLoopInit(ndim, loop_size); \ + OMP1 \ + for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ + { \ + zypre_BoxLoopSet(); \ + for 
(hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ + { \ + for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ + { + +#define zypre_newBoxLoop0End() \ + } \ + zypre_BoxLoopInc1(); \ + zypre_BoxLoopInc2(); \ + } \ + } \ +} + +#define zypre_newBoxLoop1Begin(ndim, loop_size, \ + dbox1, start1, stride1, i1) \ +{ \ + HYPRE_Int i1; \ + zypre_BoxLoopDeclare(); \ + zypre_BoxLoopDeclareK(1); \ + zypre_BoxLoopInit(ndim, loop_size); \ + zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1); \ + OMP1 \ + for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ + { \ + HYPRE_Int i1; \ + zypre_BoxLoopSet(); \ + zypre_BoxLoopSetK(1, i1); \ + for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ + { \ + for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ + { + +#define zypre_newBoxLoop1End(i1) \ + i1 += hypre__i0inc1; \ + } \ + zypre_BoxLoopInc1(); \ + i1 += hypre__ikinc1[hypre__d]; \ + zypre_BoxLoopInc2(); \ + } \ + } \ +} + + +#define zypre_newBoxLoop2Begin(ndim, loop_size, \ + dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2) \ +{ \ + HYPRE_Int i1, i2; \ + zypre_BoxLoopDeclare(); \ + zypre_BoxLoopDeclareK(1); \ + zypre_BoxLoopDeclareK(2); \ + zypre_BoxLoopInit(ndim, loop_size); \ + zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1); \ + zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2); \ + OMP1 \ + for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ + { \ + HYPRE_Int i1, i2; \ + zypre_BoxLoopSet(); \ + zypre_BoxLoopSetK(1, i1); \ + zypre_BoxLoopSetK(2, i2); \ + for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ + { \ + for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ + { + +#define zypre_newBoxLoop2End(i1, i2) \ + i1 += hypre__i0inc1; \ + i2 += hypre__i0inc2; \ + } \ + zypre_BoxLoopInc1(); \ + i1 += hypre__ikinc1[hypre__d]; \ + i2 += hypre__ikinc2[hypre__d]; \ + zypre_BoxLoopInc2(); \ + } \ + } \ +} + + +#define zypre_newBoxLoop3Begin(ndim, loop_size, \ + dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, 
\ + dbox3, start3, stride3, i3) \ +{ \ + HYPRE_Int i1, i2, i3; \ + zypre_BoxLoopDeclare(); \ + zypre_BoxLoopDeclareK(1); \ + zypre_BoxLoopDeclareK(2); \ + zypre_BoxLoopDeclareK(3); \ + zypre_BoxLoopInit(ndim, loop_size); \ + zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1); \ + zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2); \ + zypre_BoxLoopInitK(3, dbox3, start3, stride3, i3); \ + OMP1 \ + for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ + { \ + HYPRE_Int i1, i2, i3; \ + zypre_BoxLoopSet(); \ + zypre_BoxLoopSetK(1, i1); \ + zypre_BoxLoopSetK(2, i2); \ + zypre_BoxLoopSetK(3, i3); \ + for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ + { \ + for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ + { + +#define zypre_newBoxLoop3End(i1, i2, i3) \ + i1 += hypre__i0inc1; \ + i2 += hypre__i0inc2; \ + i3 += hypre__i0inc3; \ + } \ + zypre_BoxLoopInc1(); \ + i1 += hypre__ikinc1[hypre__d]; \ + i2 += hypre__ikinc2[hypre__d]; \ + i3 += hypre__ikinc3[hypre__d]; \ + zypre_BoxLoopInc2(); \ + } \ + } \ +} + +#define zypre_newBoxLoop4Begin(ndim, loop_size, \ + dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, \ + dbox3, start3, stride3, i3, \ + dbox4, start4, stride4, i4) \ +{ \ + HYPRE_Int i1, i2, i3, i4; \ + zypre_BoxLoopDeclare(); \ + zypre_BoxLoopDeclareK(1); \ + zypre_BoxLoopDeclareK(2); \ + zypre_BoxLoopDeclareK(3); \ + zypre_BoxLoopDeclareK(4); \ + zypre_BoxLoopInit(ndim, loop_size); \ + zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1); \ + zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2); \ + zypre_BoxLoopInitK(3, dbox3, start3, stride3, i3); \ + zypre_BoxLoopInitK(4, dbox4, start4, stride4, i4); \ + OMP1 \ + for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ + { \ + HYPRE_Int i1, i2, i3, i4; \ + zypre_BoxLoopSet(); \ + zypre_BoxLoopSetK(1, i1); \ + zypre_BoxLoopSetK(2, i2); \ + zypre_BoxLoopSetK(3, i3); \ + zypre_BoxLoopSetK(4, i4); \ + for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ + { \ + for 
(hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ + { + +#define zypre_newBoxLoop4End(i1, i2, i3, i4) \ + i1 += hypre__i0inc1; \ + i2 += hypre__i0inc2; \ + i3 += hypre__i0inc3; \ + i4 += hypre__i0inc4; \ + } \ + zypre_BoxLoopInc1(); \ + i1 += hypre__ikinc1[hypre__d]; \ + i2 += hypre__ikinc2[hypre__d]; \ + i3 += hypre__ikinc3[hypre__d]; \ + i4 += hypre__ikinc4[hypre__d]; \ + zypre_BoxLoopInc2(); \ + } \ + } \ +} + +#define zypre_newBasicBoxLoop2Begin(ndim, loop_size, \ + stride1, i1, \ + stride2, i2) \ +{ \ + zypre_BoxLoopDeclare(); \ + zypre_BoxLoopDeclareK(1); \ + zypre_BoxLoopDeclareK(2); \ + zypre_BoxLoopInit(ndim, loop_size); \ + zypre_BasicBoxLoopInitK(1, stride1); \ + zypre_BasicBoxLoopInitK(2, stride2); \ + OMP1 \ + for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ + { \ + HYPRE_Int i1, i2; \ + zypre_BoxLoopSet(); \ + zypre_BoxLoopSetK(1, i1); \ + zypre_BoxLoopSetK(2, i2); \ + for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ + { \ + for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ + { + + +#define hypre_LoopBegin(size, idx) \ +{ \ + HYPRE_Int idx; \ + OMP0 \ + for (idx = 0; idx < size; idx ++) \ + { + +#define hypre_LoopEnd() \ + } \ +} + +#define hypre_BoxLoopGetIndex zypre_BoxLoopGetIndex + +#define hypre_BoxLoopBlock zypre_BoxLoopBlock +#define hypre_BoxLoop0Begin zypre_newBoxLoop0Begin +#define hypre_BoxLoop0End zypre_newBoxLoop0End +/* #define hypre_BoxLoop1Begin zypre_newBoxLoop1Begin */ +/* #define hypre_BoxLoop1End zypre_newBoxLoop1End */ +#define hypre_BoxLoop1Begin hypre_newBoxLoop1Begin +#define hypre_BoxLoop1End hypre_newBoxLoop1End +#define hypre_BoxLoop2Begin zypre_newBoxLoop2Begin +#define hypre_BoxLoop2End zypre_newBoxLoop2End +#define hypre_BoxLoop3Begin zypre_newBoxLoop3Begin +#define hypre_BoxLoop3End zypre_newBoxLoop3End +#define hypre_BoxLoop4Begin zypre_newBoxLoop4Begin +#define hypre_BoxLoop4End zypre_newBoxLoop4End +#define hypre_BasicBoxLoop2Begin zypre_newBasicBoxLoop2Begin + +/* Reduction */ 
+#define hypre_BoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, reducesum) \ + hypre_BoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) + +#define hypre_BoxLoop1ReductionEnd(i1, reducesum) \ + hypre_BoxLoop1End(i1) + +#define hypre_BoxLoop2ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, reducesum) \ + hypre_BoxLoop2Begin(ndim, loop_size, dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2) + +#define hypre_BoxLoop2ReductionEnd(i1, i2, reducesum) \ + hypre_BoxLoop2End(i1, i2) + +#endif + #endif #ifdef __cplusplus diff --git a/src/struct_mv/boxloop_sycl.h b/src/struct_mv/boxloop_sycl.h new file mode 100644 index 0000000000..3874668ef8 --- /dev/null +++ b/src/struct_mv/boxloop_sycl.h @@ -0,0 +1,482 @@ +/****************************************************************************** + * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other + * HYPRE Project Developers. See the top-level COPYRIGHT file for details. + * + * SPDX-License-Identifier: (Apache-2.0 OR MIT) + ******************************************************************************/ + +/****************************************************************************** + * + * Header info for the BoxLoop + * + *****************************************************************************/ + +/*-------------------------------------------------------------------------- + * BoxLoop macros: + *--------------------------------------------------------------------------*/ + +#ifndef HYPRE_BOXLOOP_SYCL_HEADER +#define HYPRE_BOXLOOP_SYCL_HEADER + +typedef struct hypre_Boxloop_struct +{ + HYPRE_Int lsize0,lsize1,lsize2; + HYPRE_Int strides0,strides1,strides2; + HYPRE_Int bstart0,bstart1,bstart2; + HYPRE_Int bsize0,bsize1,bsize2; +} hypre_Boxloop; + + + + +/********************************************************************* + * put this in _hypre_utilities.hpp ? 
+ *********************************************************************/ +#define HYPRE_SYCL_1D_LAUNCH(kernel_name, gridsize, blocksize, ...) \ +{ \ + if ( gridsize[0] == 0 || blocksize[0] == 0 ) \ + { \ + hypre_printf("Error %s %d: Invalid SYCL 1D launch parameters grid/block (%d) (%d)\n", \ + __FILE__, __LINE__, \ + gridsize[0], blocksize[0]); \ + assert(0); exit(1); \ + } \ + else \ + { \ + hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) \ + { \ + cgh.parallel_for(sycl::nd_range<1>(gridsize*blocksize, blocksize), [=] (sycl::nd_item<1> item) \ + { (kernel_name)(item, __VA_ARGS__); } ); \ + }).wait_and_throw(); \ + } \ +} + + + +#ifdef __cplusplus +extern "C++" { +#endif + +/********************************************************************* + * forall function and kernel + *********************************************************************/ + +template +void +forall_kernel( sycl::nd_item<1> item, + LOOP_BODY loop_body, + HYPRE_Int length ) +{ + const HYPRE_Int idx = hypre_cuda_get_grid_thread_id<1>(item); + + if (idx < length) + { + loop_body(idx); + } +} + +template +void +BoxLoopforall( LOOP_BODY loop_body, + HYPRE_Int length ) +{ + /* HYPRE_ExecutionPolicy exec_policy = hypre_HandleStructExecPolicy(hypre_handle()); */ + /* WM: TODO: uncomment above and remove below */ + HYPRE_ExecutionPolicy exec_policy = HYPRE_EXEC_DEVICE; + + if (exec_policy == HYPRE_EXEC_HOST) + { +#ifdef HYPRE_USING_OPENMP +#pragma omp parallel for HYPRE_SMP_SCHEDULE +#endif + for (HYPRE_Int idx = 0; idx < length; idx++) + { + loop_body(idx); + } + } + else if (exec_policy == HYPRE_EXEC_DEVICE) + { + const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); + const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); + + /* HYPRE_CUDA_LAUNCH( forall_kernel, gDim, bDim, loop_body, length ); */ + HYPRE_SYCL_1D_LAUNCH(forall_kernel, gDim, bDim, loop_body, length); + } +} + +#ifdef __cplusplus +} +#endif + 
+/********************************************************************* + * Init/Declare/IncK etc. + *********************************************************************/ + +/* Get 1-D length of the loop, in hypre__tot */ +#define hypre_newBoxLoopInit(ndim, loop_size) \ + HYPRE_Int hypre__tot = 1; \ + for (HYPRE_Int hypre_d = 0; hypre_d < ndim; hypre_d ++) \ + { \ + hypre__tot *= loop_size[hypre_d]; \ + } + +/* Initialize struct for box-k */ +#define hypre_BoxLoopDataDeclareK(k, ndim, loop_size, dbox, start, stride) \ + hypre_Boxloop databox##k; \ + /* dim 0 */ \ + databox##k.lsize0 = loop_size[0]; \ + databox##k.strides0 = stride[0]; \ + databox##k.bstart0 = start[0] - dbox->imin[0]; \ + databox##k.bsize0 = dbox->imax[0] - dbox->imin[0]; \ + /* dim 1 */ \ + if (ndim > 1) \ + { \ + databox##k.lsize1 = loop_size[1]; \ + databox##k.strides1 = stride[1]; \ + databox##k.bstart1 = start[1] - dbox->imin[1]; \ + databox##k.bsize1 = dbox->imax[1] - dbox->imin[1]; \ + } \ + else \ + { \ + databox##k.lsize1 = 1; \ + databox##k.strides1 = 0; \ + databox##k.bstart1 = 0; \ + databox##k.bsize1 = 0; \ + } \ + /* dim 2 */ \ + if (ndim == 3) \ + { \ + databox##k.lsize2 = loop_size[2]; \ + databox##k.strides2 = stride[2]; \ + databox##k.bstart2 = start[2] - dbox->imin[2]; \ + databox##k.bsize2 = dbox->imax[2] - dbox->imin[2]; \ + } \ + else \ + { \ + databox##k.lsize2 = 1; \ + databox##k.strides2 = 0; \ + databox##k.bstart2 = 0; \ + databox##k.bsize2 = 0; \ + } + +/* Given input 1-D 'idx' in box, get 3-D 'local_idx' in loop_size */ +#define hypre_newBoxLoopDeclare(box) \ + hypre_Index local_idx; \ + HYPRE_Int idx_local = idx; \ + hypre_IndexD(local_idx, 0) = idx_local % box.lsize0; \ + idx_local = idx_local / box.lsize0; \ + hypre_IndexD(local_idx, 1) = idx_local % box.lsize1; \ + idx_local = idx_local / box.lsize1; \ + hypre_IndexD(local_idx, 2) = idx_local % box.lsize2; \ + +/* Given input 3-D 'local_idx', get 1-D 'hypre__i' in 'box' */ +#define hypre_BoxLoopIncK(k, box, 
hypre__i) \ + HYPRE_Int hypre_boxD##k = 1; \ + HYPRE_Int hypre__i = 0; \ + hypre__i += (hypre_IndexD(local_idx, 0) * box.strides0 + box.bstart0) * hypre_boxD##k; \ + hypre_boxD##k *= hypre_max(0, box.bsize0 + 1); \ + hypre__i += (hypre_IndexD(local_idx, 1) * box.strides1 + box.bstart1) * hypre_boxD##k; \ + hypre_boxD##k *= hypre_max(0, box.bsize1 + 1); \ + hypre__i += (hypre_IndexD(local_idx, 2) * box.strides2 + box.bstart2) * hypre_boxD##k; \ + hypre_boxD##k *= hypre_max(0, box.bsize2 + 1); + +/* /1* get 3-D local_idx into 'index' *1/ */ +/* #define hypre_BoxLoopGetIndex(index) \ */ +/* index[0] = hypre_IndexD(local_idx, 0); \ */ +/* index[1] = hypre_IndexD(local_idx, 1); \ */ +/* index[2] = hypre_IndexD(local_idx, 2); */ + + + +/* BoxLoop 1 */ +#define hypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ + BoxLoopforall( [=] (HYPRE_Int idx) \ + { \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1); + +#define hypre_newBoxLoop1End(i1) \ + }, hypre__tot); \ +} + + + + + + + + + + + + + + + + + + + +/********************************************************************* + * HOST IMPLEMENTATION + *********************************************************************/ + +#ifdef HYPRE_USING_OPENMP +#define HYPRE_BOX_REDUCTION +#if defined(WIN32) && defined(_MSC_VER) +#define Pragma(x) __pragma(HYPRE_XSTR(x)) +#else +#define Pragma(x) _Pragma(HYPRE_XSTR(x)) +#endif +#define OMP0 Pragma(omp parallel for HYPRE_BOX_REDUCTION HYPRE_SMP_SCHEDULE) +#define OMP1 Pragma(omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_BOX_REDUCTION HYPRE_SMP_SCHEDULE) +#else /* #ifdef HYPRE_USING_OPENMP */ +#define OMP0 +#define OMP1 +#endif /* #ifdef HYPRE_USING_OPENMP */ + +#define zypre_newBoxLoop0Begin(ndim, loop_size) \ +{ \ + zypre_BoxLoopDeclare(); \ + zypre_BoxLoopInit(ndim, loop_size); \ + OMP1 \ + for (hypre__block = 0; 
hypre__block < hypre__num_blocks; hypre__block++) \ + { \ + zypre_BoxLoopSet(); \ + for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ + { \ + for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ + { + +#define zypre_newBoxLoop0End() \ + } \ + zypre_BoxLoopInc1(); \ + zypre_BoxLoopInc2(); \ + } \ + } \ +} + +#define zypre_newBoxLoop1Begin(ndim, loop_size, \ + dbox1, start1, stride1, i1) \ +{ \ + HYPRE_Int i1; \ + zypre_BoxLoopDeclare(); \ + zypre_BoxLoopDeclareK(1); \ + zypre_BoxLoopInit(ndim, loop_size); \ + zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1); \ + OMP1 \ + for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ + { \ + HYPRE_Int i1; \ + zypre_BoxLoopSet(); \ + zypre_BoxLoopSetK(1, i1); \ + for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ + { \ + for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ + { + +#define zypre_newBoxLoop1End(i1) \ + i1 += hypre__i0inc1; \ + } \ + zypre_BoxLoopInc1(); \ + i1 += hypre__ikinc1[hypre__d]; \ + zypre_BoxLoopInc2(); \ + } \ + } \ +} + + +#define zypre_newBoxLoop2Begin(ndim, loop_size, \ + dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2) \ +{ \ + HYPRE_Int i1, i2; \ + zypre_BoxLoopDeclare(); \ + zypre_BoxLoopDeclareK(1); \ + zypre_BoxLoopDeclareK(2); \ + zypre_BoxLoopInit(ndim, loop_size); \ + zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1); \ + zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2); \ + OMP1 \ + for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ + { \ + HYPRE_Int i1, i2; \ + zypre_BoxLoopSet(); \ + zypre_BoxLoopSetK(1, i1); \ + zypre_BoxLoopSetK(2, i2); \ + for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ + { \ + for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ + { + +#define zypre_newBoxLoop2End(i1, i2) \ + i1 += hypre__i0inc1; \ + i2 += hypre__i0inc2; \ + } \ + zypre_BoxLoopInc1(); \ + i1 += hypre__ikinc1[hypre__d]; \ + i2 += hypre__ikinc2[hypre__d]; \ + zypre_BoxLoopInc2(); \ + } \ + } \ +} + + +#define 
zypre_newBoxLoop3Begin(ndim, loop_size, \ + dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, \ + dbox3, start3, stride3, i3) \ +{ \ + HYPRE_Int i1, i2, i3; \ + zypre_BoxLoopDeclare(); \ + zypre_BoxLoopDeclareK(1); \ + zypre_BoxLoopDeclareK(2); \ + zypre_BoxLoopDeclareK(3); \ + zypre_BoxLoopInit(ndim, loop_size); \ + zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1); \ + zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2); \ + zypre_BoxLoopInitK(3, dbox3, start3, stride3, i3); \ + OMP1 \ + for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ + { \ + HYPRE_Int i1, i2, i3; \ + zypre_BoxLoopSet(); \ + zypre_BoxLoopSetK(1, i1); \ + zypre_BoxLoopSetK(2, i2); \ + zypre_BoxLoopSetK(3, i3); \ + for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ + { \ + for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ + { + +#define zypre_newBoxLoop3End(i1, i2, i3) \ + i1 += hypre__i0inc1; \ + i2 += hypre__i0inc2; \ + i3 += hypre__i0inc3; \ + } \ + zypre_BoxLoopInc1(); \ + i1 += hypre__ikinc1[hypre__d]; \ + i2 += hypre__ikinc2[hypre__d]; \ + i3 += hypre__ikinc3[hypre__d]; \ + zypre_BoxLoopInc2(); \ + } \ + } \ +} + +#define zypre_newBoxLoop4Begin(ndim, loop_size, \ + dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, \ + dbox3, start3, stride3, i3, \ + dbox4, start4, stride4, i4) \ +{ \ + HYPRE_Int i1, i2, i3, i4; \ + zypre_BoxLoopDeclare(); \ + zypre_BoxLoopDeclareK(1); \ + zypre_BoxLoopDeclareK(2); \ + zypre_BoxLoopDeclareK(3); \ + zypre_BoxLoopDeclareK(4); \ + zypre_BoxLoopInit(ndim, loop_size); \ + zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1); \ + zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2); \ + zypre_BoxLoopInitK(3, dbox3, start3, stride3, i3); \ + zypre_BoxLoopInitK(4, dbox4, start4, stride4, i4); \ + OMP1 \ + for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ + { \ + HYPRE_Int i1, i2, i3, i4; \ + zypre_BoxLoopSet(); \ + zypre_BoxLoopSetK(1, i1); \ + zypre_BoxLoopSetK(2, i2); \ + zypre_BoxLoopSetK(3, 
i3); \ + zypre_BoxLoopSetK(4, i4); \ + for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ + { \ + for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ + { + +#define zypre_newBoxLoop4End(i1, i2, i3, i4) \ + i1 += hypre__i0inc1; \ + i2 += hypre__i0inc2; \ + i3 += hypre__i0inc3; \ + i4 += hypre__i0inc4; \ + } \ + zypre_BoxLoopInc1(); \ + i1 += hypre__ikinc1[hypre__d]; \ + i2 += hypre__ikinc2[hypre__d]; \ + i3 += hypre__ikinc3[hypre__d]; \ + i4 += hypre__ikinc4[hypre__d]; \ + zypre_BoxLoopInc2(); \ + } \ + } \ +} + +#define zypre_newBasicBoxLoop2Begin(ndim, loop_size, \ + stride1, i1, \ + stride2, i2) \ +{ \ + zypre_BoxLoopDeclare(); \ + zypre_BoxLoopDeclareK(1); \ + zypre_BoxLoopDeclareK(2); \ + zypre_BoxLoopInit(ndim, loop_size); \ + zypre_BasicBoxLoopInitK(1, stride1); \ + zypre_BasicBoxLoopInitK(2, stride2); \ + OMP1 \ + for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ + { \ + HYPRE_Int i1, i2; \ + zypre_BoxLoopSet(); \ + zypre_BoxLoopSetK(1, i1); \ + zypre_BoxLoopSetK(2, i2); \ + for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ + { \ + for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ + { + + +#define hypre_LoopBegin(size, idx) \ +{ \ + HYPRE_Int idx; \ + OMP0 \ + for (idx = 0; idx < size; idx ++) \ + { + +#define hypre_LoopEnd() \ + } \ +} + +#define hypre_BoxLoopGetIndex zypre_BoxLoopGetIndex + +#define hypre_BoxLoopBlock zypre_BoxLoopBlock +#define hypre_BoxLoop0Begin zypre_newBoxLoop0Begin +#define hypre_BoxLoop0End zypre_newBoxLoop0End +/* #define hypre_BoxLoop1Begin zypre_newBoxLoop1Begin */ +/* #define hypre_BoxLoop1End zypre_newBoxLoop1End */ +#define hypre_BoxLoop1Begin hypre_newBoxLoop1Begin +#define hypre_BoxLoop1End hypre_newBoxLoop1End +#define hypre_BoxLoop2Begin zypre_newBoxLoop2Begin +#define hypre_BoxLoop2End zypre_newBoxLoop2End +#define hypre_BoxLoop3Begin zypre_newBoxLoop3Begin +#define hypre_BoxLoop3End zypre_newBoxLoop3End +#define hypre_BoxLoop4Begin zypre_newBoxLoop4Begin +#define 
hypre_BoxLoop4End zypre_newBoxLoop4End +#define hypre_BasicBoxLoop2Begin zypre_newBasicBoxLoop2Begin + +/* Reduction */ +#define hypre_BoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, reducesum) \ + hypre_BoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) + +#define hypre_BoxLoop1ReductionEnd(i1, reducesum) \ + hypre_BoxLoop1End(i1) + +#define hypre_BoxLoop2ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, reducesum) \ + hypre_BoxLoop2Begin(ndim, loop_size, dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2) + +#define hypre_BoxLoop2ReductionEnd(i1, i2, reducesum) \ + hypre_BoxLoop2End(i1, i2) + +#endif diff --git a/src/struct_mv/headers b/src/struct_mv/headers index 645f39c444..fa32575581 100755 --- a/src/struct_mv/headers +++ b/src/struct_mv/headers @@ -61,7 +61,7 @@ cat boxloop_omp_device.h >> $INTERNAL_HEADER cat >> $INTERNAL_HEADER <<@ -#elif !defined(HYPRE_USING_RAJA) && !defined(HYPRE_USING_KOKKOS) && !defined(HYPRE_USING_CUDA) && !defined(HYPRE_USING_HIP) +#elif !defined(HYPRE_USING_RAJA) && !defined(HYPRE_USING_KOKKOS) && !defined(HYPRE_USING_CUDA) && !defined(HYPRE_USING_HIP) && !defined(HYPRE_USING_SYCL) @ cat boxloop_host.h >> $INTERNAL_HEADER @@ -137,6 +137,13 @@ cat boxloop_cuda.h >> $INTERNAL_HEADER cat >> $INTERNAL_HEADER <<@ +#elif defined(HYPRE_USING_SYCL) +@ + +cat boxloop_sycl.h >> $INTERNAL_HEADER + +cat >> $INTERNAL_HEADER <<@ + #endif @ diff --git a/src/utilities/_hypre_utilities.hpp b/src/utilities/_hypre_utilities.hpp index baaf16d7df..8a6f05b937 100644 --- a/src/utilities/_hypre_utilities.hpp +++ b/src/utilities/_hypre_utilities.hpp @@ -70,8 +70,8 @@ struct hypre_umpire_device_allocator * SPDX-License-Identifier: (Apache-2.0 OR MIT) ******************************************************************************/ -#ifndef HYPRE_CUDA_UTILS_H -#define HYPRE_CUDA_UTILS_H +#ifndef HYPRE_DEVICE_UTILS_H +#define HYPRE_DEVICE_UTILS_H #if defined(HYPRE_USING_GPU) @@ -408,6 
+408,125 @@ struct hypre_GpuMatData #endif //#if defined(HYPRE_USING_GPU) +#if defined(HYPRE_USING_SYCL) +/* return the number of work-items in current work-group */ +template +static __inline__ __attribute__((always_inline)) +hypre_int hypre_cuda_get_num_threads(sycl::nd_item& item) +{ + return item.get_group().get_local_linear_range(); +} + +/* return the flattened or linearlized work-item id in current work-group (not global)*/ +template +static __inline__ __attribute__((always_inline)) +hypre_int hypre_cuda_get_thread_id(sycl::nd_item& item) +{ + return item.get_local_linear_id(); +} + +/* return the number of sub-groups in current work-group */ +template +static __inline__ __attribute__((always_inline)) +hypre_int hypre_cuda_get_num_warps(sycl::nd_item& item) +{ + return item.get_sub_group().get_group_range().get(0); +} + +/* return the sub_group id in work-group */ +template +static __inline__ __attribute__((always_inline)) +hypre_int hypre_cuda_get_warp_id(sycl::nd_item& item) +{ + return item.get_sub_group().get_group_linear_id(); +} + +/* return the work-item lane id in a sub_group */ +template +static __inline__ __attribute__((always_inline)) +hypre_int hypre_cuda_get_lane_id(sycl::nd_item& item) +{ + return hypre_cuda_get_thread_id(item) & (item.get_sub_group().get_local_range().get(0)-1); +} + +/* return the num of work_groups in nd_range */ +template +static __inline__ __attribute__((always_inline)) +hypre_int hypre_cuda_get_num_blocks(sycl::nd_item& item) +{ + // return item.get_group().get_group_linear_range(); // API available in SYCL 2020 + + switch (dim) + { + case 1: + return (item.get_group_range(0)); + case 2: + return (item.get_group_range(0) * item.get_group_range(1)); + case 3: + return (item.get_group_range(0) * item.get_group_range(1) * item.get_group_range(2)); + } + + return -1; +} + +/* return the flattened or linearlized work-group id in nd_range */ +template +static __inline__ __attribute__((always_inline)) +hypre_int 
hypre_cuda_get_block_id(sycl::nd_item& item) +{ + return item.get_group_linear_id(); +} + +/* return the number of work-items in global iteration space*/ +template +static __inline__ __attribute__((always_inline)) +hypre_int hypre_cuda_get_grid_num_threads(sycl::nd_item& item) +{ + switch (dim) + { + case 1: + return (item.get_global_range(0)); + case 2: + return (item.get_global_range(0) * item.get_global_range(1)); + case 3: + return (item.get_global_range(0) * item.get_global_range(1) * item.get_global_range(2)); + } + + return -1; +} + +/* return the flattened work-item id in global iteration space */ +template +static __inline__ __attribute__((always_inline)) +hypre_int hypre_cuda_get_grid_thread_id(sycl::nd_item& item) +{ + return item.get_global_linear_id(); +} + +/* return the number of sub-groups in global iteration space */ +template +static __inline__ __attribute__((always_inline)) +hypre_int hypre_cuda_get_grid_num_warps(sycl::nd_item& item) +{ + return hypre_cuda_get_num_blocks(item) * hypre_cuda_get_num_warps(item); +} + +/* return the flattened sub-group id in global iteration space */ +template +static __inline__ __attribute__((always_inline)) +hypre_int hypre_cuda_get_grid_warp_id(sycl::nd_item& item) +{ + return hypre_cuda_get_block_id(item) * hypre_cuda_get_num_warps(item) + + hypre_cuda_get_warp_id(item); +} + +/* device_utils.c */ +sycl::range<1> hypre_GetDefaultCUDABlockDimension(); + +sycl::range<1> hypre_GetDefaultCUDAGridDimension( HYPRE_Int n, const char *granularity, sycl::range<1> bDim ); + +#endif // #if defined(HYPRE_USING_SYCL) + #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) #include @@ -532,7 +651,6 @@ using namespace thrust::placeholders; #endif // HYPRE_USING_UMPIRE_DEVICE - /* return the number of threads in block */ template static __device__ __forceinline__ @@ -1013,7 +1131,7 @@ struct equal : public thrust::unary_function -/* cuda_utils.c */ +/* device_utils.c */ dim3 hypre_GetDefaultCUDABlockDimension(); dim3 
hypre_GetDefaultCUDAGridDimension( HYPRE_Int n, const char *granularity, dim3 bDim ); diff --git a/src/utilities/device_utils.c b/src/utilities/device_utils.c index 3c79401f4e..f5dbdc07a1 100644 --- a/src/utilities/device_utils.c +++ b/src/utilities/device_utils.c @@ -8,6 +8,47 @@ #include "_hypre_utilities.h" #include "_hypre_utilities.hpp" +#if defined(HYPRE_USING_SYCL) +#include +// WM: TODO: verify +sycl::range<1> hypre_GetDefaultCUDABlockDimension() +{ + // 256 - max work group size for Gen9 + // 512 - max work group size for ATS + sycl::range<1> wgDim(64); + return wgDim; +} + +// WM: TODO: verify +sycl::range<1> hypre_GetDefaultCUDAGridDimension(HYPRE_Int n, + const char *granularity, + sycl::range<1> wgDim) +{ + HYPRE_Int num_WGs = 0; + HYPRE_Int num_workitems_per_WG = wgDim[0]; + + if (granularity[0] == 't') + { + num_WGs = (n + num_workitems_per_WG - 1) / num_workitems_per_WG; + } + else if (granularity[0] == 'w') + { + HYPRE_Int num_subgroups_per_block = num_workitems_per_WG >> HYPRE_WARP_BITSHIFT; + hypre_assert(num_subgroups_per_block * HYPRE_WARP_SIZE == num_workitems_per_WG); + num_WGs = (n + num_subgroups_per_block - 1) / num_subgroups_per_block; + } + else + { + hypre_printf("Error %s %d: Unknown granularity !\n", __FILE__, __LINE__); + hypre_assert(0); + } + + sycl::range<1> gDim(num_WGs); + + return gDim; +} +#endif + #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) /* diff --git a/src/utilities/device_utils.h b/src/utilities/device_utils.h index d24e321686..13a01520df 100644 --- a/src/utilities/device_utils.h +++ b/src/utilities/device_utils.h @@ -5,8 +5,8 @@ * SPDX-License-Identifier: (Apache-2.0 OR MIT) ******************************************************************************/ -#ifndef HYPRE_CUDA_UTILS_H -#define HYPRE_CUDA_UTILS_H +#ifndef HYPRE_DEVICE_UTILS_H +#define HYPRE_DEVICE_UTILS_H #if defined(HYPRE_USING_GPU) @@ -343,6 +343,125 @@ struct hypre_GpuMatData #endif //#if defined(HYPRE_USING_GPU) +#if 
defined(HYPRE_USING_SYCL) +/* return the number of work-items in current work-group */ +template +static __inline__ __attribute__((always_inline)) +hypre_int hypre_cuda_get_num_threads(sycl::nd_item& item) +{ + return item.get_group().get_local_linear_range(); +} + +/* return the flattened or linearlized work-item id in current work-group (not global)*/ +template +static __inline__ __attribute__((always_inline)) +hypre_int hypre_cuda_get_thread_id(sycl::nd_item& item) +{ + return item.get_local_linear_id(); +} + +/* return the number of sub-groups in current work-group */ +template +static __inline__ __attribute__((always_inline)) +hypre_int hypre_cuda_get_num_warps(sycl::nd_item& item) +{ + return item.get_sub_group().get_group_range().get(0); +} + +/* return the sub_group id in work-group */ +template +static __inline__ __attribute__((always_inline)) +hypre_int hypre_cuda_get_warp_id(sycl::nd_item& item) +{ + return item.get_sub_group().get_group_linear_id(); +} + +/* return the work-item lane id in a sub_group */ +template +static __inline__ __attribute__((always_inline)) +hypre_int hypre_cuda_get_lane_id(sycl::nd_item& item) +{ + return hypre_cuda_get_thread_id(item) & (item.get_sub_group().get_local_range().get(0)-1); +} + +/* return the num of work_groups in nd_range */ +template +static __inline__ __attribute__((always_inline)) +hypre_int hypre_cuda_get_num_blocks(sycl::nd_item& item) +{ + // return item.get_group().get_group_linear_range(); // API available in SYCL 2020 + + switch (dim) + { + case 1: + return (item.get_group_range(0)); + case 2: + return (item.get_group_range(0) * item.get_group_range(1)); + case 3: + return (item.get_group_range(0) * item.get_group_range(1) * item.get_group_range(2)); + } + + return -1; +} + +/* return the flattened or linearlized work-group id in nd_range */ +template +static __inline__ __attribute__((always_inline)) +hypre_int hypre_cuda_get_block_id(sycl::nd_item& item) +{ + return item.get_group_linear_id(); +} + +/* 
return the number of work-items in global iteration space*/ +template +static __inline__ __attribute__((always_inline)) +hypre_int hypre_cuda_get_grid_num_threads(sycl::nd_item& item) +{ + switch (dim) + { + case 1: + return (item.get_global_range(0)); + case 2: + return (item.get_global_range(0) * item.get_global_range(1)); + case 3: + return (item.get_global_range(0) * item.get_global_range(1) * item.get_global_range(2)); + } + + return -1; +} + +/* return the flattened work-item id in global iteration space */ +template +static __inline__ __attribute__((always_inline)) +hypre_int hypre_cuda_get_grid_thread_id(sycl::nd_item& item) +{ + return item.get_global_linear_id(); +} + +/* return the number of sub-groups in global iteration space */ +template +static __inline__ __attribute__((always_inline)) +hypre_int hypre_cuda_get_grid_num_warps(sycl::nd_item& item) +{ + return hypre_cuda_get_num_blocks(item) * hypre_cuda_get_num_warps(item); +} + +/* return the flattened sub-group id in global iteration space */ +template +static __inline__ __attribute__((always_inline)) +hypre_int hypre_cuda_get_grid_warp_id(sycl::nd_item& item) +{ + return hypre_cuda_get_block_id(item) * hypre_cuda_get_num_warps(item) + + hypre_cuda_get_warp_id(item); +} + +/* device_utils.c */ +sycl::range<1> hypre_GetDefaultCUDABlockDimension(); + +sycl::range<1> hypre_GetDefaultCUDAGridDimension( HYPRE_Int n, const char *granularity, sycl::range<1> bDim ); + +#endif // #if defined(HYPRE_USING_SYCL) + #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) #include @@ -467,7 +586,6 @@ using namespace thrust::placeholders; #endif // HYPRE_USING_UMPIRE_DEVICE - /* return the number of threads in block */ template static __device__ __forceinline__ @@ -948,7 +1066,7 @@ struct equal : public thrust::unary_function -/* cuda_utils.c */ +/* device_utils.c */ dim3 hypre_GetDefaultCUDABlockDimension(); dim3 hypre_GetDefaultCUDAGridDimension( HYPRE_Int n, const char *granularity, dim3 bDim ); From 
25348d4e26108f963d60a1032d190fb9cad55aa7 Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Thu, 16 Sep 2021 23:15:09 +0000 Subject: [PATCH 06/44] Remove nonfunctional code for fresh start --- src/struct_mv/_hypre_struct_mv.hpp | 181 +---------------------------- src/struct_mv/boxloop_sycl.h | 181 +---------------------------- 2 files changed, 4 insertions(+), 358 deletions(-) diff --git a/src/struct_mv/_hypre_struct_mv.hpp b/src/struct_mv/_hypre_struct_mv.hpp index d0674b2b0e..beaed26fda 100644 --- a/src/struct_mv/_hypre_struct_mv.hpp +++ b/src/struct_mv/_hypre_struct_mv.hpp @@ -1181,181 +1181,6 @@ typedef struct hypre_Boxloop_struct -/********************************************************************* - * put this in _hypre_utilities.hpp ? - *********************************************************************/ -#define HYPRE_SYCL_1D_LAUNCH(kernel_name, gridsize, blocksize, ...) \ -{ \ - if ( gridsize[0] == 0 || blocksize[0] == 0 ) \ - { \ - hypre_printf("Error %s %d: Invalid SYCL 1D launch parameters grid/block (%d) (%d)\n", \ - __FILE__, __LINE__, \ - gridsize[0], blocksize[0]); \ - assert(0); exit(1); \ - } \ - else \ - { \ - hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) \ - { \ - cgh.parallel_for(sycl::nd_range<1>(gridsize*blocksize, blocksize), [=] (sycl::nd_item<1> item) \ - { (kernel_name)(item, __VA_ARGS__); } ); \ - }).wait_and_throw(); \ - } \ -} - - - -#ifdef __cplusplus -extern "C++" { -#endif - -/********************************************************************* - * forall function and kernel - *********************************************************************/ - -template -void -forall_kernel( sycl::nd_item<1> item, - LOOP_BODY loop_body, - HYPRE_Int length ) -{ - const HYPRE_Int idx = hypre_cuda_get_grid_thread_id<1>(item); - - if (idx < length) - { - loop_body(idx); - } -} - -template -void -BoxLoopforall( LOOP_BODY loop_body, - HYPRE_Int length ) -{ - /* HYPRE_ExecutionPolicy exec_policy = 
hypre_HandleStructExecPolicy(hypre_handle()); */ - /* WM: TODO: uncomment above and remove below */ - HYPRE_ExecutionPolicy exec_policy = HYPRE_EXEC_DEVICE; - - if (exec_policy == HYPRE_EXEC_HOST) - { -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for HYPRE_SMP_SCHEDULE -#endif - for (HYPRE_Int idx = 0; idx < length; idx++) - { - loop_body(idx); - } - } - else if (exec_policy == HYPRE_EXEC_DEVICE) - { - const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); - const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); - - /* HYPRE_CUDA_LAUNCH( forall_kernel, gDim, bDim, loop_body, length ); */ - HYPRE_SYCL_1D_LAUNCH(forall_kernel, gDim, bDim, loop_body, length); - } -} - -#ifdef __cplusplus -} -#endif - -/********************************************************************* - * Init/Declare/IncK etc. - *********************************************************************/ - -/* Get 1-D length of the loop, in hypre__tot */ -#define hypre_newBoxLoopInit(ndim, loop_size) \ - HYPRE_Int hypre__tot = 1; \ - for (HYPRE_Int hypre_d = 0; hypre_d < ndim; hypre_d ++) \ - { \ - hypre__tot *= loop_size[hypre_d]; \ - } - -/* Initialize struct for box-k */ -#define hypre_BoxLoopDataDeclareK(k, ndim, loop_size, dbox, start, stride) \ - hypre_Boxloop databox##k; \ - /* dim 0 */ \ - databox##k.lsize0 = loop_size[0]; \ - databox##k.strides0 = stride[0]; \ - databox##k.bstart0 = start[0] - dbox->imin[0]; \ - databox##k.bsize0 = dbox->imax[0] - dbox->imin[0]; \ - /* dim 1 */ \ - if (ndim > 1) \ - { \ - databox##k.lsize1 = loop_size[1]; \ - databox##k.strides1 = stride[1]; \ - databox##k.bstart1 = start[1] - dbox->imin[1]; \ - databox##k.bsize1 = dbox->imax[1] - dbox->imin[1]; \ - } \ - else \ - { \ - databox##k.lsize1 = 1; \ - databox##k.strides1 = 0; \ - databox##k.bstart1 = 0; \ - databox##k.bsize1 = 0; \ - } \ - /* dim 2 */ \ - if (ndim == 3) \ - { \ - databox##k.lsize2 = loop_size[2]; \ - databox##k.strides2 = stride[2]; \ - 
databox##k.bstart2 = start[2] - dbox->imin[2]; \ - databox##k.bsize2 = dbox->imax[2] - dbox->imin[2]; \ - } \ - else \ - { \ - databox##k.lsize2 = 1; \ - databox##k.strides2 = 0; \ - databox##k.bstart2 = 0; \ - databox##k.bsize2 = 0; \ - } - -/* Given input 1-D 'idx' in box, get 3-D 'local_idx' in loop_size */ -#define hypre_newBoxLoopDeclare(box) \ - hypre_Index local_idx; \ - HYPRE_Int idx_local = idx; \ - hypre_IndexD(local_idx, 0) = idx_local % box.lsize0; \ - idx_local = idx_local / box.lsize0; \ - hypre_IndexD(local_idx, 1) = idx_local % box.lsize1; \ - idx_local = idx_local / box.lsize1; \ - hypre_IndexD(local_idx, 2) = idx_local % box.lsize2; \ - -/* Given input 3-D 'local_idx', get 1-D 'hypre__i' in 'box' */ -#define hypre_BoxLoopIncK(k, box, hypre__i) \ - HYPRE_Int hypre_boxD##k = 1; \ - HYPRE_Int hypre__i = 0; \ - hypre__i += (hypre_IndexD(local_idx, 0) * box.strides0 + box.bstart0) * hypre_boxD##k; \ - hypre_boxD##k *= hypre_max(0, box.bsize0 + 1); \ - hypre__i += (hypre_IndexD(local_idx, 1) * box.strides1 + box.bstart1) * hypre_boxD##k; \ - hypre_boxD##k *= hypre_max(0, box.bsize1 + 1); \ - hypre__i += (hypre_IndexD(local_idx, 2) * box.strides2 + box.bstart2) * hypre_boxD##k; \ - hypre_boxD##k *= hypre_max(0, box.bsize2 + 1); - -/* /1* get 3-D local_idx into 'index' *1/ */ -/* #define hypre_BoxLoopGetIndex(index) \ */ -/* index[0] = hypre_IndexD(local_idx, 0); \ */ -/* index[1] = hypre_IndexD(local_idx, 1); \ */ -/* index[2] = hypre_IndexD(local_idx, 2); */ - - - -/* BoxLoop 1 */ -#define hypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) \ -{ \ - hypre_newBoxLoopInit(ndim, loop_size); \ - hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ - BoxLoopforall( [=] (HYPRE_Int idx) \ - { \ - hypre_newBoxLoopDeclare(databox1); \ - hypre_BoxLoopIncK(1, databox1, i1); - -#define hypre_newBoxLoop1End(i1) \ - }, hypre__tot); \ -} - - - @@ -1604,10 +1429,8 @@ BoxLoopforall( LOOP_BODY loop_body, #define hypre_BoxLoopBlock 
zypre_BoxLoopBlock #define hypre_BoxLoop0Begin zypre_newBoxLoop0Begin #define hypre_BoxLoop0End zypre_newBoxLoop0End -/* #define hypre_BoxLoop1Begin zypre_newBoxLoop1Begin */ -/* #define hypre_BoxLoop1End zypre_newBoxLoop1End */ -#define hypre_BoxLoop1Begin hypre_newBoxLoop1Begin -#define hypre_BoxLoop1End hypre_newBoxLoop1End +#define hypre_BoxLoop1Begin zypre_newBoxLoop1Begin +#define hypre_BoxLoop1End zypre_newBoxLoop1End #define hypre_BoxLoop2Begin zypre_newBoxLoop2Begin #define hypre_BoxLoop2End zypre_newBoxLoop2End #define hypre_BoxLoop3Begin zypre_newBoxLoop3Begin diff --git a/src/struct_mv/boxloop_sycl.h b/src/struct_mv/boxloop_sycl.h index 3874668ef8..0804d42fb7 100644 --- a/src/struct_mv/boxloop_sycl.h +++ b/src/struct_mv/boxloop_sycl.h @@ -29,181 +29,6 @@ typedef struct hypre_Boxloop_struct -/********************************************************************* - * put this in _hypre_utilities.hpp ? - *********************************************************************/ -#define HYPRE_SYCL_1D_LAUNCH(kernel_name, gridsize, blocksize, ...) 
\ -{ \ - if ( gridsize[0] == 0 || blocksize[0] == 0 ) \ - { \ - hypre_printf("Error %s %d: Invalid SYCL 1D launch parameters grid/block (%d) (%d)\n", \ - __FILE__, __LINE__, \ - gridsize[0], blocksize[0]); \ - assert(0); exit(1); \ - } \ - else \ - { \ - hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) \ - { \ - cgh.parallel_for(sycl::nd_range<1>(gridsize*blocksize, blocksize), [=] (sycl::nd_item<1> item) \ - { (kernel_name)(item, __VA_ARGS__); } ); \ - }).wait_and_throw(); \ - } \ -} - - - -#ifdef __cplusplus -extern "C++" { -#endif - -/********************************************************************* - * forall function and kernel - *********************************************************************/ - -template -void -forall_kernel( sycl::nd_item<1> item, - LOOP_BODY loop_body, - HYPRE_Int length ) -{ - const HYPRE_Int idx = hypre_cuda_get_grid_thread_id<1>(item); - - if (idx < length) - { - loop_body(idx); - } -} - -template -void -BoxLoopforall( LOOP_BODY loop_body, - HYPRE_Int length ) -{ - /* HYPRE_ExecutionPolicy exec_policy = hypre_HandleStructExecPolicy(hypre_handle()); */ - /* WM: TODO: uncomment above and remove below */ - HYPRE_ExecutionPolicy exec_policy = HYPRE_EXEC_DEVICE; - - if (exec_policy == HYPRE_EXEC_HOST) - { -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for HYPRE_SMP_SCHEDULE -#endif - for (HYPRE_Int idx = 0; idx < length; idx++) - { - loop_body(idx); - } - } - else if (exec_policy == HYPRE_EXEC_DEVICE) - { - const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); - const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); - - /* HYPRE_CUDA_LAUNCH( forall_kernel, gDim, bDim, loop_body, length ); */ - HYPRE_SYCL_1D_LAUNCH(forall_kernel, gDim, bDim, loop_body, length); - } -} - -#ifdef __cplusplus -} -#endif - -/********************************************************************* - * Init/Declare/IncK etc. 
- *********************************************************************/ - -/* Get 1-D length of the loop, in hypre__tot */ -#define hypre_newBoxLoopInit(ndim, loop_size) \ - HYPRE_Int hypre__tot = 1; \ - for (HYPRE_Int hypre_d = 0; hypre_d < ndim; hypre_d ++) \ - { \ - hypre__tot *= loop_size[hypre_d]; \ - } - -/* Initialize struct for box-k */ -#define hypre_BoxLoopDataDeclareK(k, ndim, loop_size, dbox, start, stride) \ - hypre_Boxloop databox##k; \ - /* dim 0 */ \ - databox##k.lsize0 = loop_size[0]; \ - databox##k.strides0 = stride[0]; \ - databox##k.bstart0 = start[0] - dbox->imin[0]; \ - databox##k.bsize0 = dbox->imax[0] - dbox->imin[0]; \ - /* dim 1 */ \ - if (ndim > 1) \ - { \ - databox##k.lsize1 = loop_size[1]; \ - databox##k.strides1 = stride[1]; \ - databox##k.bstart1 = start[1] - dbox->imin[1]; \ - databox##k.bsize1 = dbox->imax[1] - dbox->imin[1]; \ - } \ - else \ - { \ - databox##k.lsize1 = 1; \ - databox##k.strides1 = 0; \ - databox##k.bstart1 = 0; \ - databox##k.bsize1 = 0; \ - } \ - /* dim 2 */ \ - if (ndim == 3) \ - { \ - databox##k.lsize2 = loop_size[2]; \ - databox##k.strides2 = stride[2]; \ - databox##k.bstart2 = start[2] - dbox->imin[2]; \ - databox##k.bsize2 = dbox->imax[2] - dbox->imin[2]; \ - } \ - else \ - { \ - databox##k.lsize2 = 1; \ - databox##k.strides2 = 0; \ - databox##k.bstart2 = 0; \ - databox##k.bsize2 = 0; \ - } - -/* Given input 1-D 'idx' in box, get 3-D 'local_idx' in loop_size */ -#define hypre_newBoxLoopDeclare(box) \ - hypre_Index local_idx; \ - HYPRE_Int idx_local = idx; \ - hypre_IndexD(local_idx, 0) = idx_local % box.lsize0; \ - idx_local = idx_local / box.lsize0; \ - hypre_IndexD(local_idx, 1) = idx_local % box.lsize1; \ - idx_local = idx_local / box.lsize1; \ - hypre_IndexD(local_idx, 2) = idx_local % box.lsize2; \ - -/* Given input 3-D 'local_idx', get 1-D 'hypre__i' in 'box' */ -#define hypre_BoxLoopIncK(k, box, hypre__i) \ - HYPRE_Int hypre_boxD##k = 1; \ - HYPRE_Int hypre__i = 0; \ - hypre__i += 
(hypre_IndexD(local_idx, 0) * box.strides0 + box.bstart0) * hypre_boxD##k; \ - hypre_boxD##k *= hypre_max(0, box.bsize0 + 1); \ - hypre__i += (hypre_IndexD(local_idx, 1) * box.strides1 + box.bstart1) * hypre_boxD##k; \ - hypre_boxD##k *= hypre_max(0, box.bsize1 + 1); \ - hypre__i += (hypre_IndexD(local_idx, 2) * box.strides2 + box.bstart2) * hypre_boxD##k; \ - hypre_boxD##k *= hypre_max(0, box.bsize2 + 1); - -/* /1* get 3-D local_idx into 'index' *1/ */ -/* #define hypre_BoxLoopGetIndex(index) \ */ -/* index[0] = hypre_IndexD(local_idx, 0); \ */ -/* index[1] = hypre_IndexD(local_idx, 1); \ */ -/* index[2] = hypre_IndexD(local_idx, 2); */ - - - -/* BoxLoop 1 */ -#define hypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) \ -{ \ - hypre_newBoxLoopInit(ndim, loop_size); \ - hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ - BoxLoopforall( [=] (HYPRE_Int idx) \ - { \ - hypre_newBoxLoopDeclare(databox1); \ - hypre_BoxLoopIncK(1, databox1, i1); - -#define hypre_newBoxLoop1End(i1) \ - }, hypre__tot); \ -} - - - @@ -452,10 +277,8 @@ BoxLoopforall( LOOP_BODY loop_body, #define hypre_BoxLoopBlock zypre_BoxLoopBlock #define hypre_BoxLoop0Begin zypre_newBoxLoop0Begin #define hypre_BoxLoop0End zypre_newBoxLoop0End -/* #define hypre_BoxLoop1Begin zypre_newBoxLoop1Begin */ -/* #define hypre_BoxLoop1End zypre_newBoxLoop1End */ -#define hypre_BoxLoop1Begin hypre_newBoxLoop1Begin -#define hypre_BoxLoop1End hypre_newBoxLoop1End +#define hypre_BoxLoop1Begin zypre_newBoxLoop1Begin +#define hypre_BoxLoop1End zypre_newBoxLoop1End #define hypre_BoxLoop2Begin zypre_newBoxLoop2Begin #define hypre_BoxLoop2End zypre_newBoxLoop2End #define hypre_BoxLoop3Begin zypre_newBoxLoop3Begin From 58b6e23f9e8b724a9f026e49ae42d3bbf91e6165 Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Thu, 16 Sep 2021 18:08:22 -0700 Subject: [PATCH 07/44] Add simple driver and remove problematic flag from configure --- src/configure | 2 +- src/test/simple.c | 623 
++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 624 insertions(+), 1 deletion(-) create mode 100644 src/test/simple.c diff --git a/src/configure b/src/configure index 328027a730..b3a0f72762 100755 --- a/src/configure +++ b/src/configure @@ -9094,7 +9094,7 @@ done SYCLCXXFLAGS="-fsycl -fsycl-unnamed-lambda -fsycl-device-code-split=per_kernel " if test x"$hypre_using_debug" == x"yes"; then : - SYCLCXXFLAGS="-O0 -Wall -g -gdbx ${SYCLCXXFLAGS}" + SYCLCXXFLAGS="-O0 -Wall -g ${SYCLCXXFLAGS}" elif SYCLCXXFLAGS="-O2 ${SYCLCXXFLAGS}"; then : fi diff --git a/src/test/simple.c b/src/test/simple.c new file mode 100644 index 0000000000..0649fef677 --- /dev/null +++ b/src/test/simple.c @@ -0,0 +1,623 @@ +/* WM: todo - remove this file from git */ + +#include "_hypre_utilities.h" +#include "_hypre_utilities.hpp" +#include "HYPRE.h" +#include "_hypre_struct_mv.h" +#include "_hypre_struct_mv.hpp" + +HYPRE_Int AddValuesVector( hypre_StructGrid *gridvector, + hypre_StructVector *zvector, + HYPRE_Int *period, + HYPRE_Real value ) ; + + + +/********************************************************************* + * put this in _hypre_utilities.hpp ? + * WM: todo - if you can wrap the basic parallel_for call for use elsewhere... + *********************************************************************/ +/* #define HYPRE_SYCL_1D_LAUNCH(kernel_name, gridsize, blocksize, ...) 
\ */ +/* { \ */ +/* if ( gridsize[0] == 0 || blocksize[0] == 0 ) \ */ +/* { \ */ +/* hypre_printf("Error %s %d: Invalid SYCL 1D launch parameters grid/block (%d) (%d)\n", \ */ +/* __FILE__, __LINE__, \ */ +/* gridsize[0], blocksize[0]); \ */ +/* assert(0); exit(1); \ */ +/* } \ */ +/* else \ */ +/* { \ */ +/* hypre_printf("WM: debug - inside BoxLoopforall(), submitting to queue\n"); \ */ +/* hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) \ */ +/* { \ */ +/* cgh.parallel_for(sycl::nd_range<1>(gridsize*blocksize, blocksize), [=] (sycl::nd_item<1> item) \ */ +/* { (kernel_name)(item, __VA_ARGS__); } ); \ */ +/* }).wait_and_throw(); \ */ +/* } \ */ +/* } */ + + + +#ifdef __cplusplus +extern "C++" { +#endif + +/********************************************************************* + * forall function + *********************************************************************/ + +template +void +BoxLoopforall( LOOP_BODY loop_body, + HYPRE_Int length ) +{ + /* HYPRE_ExecutionPolicy exec_policy = hypre_HandleStructExecPolicy(hypre_handle()); */ + /* WM: TODO: uncomment above and remove below */ + HYPRE_ExecutionPolicy exec_policy = HYPRE_EXEC_DEVICE; + + if (exec_policy == HYPRE_EXEC_HOST) + { +/* WM: todo - is this really necessary, even? */ +/* #ifdef HYPRE_USING_OPENMP */ +/* #pragma omp parallel for HYPRE_SMP_SCHEDULE */ +/* #endif */ +/* for (HYPRE_Int idx = 0; idx < length; idx++) */ +/* { */ +/* loop_body(idx); */ +/* } */ + } + else if (exec_policy == HYPRE_EXEC_DEVICE) + { + const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); + const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); + + hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) + { + cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), loop_body); + }).wait_and_throw(); + } +} + +#ifdef __cplusplus +} +#endif + +/********************************************************************* + * Init/Declare/IncK etc. 
+ *********************************************************************/ + +/* Get 1-D length of the loop, in hypre__tot */ +#define hypre_newBoxLoopInit(ndim, loop_size) \ + HYPRE_Int hypre__tot = 1; \ + for (HYPRE_Int hypre_d = 0; hypre_d < ndim; hypre_d ++) \ + { \ + hypre__tot *= loop_size[hypre_d]; \ + } + +/* Initialize struct for box-k */ +#define hypre_BoxLoopDataDeclareK(k, ndim, loop_size, dbox, start, stride) \ + hypre_Boxloop databox##k; \ + /* dim 0 */ \ + databox##k.lsize0 = loop_size[0]; \ + databox##k.strides0 = stride[0]; \ + databox##k.bstart0 = start[0] - dbox->imin[0]; \ + databox##k.bsize0 = dbox->imax[0] - dbox->imin[0]; \ + /* dim 1 */ \ + if (ndim > 1) \ + { \ + databox##k.lsize1 = loop_size[1]; \ + databox##k.strides1 = stride[1]; \ + databox##k.bstart1 = start[1] - dbox->imin[1]; \ + databox##k.bsize1 = dbox->imax[1] - dbox->imin[1]; \ + } \ + else \ + { \ + databox##k.lsize1 = 1; \ + databox##k.strides1 = 0; \ + databox##k.bstart1 = 0; \ + databox##k.bsize1 = 0; \ + } \ + /* dim 2 */ \ + if (ndim == 3) \ + { \ + databox##k.lsize2 = loop_size[2]; \ + databox##k.strides2 = stride[2]; \ + databox##k.bstart2 = start[2] - dbox->imin[2]; \ + databox##k.bsize2 = dbox->imax[2] - dbox->imin[2]; \ + } \ + else \ + { \ + databox##k.lsize2 = 1; \ + databox##k.strides2 = 0; \ + databox##k.bstart2 = 0; \ + databox##k.bsize2 = 0; \ + } + +/* Given input 1-D 'idx' in box, get 3-D 'local_idx' in loop_size */ +#define hypre_newBoxLoopDeclare(box) \ + hypre_Index local_idx; \ + size_t idx_local = item.get_local_id(0); \ + hypre_IndexD(local_idx, 0) = idx_local % box.lsize0; \ + idx_local = idx_local / box.lsize0; \ + hypre_IndexD(local_idx, 1) = idx_local % box.lsize1; \ + idx_local = idx_local / box.lsize1; \ + hypre_IndexD(local_idx, 2) = idx_local % box.lsize2; \ + +/* Given input 3-D 'local_idx', get 1-D 'hypre__i' in 'box' */ +#define hypre_BoxLoopIncK(k, box, hypre__i) \ + HYPRE_Int hypre_boxD##k = 1; \ + HYPRE_Int hypre__i = 0; \ + hypre__i += 
(hypre_IndexD(local_idx, 0) * box.strides0 + box.bstart0) * hypre_boxD##k; \ + hypre_boxD##k *= hypre_max(0, box.bsize0 + 1); \ + hypre__i += (hypre_IndexD(local_idx, 1) * box.strides1 + box.bstart1) * hypre_boxD##k; \ + hypre_boxD##k *= hypre_max(0, box.bsize1 + 1); \ + hypre__i += (hypre_IndexD(local_idx, 2) * box.strides2 + box.bstart2) * hypre_boxD##k; \ + hypre_boxD##k *= hypre_max(0, box.bsize2 + 1); + + + +/* BoxLoop 1 */ +#define hypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ + BoxLoopforall( [=] (sycl::nd_item<1> item) \ + { \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1); + +#define hypre_newBoxLoop1End(i1) \ + }, hypre__tot); \ +} + +#define my_hypre_BoxLoop1Begin hypre_newBoxLoop1Begin +#define my_hypre_BoxLoop1End hypre_newBoxLoop1End + +HYPRE_Int +my_hypre_StructVectorSetConstantValues( hypre_StructVector *vector, + HYPRE_Complex values ) +{ + hypre_Box *v_data_box; + + HYPRE_Complex *vp; + + hypre_BoxArray *boxes; + hypre_Box *box; + hypre_Index loop_size; + hypre_IndexRef start; + hypre_Index unit_stride; + + HYPRE_Int i; + + /*----------------------------------------------------------------------- + * Set the vector coefficients + *-----------------------------------------------------------------------*/ + + hypre_SetIndex(unit_stride, 1); + + boxes = hypre_StructGridBoxes(hypre_StructVectorGrid(vector)); + hypre_ForBoxI(i, boxes) + { + box = hypre_BoxArrayBox(boxes, i); + start = hypre_BoxIMin(box); + + v_data_box = + hypre_BoxArrayBox(hypre_StructVectorDataSpace(vector), i); + vp = hypre_StructVectorBoxData(vector, i); + + hypre_BoxGetSize(box, loop_size); + + // WM: question - What's DEVICE_VAR? 
+#define DEVICE_VAR is_device_ptr(vp) + my_hypre_BoxLoop1Begin(hypre_StructVectorNDim(vector), loop_size, + v_data_box, start, unit_stride, vi); + { + vp[vi] = values; + } + my_hypre_BoxLoop1End(vi); +#undef DEVICE_VAR + } + + return hypre_error_flag; +} + +HYPRE_Int +my_hypre_StructAxpy( HYPRE_Complex alpha, + hypre_StructVector *x, + hypre_StructVector *y ) +{ + hypre_Box *x_data_box; + hypre_Box *y_data_box; + + HYPRE_Complex *xp; + HYPRE_Complex *yp; + + hypre_BoxArray *boxes; + hypre_Box *box; + hypre_Index loop_size; + hypre_IndexRef start; + hypre_Index unit_stride; + + HYPRE_Int i; + + hypre_SetIndex(unit_stride, 1); + + boxes = hypre_StructGridBoxes(hypre_StructVectorGrid(y)); + hypre_ForBoxI(i, boxes) + { + box = hypre_BoxArrayBox(boxes, i); + start = hypre_BoxIMin(box); + + x_data_box = hypre_BoxArrayBox(hypre_StructVectorDataSpace(x), i); + y_data_box = hypre_BoxArrayBox(hypre_StructVectorDataSpace(y), i); + + xp = hypre_StructVectorBoxData(x, i); + yp = hypre_StructVectorBoxData(y, i); + + hypre_BoxGetSize(box, loop_size); + +/* WM: what is the DEVICE_VAR thing? 
*/ +#define DEVICE_VAR is_device_ptr(yp,xp) + /* WM: todo */ + /* my_hypre_BoxLoop2Begin(hypre_StructVectorNDim(x), loop_size, */ + /* x_data_box, start, unit_stride, xi, */ + /* y_data_box, start, unit_stride, yi); */ + /* { */ + /* yp[yi] += alpha * xp[xi]; */ + /* } */ + /* my_hypre_BoxLoop2End(xi, yi); */ +#undef DEVICE_VAR + } + + return hypre_error_flag; +} + + +/**************************** + * main + ****************************/ + +hypre_int +main( hypre_int argc, + char *argv[] ) +{ + /* variables */ + HYPRE_Int i, ix, iy, iz, ib; + HYPRE_Int p, q, r; + HYPRE_Int nx, ny, nz; + HYPRE_Int bx, by, bz; + HYPRE_Int nblocks; + HYPRE_Int dim; + HYPRE_Int sym; + HYPRE_Int **offsets; + HYPRE_Int **iupper; + HYPRE_Int **ilower; + HYPRE_Int periodic[3]; + HYPRE_Int istart[3]; + HYPRE_StructGrid grid; + HYPRE_StructVector b; + HYPRE_StructVector x; + HYPRE_Int num_ghost[6] = {0, 0, 0, 0, 0, 0}; + + dim = 1; + sym = 1; + nx = 1000; + ny = 1; + nz = 1; + bx = 1; + by = 1; + bz = 1; + p = 1; + q = 1; + r = 1; + periodic[0] = 0; + periodic[1] = 0; + periodic[2] = 0; + istart[0] = -3; + istart[1] = -3; + istart[2] = -3; + + for (i = 0; i < 2*dim; i++) + { + num_ghost[i] = 1; + } + + switch (dim) + { + case 1: + nblocks = bx; + if(sym) + { + offsets = hypre_CTAlloc(HYPRE_Int*, 2, HYPRE_MEMORY_HOST); + offsets[0] = hypre_CTAlloc(HYPRE_Int, 1, HYPRE_MEMORY_HOST); + offsets[0][0] = -1; + offsets[1] = hypre_CTAlloc(HYPRE_Int, 1, HYPRE_MEMORY_HOST); + offsets[1][0] = 0; + } + else + { + offsets = hypre_CTAlloc(HYPRE_Int*, 3, HYPRE_MEMORY_HOST); + offsets[0] = hypre_CTAlloc(HYPRE_Int, 1, HYPRE_MEMORY_HOST); + offsets[0][0] = -1; + offsets[1] = hypre_CTAlloc(HYPRE_Int, 1, HYPRE_MEMORY_HOST); + offsets[1][0] = 0; + offsets[2] = hypre_CTAlloc(HYPRE_Int, 1, HYPRE_MEMORY_HOST); + offsets[2][0] = 1; + } + break; + + case 2: + nblocks = bx*by; + if(sym) + { + offsets = hypre_CTAlloc(HYPRE_Int*, 3, HYPRE_MEMORY_HOST); + offsets[0] = hypre_CTAlloc(HYPRE_Int, 2, HYPRE_MEMORY_HOST); + 
offsets[0][0] = -1; + offsets[0][1] = 0; + offsets[1] = hypre_CTAlloc(HYPRE_Int, 2, HYPRE_MEMORY_HOST); + offsets[1][0] = 0; + offsets[1][1] = -1; + offsets[2] = hypre_CTAlloc(HYPRE_Int, 2, HYPRE_MEMORY_HOST); + offsets[2][0] = 0; + offsets[2][1] = 0; + } + else + { + offsets = hypre_CTAlloc(HYPRE_Int*, 5, HYPRE_MEMORY_HOST); + offsets[0] = hypre_CTAlloc(HYPRE_Int, 2, HYPRE_MEMORY_HOST); + offsets[0][0] = -1; + offsets[0][1] = 0; + offsets[1] = hypre_CTAlloc(HYPRE_Int, 2, HYPRE_MEMORY_HOST); + offsets[1][0] = 0; + offsets[1][1] = -1; + offsets[2] = hypre_CTAlloc(HYPRE_Int, 2, HYPRE_MEMORY_HOST); + offsets[2][0] = 0; + offsets[2][1] = 0; + offsets[3] = hypre_CTAlloc(HYPRE_Int, 2, HYPRE_MEMORY_HOST); + offsets[3][0] = 1; + offsets[3][1] = 0; + offsets[4] = hypre_CTAlloc(HYPRE_Int, 2, HYPRE_MEMORY_HOST); + offsets[4][0] = 0; + offsets[4][1] = 1; + } + break; + + case 3: + nblocks = bx*by*bz; + if(sym) + { + offsets = hypre_CTAlloc(HYPRE_Int*, 4, HYPRE_MEMORY_HOST); + offsets[0] = hypre_CTAlloc(HYPRE_Int, 3, HYPRE_MEMORY_HOST); + offsets[0][0] = -1; + offsets[0][1] = 0; + offsets[0][2] = 0; + offsets[1] = hypre_CTAlloc(HYPRE_Int, 3, HYPRE_MEMORY_HOST); + offsets[1][0] = 0; + offsets[1][1] = -1; + offsets[1][2] = 0; + offsets[2] = hypre_CTAlloc(HYPRE_Int, 3, HYPRE_MEMORY_HOST); + offsets[2][0] = 0; + offsets[2][1] = 0; + offsets[2][2] = -1; + offsets[3] = hypre_CTAlloc(HYPRE_Int, 3, HYPRE_MEMORY_HOST); + offsets[3][0] = 0; + offsets[3][1] = 0; + offsets[3][2] = 0; + } + else + { + offsets = hypre_CTAlloc(HYPRE_Int*, 7, HYPRE_MEMORY_HOST); + offsets[0] = hypre_CTAlloc(HYPRE_Int, 3, HYPRE_MEMORY_HOST); + offsets[0][0] = -1; + offsets[0][1] = 0; + offsets[0][2] = 0; + offsets[1] = hypre_CTAlloc(HYPRE_Int, 3, HYPRE_MEMORY_HOST); + offsets[1][0] = 0; + offsets[1][1] = -1; + offsets[1][2] = 0; + offsets[2] = hypre_CTAlloc(HYPRE_Int, 3, HYPRE_MEMORY_HOST); + offsets[2][0] = 0; + offsets[2][1] = 0; + offsets[2][2] = -1; + offsets[3] = hypre_CTAlloc(HYPRE_Int, 3, 
HYPRE_MEMORY_HOST); + offsets[3][0] = 0; + offsets[3][1] = 0; + offsets[3][2] = 0; + offsets[4] = hypre_CTAlloc(HYPRE_Int, 3, HYPRE_MEMORY_HOST); + offsets[4][0] = 1; + offsets[4][1] = 0; + offsets[4][2] = 0; + offsets[5] = hypre_CTAlloc(HYPRE_Int, 3, HYPRE_MEMORY_HOST); + offsets[5][0] = 0; + offsets[5][1] = 1; + offsets[5][2] = 0; + offsets[6] = hypre_CTAlloc(HYPRE_Int, 3, HYPRE_MEMORY_HOST); + offsets[6][0] = 0; + offsets[6][1] = 0; + offsets[6][2] = 1; + } + break; + } + + + + /* initialize */ + hypre_MPI_Init(&argc, &argv); + HYPRE_Init(); + + /* prepare space for the extents */ + ilower = hypre_CTAlloc(HYPRE_Int*, nblocks, HYPRE_MEMORY_HOST); + iupper = hypre_CTAlloc(HYPRE_Int*, nblocks, HYPRE_MEMORY_HOST); + for (i = 0; i < nblocks; i++) + { + ilower[i] = hypre_CTAlloc(HYPRE_Int, dim, HYPRE_MEMORY_HOST); + iupper[i] = hypre_CTAlloc(HYPRE_Int, dim, HYPRE_MEMORY_HOST); + } + + /* compute ilower and iupper from (p,q,r), (bx,by,bz), and (nx,ny,nz) */ + ib = 0; + switch (dim) + { + case 1: + for (ix = 0; ix < bx; ix++) + { + ilower[ib][0] = istart[0]+ nx*(bx*p+ix); + iupper[ib][0] = istart[0]+ nx*(bx*p+ix+1) - 1; + ib++; + } + break; + case 2: + for (iy = 0; iy < by; iy++) + for (ix = 0; ix < bx; ix++) + { + ilower[ib][0] = istart[0]+ nx*(bx*p+ix); + iupper[ib][0] = istart[0]+ nx*(bx*p+ix+1) - 1; + ilower[ib][1] = istart[1]+ ny*(by*q+iy); + iupper[ib][1] = istart[1]+ ny*(by*q+iy+1) - 1; + ib++; + } + break; + case 3: + for (iz = 0; iz < bz; iz++) + for (iy = 0; iy < by; iy++) + for (ix = 0; ix < bx; ix++) + { + ilower[ib][0] = istart[0]+ nx*(bx*p+ix); + iupper[ib][0] = istart[0]+ nx*(bx*p+ix+1) - 1; + ilower[ib][1] = istart[1]+ ny*(by*q+iy); + iupper[ib][1] = istart[1]+ ny*(by*q+iy+1) - 1; + ilower[ib][2] = istart[2]+ nz*(bz*r+iz); + iupper[ib][2] = istart[2]+ nz*(bz*r+iz+1) - 1; + ib++; + } + break; + } + /* create grid */ + HYPRE_StructGridCreate(hypre_MPI_COMM_WORLD, dim, &grid); + for (ib = 0; ib < nblocks; ib++) + { + /* Add to the grid a new box defined by 
ilower[ib], iupper[ib]...*/ + HYPRE_StructGridSetExtents(grid, ilower[ib], iupper[ib]); + } + HYPRE_StructGridSetPeriodic(grid, periodic); + HYPRE_StructGridSetNumGhost(grid, num_ghost); + HYPRE_StructGridAssemble(grid); + + /* create struct vectors */ + HYPRE_StructVectorCreate(hypre_MPI_COMM_WORLD, grid, &b); + HYPRE_StructVectorInitialize(b); + AddValuesVector(grid,b,periodic,1.0); + HYPRE_StructVectorAssemble(b); + + HYPRE_StructVectorCreate(hypre_MPI_COMM_WORLD, grid, &x); + HYPRE_StructVectorInitialize(x); + AddValuesVector(grid,x,periodic,1.0); + HYPRE_StructVectorAssemble(x); + + + /* call set const */ + my_hypre_StructVectorSetConstantValues(x, 1.0); + + /* call axpy */ + /* my_hypre_StructAxpy(1.0, x, b); */ + + + + + + + + hypre_printf("DONE\n"); + return 0; +} + +HYPRE_Int +AddValuesVector( hypre_StructGrid *gridvector, + hypre_StructVector *zvector, + HYPRE_Int *period, + HYPRE_Real value ) +{ +/* #include "_hypre_struct_mv.h" */ + HYPRE_Int ierr = 0; + hypre_BoxArray *gridboxes; + HYPRE_Int ib; + hypre_IndexRef ilower; + hypre_IndexRef iupper; + hypre_Box *box; + HYPRE_Real *values; + HYPRE_Int volume,dim; +#if 0 //defined(HYPRE_USING_CUDA) + HYPRE_Int data_location = hypre_StructGridDataLocation(hypre_StructVectorGrid(zvector)); +#endif + + gridboxes = hypre_StructGridBoxes(gridvector); + dim = hypre_StructGridNDim(gridvector); + + ib=0; + hypre_ForBoxI(ib, gridboxes) + { + box = hypre_BoxArrayBox(gridboxes, ib); + volume = hypre_BoxVolume(box); +#if 0 //defined(HYPRE_USING_CUDA) + if (data_location != HYPRE_MEMORY_HOST) + { + values = hypre_CTAlloc(HYPRE_Real, volume,HYPRE_MEMORY_DEVICE); + } + else + { + values = hypre_CTAlloc(HYPRE_Real, volume,HYPRE_MEMORY_HOST); + } +#else + values = hypre_CTAlloc(HYPRE_Real, volume,HYPRE_MEMORY_DEVICE); +#endif + /*----------------------------------------------------------- + * For periodic b.c. in all directions, need rhs to satisfy + * compatibility condition. 
Achieved by setting a source and + * sink of equal strength. All other problems have rhs = 1. + *-----------------------------------------------------------*/ + +#define DEVICE_VAR is_device_ptr(values) + if ((dim == 2 && period[0] != 0 && period[1] != 0) || + (dim == 3 && period[0] != 0 && period[1] != 0 && period[2] != 0)) + { + hypre_LoopBegin(volume,i) + { + values[i] = 0.0; + values[0] = value; + values[volume - 1] = -value; + + } + hypre_LoopEnd() + } + else + { + hypre_LoopBegin(volume,i) + { + values[i] = value; + } + hypre_LoopEnd() + } +#undef DEVICE_VAR + + ilower = hypre_BoxIMin(box); + iupper = hypre_BoxIMax(box); + + HYPRE_StructVectorSetBoxValues(zvector, ilower, iupper, values); + +#if 0 //defined(HYPRE_USING_CUDA) + if (data_location != HYPRE_MEMORY_HOST) + { + hypre_TFree(values,HYPRE_MEMORY_DEVICE); + } + else + { + hypre_TFree(values,HYPRE_MEMORY_HOST); + } +#else + hypre_TFree(values,HYPRE_MEMORY_DEVICE); +#endif + } + + return ierr; +} From 0c58ebe19bbbb0e256fb6cb969a45d1c111d8318 Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Mon, 27 Sep 2021 16:00:54 -0700 Subject: [PATCH 08/44] Reproducing invalid kernel name error in simple --- src/struct_mv/_hypre_struct_mv.hpp | 175 ++++++++++++++++++++++- src/struct_mv/boxloop_sycl.h | 175 ++++++++++++++++++++++- src/test/simple.c | 221 +++++++---------------------- src/utilities/device_utils.c | 15 +- 4 files changed, 406 insertions(+), 180 deletions(-) diff --git a/src/struct_mv/_hypre_struct_mv.hpp b/src/struct_mv/_hypre_struct_mv.hpp index beaed26fda..0cc8ba2619 100644 --- a/src/struct_mv/_hypre_struct_mv.hpp +++ b/src/struct_mv/_hypre_struct_mv.hpp @@ -1179,6 +1179,169 @@ typedef struct hypre_Boxloop_struct } hypre_Boxloop; +#ifdef __cplusplus +extern "C++" { +#endif + +/********************************************************************* + * forall function + *********************************************************************/ + +template +void +BoxLoopforall( LOOP_BODY loop_body, + 
HYPRE_Int length ) +{ + /* HYPRE_ExecutionPolicy exec_policy = hypre_HandleStructExecPolicy(hypre_handle()); */ + /* WM: TODO: uncomment above and remove below */ + HYPRE_ExecutionPolicy exec_policy = HYPRE_EXEC_DEVICE; + + if (exec_policy == HYPRE_EXEC_HOST) + { +/* WM: todo - is this really necessary, even? */ +/* #ifdef HYPRE_USING_OPENMP */ +/* #pragma omp parallel for HYPRE_SMP_SCHEDULE */ +/* #endif */ +/* for (HYPRE_Int idx = 0; idx < length; idx++) */ +/* { */ +/* loop_body(idx); */ +/* } */ + } + else if (exec_policy == HYPRE_EXEC_DEVICE) + { + const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); + const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); + + hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) + { + cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), loop_body); + }).wait_and_throw(); + } +} + +#ifdef __cplusplus +} +#endif + +/********************************************************************* + * Init/Declare/IncK etc. 
+ *********************************************************************/ + +/* Get 1-D length of the loop, in hypre__tot */ +#define hypre_newBoxLoopInit(ndim, loop_size) \ + HYPRE_Int hypre__tot = 1; \ + for (HYPRE_Int hypre_d = 0; hypre_d < ndim; hypre_d ++) \ + { \ + hypre__tot *= loop_size[hypre_d]; \ + } + +/* Initialize struct for box-k */ +#define hypre_BoxLoopDataDeclareK(k, ndim, loop_size, dbox, start, stride) \ + hypre_Boxloop databox##k; \ + databox##k.lsize0 = loop_size[0]; \ + databox##k.strides0 = stride[0]; \ + databox##k.bstart0 = start[0] - dbox->imin[0]; \ + databox##k.bsize0 = dbox->imax[0] - dbox->imin[0]; \ + if (ndim > 1) \ + { \ + databox##k.lsize1 = loop_size[1]; \ + databox##k.strides1 = stride[1]; \ + databox##k.bstart1 = start[1] - dbox->imin[1]; \ + databox##k.bsize1 = dbox->imax[1] - dbox->imin[1]; \ + } \ + else \ + { \ + databox##k.lsize1 = 1; \ + databox##k.strides1 = 0; \ + databox##k.bstart1 = 0; \ + databox##k.bsize1 = 0; \ + } \ + if (ndim == 3) \ + { \ + databox##k.lsize2 = loop_size[2]; \ + databox##k.strides2 = stride[2]; \ + databox##k.bstart2 = start[2] - dbox->imin[2]; \ + databox##k.bsize2 = dbox->imax[2] - dbox->imin[2]; \ + } \ + else \ + { \ + databox##k.lsize2 = 1; \ + databox##k.strides2 = 0; \ + databox##k.bstart2 = 0; \ + databox##k.bsize2 = 0; \ + } + +/* Given input 1-D 'idx' in box, get 3-D 'local_idx' in loop_size */ +/* WM: todo - double check that item.get_local_id(0) is actually what you want below */ +#define hypre_newBoxLoopDeclare(box) \ + hypre_Index local_idx; \ + size_t idx_local = item.get_local_id(0); \ + hypre_IndexD(local_idx, 0) = idx_local % box.lsize0; \ + idx_local = idx_local / box.lsize0; \ + hypre_IndexD(local_idx, 1) = idx_local % box.lsize1; \ + idx_local = idx_local / box.lsize1; \ + hypre_IndexD(local_idx, 2) = idx_local % box.lsize2; \ + +/* Given input 3-D 'local_idx', get 1-D 'hypre__i' in 'box' */ +#define hypre_BoxLoopIncK(k, box, hypre__i) \ + HYPRE_Int hypre_boxD##k = 1; \ + 
HYPRE_Int hypre__i = 0; \ + hypre__i += (hypre_IndexD(local_idx, 0) * box.strides0 + box.bstart0) * hypre_boxD##k; \ + hypre_boxD##k *= hypre_max(0, box.bsize0 + 1); \ + hypre__i += (hypre_IndexD(local_idx, 1) * box.strides1 + box.bstart1) * hypre_boxD##k; \ + hypre_boxD##k *= hypre_max(0, box.bsize1 + 1); \ + hypre__i += (hypre_IndexD(local_idx, 2) * box.strides2 + box.bstart2) * hypre_boxD##k; \ + hypre_boxD##k *= hypre_max(0, box.bsize2 + 1); + + + +/* BoxLoop 1 */ +/* #define hypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) \ */ +/* { \ */ +/* hypre_newBoxLoopInit(ndim, loop_size); \ */ +/* hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ */ +/* hypre_printf("about to call BoxLoopfoall\n"); \ */ +/* BoxLoopforall( [=] (sycl::nd_item<1> item) \ */ +/* { \ */ +/* hypre_newBoxLoopDeclare(databox1); \ */ +/* hypre_BoxLoopIncK(1, databox1, i1); */ + +/* #define hypre_newBoxLoop1End(i1) \ */ +/* }, hypre__tot); \ */ +/* } */ + + + + + + +/* BoxLoop 1 */ +/* without the extra function call to BoxLoopforall */ +#define hypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ + const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); \ + const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(hypre__tot, "thread", bDim); \ + hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) \ + { \ + cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), [=] (sycl::nd_item<1> item) \ + { \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1); + +#define hypre_newBoxLoop1End(i1) \ + }); \ + }).wait_and_throw(); \ +} + + + + + +#define my_hypre_BoxLoop1Begin hypre_newBoxLoop1Begin +#define my_hypre_BoxLoop1End hypre_newBoxLoop1End + @@ -1429,8 +1592,11 @@ typedef struct hypre_Boxloop_struct #define hypre_BoxLoopBlock zypre_BoxLoopBlock #define 
hypre_BoxLoop0Begin zypre_newBoxLoop0Begin #define hypre_BoxLoop0End zypre_newBoxLoop0End -#define hypre_BoxLoop1Begin zypre_newBoxLoop1Begin -#define hypre_BoxLoop1End zypre_newBoxLoop1End +/* WM: replacing boxloops one at a time starting with boxloop1 */ +#define hypre_BoxLoop1Begin hypre_newBoxLoop1Begin +#define hypre_BoxLoop1End hypre_newBoxLoop1End +/* #define hypre_BoxLoop1Begin zypre_newBoxLoop1Begin */ +/* #define hypre_BoxLoop1End zypre_newBoxLoop1End */ #define hypre_BoxLoop2Begin zypre_newBoxLoop2Begin #define hypre_BoxLoop2End zypre_newBoxLoop2End #define hypre_BoxLoop3Begin zypre_newBoxLoop3Begin @@ -1440,11 +1606,12 @@ typedef struct hypre_Boxloop_struct #define hypre_BasicBoxLoop2Begin zypre_newBasicBoxLoop2Begin /* Reduction */ +/* WM: todo */ #define hypre_BoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, reducesum) \ - hypre_BoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) + zypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) #define hypre_BoxLoop1ReductionEnd(i1, reducesum) \ - hypre_BoxLoop1End(i1) + zypre_newBoxLoop1End(i1) #define hypre_BoxLoop2ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, \ dbox2, start2, stride2, i2, reducesum) \ diff --git a/src/struct_mv/boxloop_sycl.h b/src/struct_mv/boxloop_sycl.h index 0804d42fb7..f4dbd9eb63 100644 --- a/src/struct_mv/boxloop_sycl.h +++ b/src/struct_mv/boxloop_sycl.h @@ -27,6 +27,169 @@ typedef struct hypre_Boxloop_struct } hypre_Boxloop; +#ifdef __cplusplus +extern "C++" { +#endif + +/********************************************************************* + * forall function + *********************************************************************/ + +template +void +BoxLoopforall( LOOP_BODY loop_body, + HYPRE_Int length ) +{ + /* HYPRE_ExecutionPolicy exec_policy = hypre_HandleStructExecPolicy(hypre_handle()); */ + /* WM: TODO: uncomment above and remove below */ + HYPRE_ExecutionPolicy exec_policy = HYPRE_EXEC_DEVICE; + + if (exec_policy == 
HYPRE_EXEC_HOST) + { +/* WM: todo - is this really necessary, even? */ +/* #ifdef HYPRE_USING_OPENMP */ +/* #pragma omp parallel for HYPRE_SMP_SCHEDULE */ +/* #endif */ +/* for (HYPRE_Int idx = 0; idx < length; idx++) */ +/* { */ +/* loop_body(idx); */ +/* } */ + } + else if (exec_policy == HYPRE_EXEC_DEVICE) + { + const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); + const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); + + hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) + { + cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), loop_body); + }).wait_and_throw(); + } +} + +#ifdef __cplusplus +} +#endif + +/********************************************************************* + * Init/Declare/IncK etc. + *********************************************************************/ + +/* Get 1-D length of the loop, in hypre__tot */ +#define hypre_newBoxLoopInit(ndim, loop_size) \ + HYPRE_Int hypre__tot = 1; \ + for (HYPRE_Int hypre_d = 0; hypre_d < ndim; hypre_d ++) \ + { \ + hypre__tot *= loop_size[hypre_d]; \ + } + +/* Initialize struct for box-k */ +#define hypre_BoxLoopDataDeclareK(k, ndim, loop_size, dbox, start, stride) \ + hypre_Boxloop databox##k; \ + databox##k.lsize0 = loop_size[0]; \ + databox##k.strides0 = stride[0]; \ + databox##k.bstart0 = start[0] - dbox->imin[0]; \ + databox##k.bsize0 = dbox->imax[0] - dbox->imin[0]; \ + if (ndim > 1) \ + { \ + databox##k.lsize1 = loop_size[1]; \ + databox##k.strides1 = stride[1]; \ + databox##k.bstart1 = start[1] - dbox->imin[1]; \ + databox##k.bsize1 = dbox->imax[1] - dbox->imin[1]; \ + } \ + else \ + { \ + databox##k.lsize1 = 1; \ + databox##k.strides1 = 0; \ + databox##k.bstart1 = 0; \ + databox##k.bsize1 = 0; \ + } \ + if (ndim == 3) \ + { \ + databox##k.lsize2 = loop_size[2]; \ + databox##k.strides2 = stride[2]; \ + databox##k.bstart2 = start[2] - dbox->imin[2]; \ + databox##k.bsize2 = dbox->imax[2] - dbox->imin[2]; \ + } \ + else \ + { \ + 
databox##k.lsize2 = 1; \ + databox##k.strides2 = 0; \ + databox##k.bstart2 = 0; \ + databox##k.bsize2 = 0; \ + } + +/* Given input 1-D 'idx' in box, get 3-D 'local_idx' in loop_size */ +/* WM: todo - double check that item.get_local_id(0) is actually what you want below */ +#define hypre_newBoxLoopDeclare(box) \ + hypre_Index local_idx; \ + size_t idx_local = item.get_local_id(0); \ + hypre_IndexD(local_idx, 0) = idx_local % box.lsize0; \ + idx_local = idx_local / box.lsize0; \ + hypre_IndexD(local_idx, 1) = idx_local % box.lsize1; \ + idx_local = idx_local / box.lsize1; \ + hypre_IndexD(local_idx, 2) = idx_local % box.lsize2; \ + +/* Given input 3-D 'local_idx', get 1-D 'hypre__i' in 'box' */ +#define hypre_BoxLoopIncK(k, box, hypre__i) \ + HYPRE_Int hypre_boxD##k = 1; \ + HYPRE_Int hypre__i = 0; \ + hypre__i += (hypre_IndexD(local_idx, 0) * box.strides0 + box.bstart0) * hypre_boxD##k; \ + hypre_boxD##k *= hypre_max(0, box.bsize0 + 1); \ + hypre__i += (hypre_IndexD(local_idx, 1) * box.strides1 + box.bstart1) * hypre_boxD##k; \ + hypre_boxD##k *= hypre_max(0, box.bsize1 + 1); \ + hypre__i += (hypre_IndexD(local_idx, 2) * box.strides2 + box.bstart2) * hypre_boxD##k; \ + hypre_boxD##k *= hypre_max(0, box.bsize2 + 1); + + + +/* BoxLoop 1 */ +/* #define hypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) \ */ +/* { \ */ +/* hypre_newBoxLoopInit(ndim, loop_size); \ */ +/* hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ */ +/* hypre_printf("about to call BoxLoopfoall\n"); \ */ +/* BoxLoopforall( [=] (sycl::nd_item<1> item) \ */ +/* { \ */ +/* hypre_newBoxLoopDeclare(databox1); \ */ +/* hypre_BoxLoopIncK(1, databox1, i1); */ + +/* #define hypre_newBoxLoop1End(i1) \ */ +/* }, hypre__tot); \ */ +/* } */ + + + + + + +/* BoxLoop 1 */ +/* without the extra function call to BoxLoopforall */ +#define hypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + 
hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ + const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); \ + const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(hypre__tot, "thread", bDim); \ + hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) \ + { \ + cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), [=] (sycl::nd_item<1> item) \ + { \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1); + +#define hypre_newBoxLoop1End(i1) \ + }); \ + }).wait_and_throw(); \ +} + + + + + +#define my_hypre_BoxLoop1Begin hypre_newBoxLoop1Begin +#define my_hypre_BoxLoop1End hypre_newBoxLoop1End + @@ -277,8 +440,11 @@ typedef struct hypre_Boxloop_struct #define hypre_BoxLoopBlock zypre_BoxLoopBlock #define hypre_BoxLoop0Begin zypre_newBoxLoop0Begin #define hypre_BoxLoop0End zypre_newBoxLoop0End -#define hypre_BoxLoop1Begin zypre_newBoxLoop1Begin -#define hypre_BoxLoop1End zypre_newBoxLoop1End +/* WM: replacing boxloops one at a time starting with boxloop1 */ +#define hypre_BoxLoop1Begin hypre_newBoxLoop1Begin +#define hypre_BoxLoop1End hypre_newBoxLoop1End +/* #define hypre_BoxLoop1Begin zypre_newBoxLoop1Begin */ +/* #define hypre_BoxLoop1End zypre_newBoxLoop1End */ #define hypre_BoxLoop2Begin zypre_newBoxLoop2Begin #define hypre_BoxLoop2End zypre_newBoxLoop2End #define hypre_BoxLoop3Begin zypre_newBoxLoop3Begin @@ -288,11 +454,12 @@ typedef struct hypre_Boxloop_struct #define hypre_BasicBoxLoop2Begin zypre_newBasicBoxLoop2Begin /* Reduction */ +/* WM: todo */ #define hypre_BoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, reducesum) \ - hypre_BoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) + zypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) #define hypre_BoxLoop1ReductionEnd(i1, reducesum) \ - hypre_BoxLoop1End(i1) + zypre_newBoxLoop1End(i1) #define hypre_BoxLoop2ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, \ dbox2, start2, 
stride2, i2, reducesum) \ diff --git a/src/test/simple.c b/src/test/simple.c index 0649fef677..e8a953beed 100644 --- a/src/test/simple.c +++ b/src/test/simple.c @@ -13,166 +13,6 @@ HYPRE_Int AddValuesVector( hypre_StructGrid *gridvector, -/********************************************************************* - * put this in _hypre_utilities.hpp ? - * WM: todo - if you can wrap the basic parallel_for call for use elsewhere... - *********************************************************************/ -/* #define HYPRE_SYCL_1D_LAUNCH(kernel_name, gridsize, blocksize, ...) \ */ -/* { \ */ -/* if ( gridsize[0] == 0 || blocksize[0] == 0 ) \ */ -/* { \ */ -/* hypre_printf("Error %s %d: Invalid SYCL 1D launch parameters grid/block (%d) (%d)\n", \ */ -/* __FILE__, __LINE__, \ */ -/* gridsize[0], blocksize[0]); \ */ -/* assert(0); exit(1); \ */ -/* } \ */ -/* else \ */ -/* { \ */ -/* hypre_printf("WM: debug - inside BoxLoopforall(), submitting to queue\n"); \ */ -/* hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) \ */ -/* { \ */ -/* cgh.parallel_for(sycl::nd_range<1>(gridsize*blocksize, blocksize), [=] (sycl::nd_item<1> item) \ */ -/* { (kernel_name)(item, __VA_ARGS__); } ); \ */ -/* }).wait_and_throw(); \ */ -/* } \ */ -/* } */ - - - -#ifdef __cplusplus -extern "C++" { -#endif - -/********************************************************************* - * forall function - *********************************************************************/ - -template -void -BoxLoopforall( LOOP_BODY loop_body, - HYPRE_Int length ) -{ - /* HYPRE_ExecutionPolicy exec_policy = hypre_HandleStructExecPolicy(hypre_handle()); */ - /* WM: TODO: uncomment above and remove below */ - HYPRE_ExecutionPolicy exec_policy = HYPRE_EXEC_DEVICE; - - if (exec_policy == HYPRE_EXEC_HOST) - { -/* WM: todo - is this really necessary, even? 
*/ -/* #ifdef HYPRE_USING_OPENMP */ -/* #pragma omp parallel for HYPRE_SMP_SCHEDULE */ -/* #endif */ -/* for (HYPRE_Int idx = 0; idx < length; idx++) */ -/* { */ -/* loop_body(idx); */ -/* } */ - } - else if (exec_policy == HYPRE_EXEC_DEVICE) - { - const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); - const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); - - hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) - { - cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), loop_body); - }).wait_and_throw(); - } -} - -#ifdef __cplusplus -} -#endif - -/********************************************************************* - * Init/Declare/IncK etc. - *********************************************************************/ - -/* Get 1-D length of the loop, in hypre__tot */ -#define hypre_newBoxLoopInit(ndim, loop_size) \ - HYPRE_Int hypre__tot = 1; \ - for (HYPRE_Int hypre_d = 0; hypre_d < ndim; hypre_d ++) \ - { \ - hypre__tot *= loop_size[hypre_d]; \ - } - -/* Initialize struct for box-k */ -#define hypre_BoxLoopDataDeclareK(k, ndim, loop_size, dbox, start, stride) \ - hypre_Boxloop databox##k; \ - /* dim 0 */ \ - databox##k.lsize0 = loop_size[0]; \ - databox##k.strides0 = stride[0]; \ - databox##k.bstart0 = start[0] - dbox->imin[0]; \ - databox##k.bsize0 = dbox->imax[0] - dbox->imin[0]; \ - /* dim 1 */ \ - if (ndim > 1) \ - { \ - databox##k.lsize1 = loop_size[1]; \ - databox##k.strides1 = stride[1]; \ - databox##k.bstart1 = start[1] - dbox->imin[1]; \ - databox##k.bsize1 = dbox->imax[1] - dbox->imin[1]; \ - } \ - else \ - { \ - databox##k.lsize1 = 1; \ - databox##k.strides1 = 0; \ - databox##k.bstart1 = 0; \ - databox##k.bsize1 = 0; \ - } \ - /* dim 2 */ \ - if (ndim == 3) \ - { \ - databox##k.lsize2 = loop_size[2]; \ - databox##k.strides2 = stride[2]; \ - databox##k.bstart2 = start[2] - dbox->imin[2]; \ - databox##k.bsize2 = dbox->imax[2] - dbox->imin[2]; \ - } \ - else \ - { \ - databox##k.lsize2 = 1; \ 
- databox##k.strides2 = 0; \ - databox##k.bstart2 = 0; \ - databox##k.bsize2 = 0; \ - } - -/* Given input 1-D 'idx' in box, get 3-D 'local_idx' in loop_size */ -#define hypre_newBoxLoopDeclare(box) \ - hypre_Index local_idx; \ - size_t idx_local = item.get_local_id(0); \ - hypre_IndexD(local_idx, 0) = idx_local % box.lsize0; \ - idx_local = idx_local / box.lsize0; \ - hypre_IndexD(local_idx, 1) = idx_local % box.lsize1; \ - idx_local = idx_local / box.lsize1; \ - hypre_IndexD(local_idx, 2) = idx_local % box.lsize2; \ - -/* Given input 3-D 'local_idx', get 1-D 'hypre__i' in 'box' */ -#define hypre_BoxLoopIncK(k, box, hypre__i) \ - HYPRE_Int hypre_boxD##k = 1; \ - HYPRE_Int hypre__i = 0; \ - hypre__i += (hypre_IndexD(local_idx, 0) * box.strides0 + box.bstart0) * hypre_boxD##k; \ - hypre_boxD##k *= hypre_max(0, box.bsize0 + 1); \ - hypre__i += (hypre_IndexD(local_idx, 1) * box.strides1 + box.bstart1) * hypre_boxD##k; \ - hypre_boxD##k *= hypre_max(0, box.bsize1 + 1); \ - hypre__i += (hypre_IndexD(local_idx, 2) * box.strides2 + box.bstart2) * hypre_boxD##k; \ - hypre_boxD##k *= hypre_max(0, box.bsize2 + 1); - - - -/* BoxLoop 1 */ -#define hypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) \ -{ \ - hypre_newBoxLoopInit(ndim, loop_size); \ - hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ - BoxLoopforall( [=] (sycl::nd_item<1> item) \ - { \ - hypre_newBoxLoopDeclare(databox1); \ - hypre_BoxLoopIncK(1, databox1, i1); - -#define hypre_newBoxLoop1End(i1) \ - }, hypre__tot); \ -} - -#define my_hypre_BoxLoop1Begin hypre_newBoxLoop1Begin -#define my_hypre_BoxLoop1End hypre_newBoxLoop1End HYPRE_Int my_hypre_StructVectorSetConstantValues( hypre_StructVector *vector, @@ -208,14 +48,13 @@ my_hypre_StructVectorSetConstantValues( hypre_StructVector *vector, hypre_BoxGetSize(box, loop_size); - // WM: question - What's DEVICE_VAR? 
#define DEVICE_VAR is_device_ptr(vp) - my_hypre_BoxLoop1Begin(hypre_StructVectorNDim(vector), loop_size, + hypre_BoxLoop1Begin(hypre_StructVectorNDim(vector), loop_size, v_data_box, start, unit_stride, vi); { vp[vi] = values; } - my_hypre_BoxLoop1End(vi); + hypre_BoxLoop1End(vi); #undef DEVICE_VAR } @@ -274,6 +113,29 @@ my_hypre_StructAxpy( HYPRE_Complex alpha, } +/**************************** + * show device function copied from oneAPI examples + ****************************/ +#include +#include "dpc_common.hpp" + +void ShowDevice(sycl::queue &q) { + using namespace std; + using namespace sycl; + // Output platform and device information. + auto device = q.get_device(); + auto p_name = device.get_platform().get_info(); + cout << std::setw(20) << "Platform Name: " << p_name << "\n"; + auto p_version = device.get_platform().get_info(); + cout << std::setw(20) << "Platform Version: " << p_version << "\n"; + auto d_name = device.get_info(); + cout << std::setw(20) << "Device Name: " << d_name << "\n"; + auto max_work_group = device.get_info(); + cout << std::setw(20) << "Max Work Group: " << max_work_group << "\n"; + auto max_compute_units = device.get_info(); + cout << std::setw(20) << "Max Compute Units: " << max_compute_units << "\n\n"; +} + /**************************** * main ****************************/ @@ -282,6 +144,27 @@ hypre_int main( hypre_int argc, char *argv[] ) { + + /* initialize */ + /* hypre_MPI_Init(&argc, &argv); */ + /* HYPRE_Init(); */ + /* ShowDevice(*hypre_HandleComputeStream(hypre_handle())); */ + + + /* sycl::queue my_queue(sycl::default_selector{}, dpc_common::exception_handler); */ + /* ShowDevice(my_queue); */ + + /* sycl::device gpu = sycl::device(sycl::cpu_selector{}); */ + /* sycl::device dev; */ + /* hypre_printf("is_host = %d\n", gpu.is_host()); */ + /* hypre_printf("is_cpu = %d\n", gpu.is_cpu()); */ + /* hypre_printf("is_cpu = %d\n", dev.is_cpu()); */ + /* hypre_printf("is_gpu = %d\n", gpu.is_gpu()); */ + /* hypre_printf("DONE\n"); 
*/ + /* exit(0); */ + + + /* variables */ HYPRE_Int i, ix, iy, iz, ib; HYPRE_Int p, q, r; @@ -300,10 +183,10 @@ main( hypre_int argc, HYPRE_StructVector x; HYPRE_Int num_ghost[6] = {0, 0, 0, 0, 0, 0}; - dim = 1; + dim = 2; sym = 1; - nx = 1000; - ny = 1; + nx = 10; + ny = 10; nz = 1; bx = 1; by = 1; @@ -517,7 +400,11 @@ main( hypre_int argc, /* call set const */ - my_hypre_StructVectorSetConstantValues(x, 1.0); + my_hypre_StructVectorSetConstantValues(x, 5.0); + hypre_printf("my_hypre_StructVectorSetConstantValues() success!\n"); + + hypre_StructVectorSetConstantValues(x, 5.0); + hypre_printf("hypre_StructVectorSetConstantValues() success!\n"); /* call axpy */ /* my_hypre_StructAxpy(1.0, x, b); */ diff --git a/src/utilities/device_utils.c b/src/utilities/device_utils.c index f5dbdc07a1..78a136eeed 100644 --- a/src/utilities/device_utils.c +++ b/src/utilities/device_utils.c @@ -963,9 +963,11 @@ hypre_DeviceDataStream(hypre_DeviceData *data, HYPRE_Int i) } }; - sycl::device syclDev = data->device; - sycl::context syclctxt = sycl::context(syclDev, sycl_asynchandler); - stream = new sycl::queue(syclctxt, syclDev, sycl::property_list{sycl::property::queue::in_order{}}); + /* WM: having trouble with getting the device on frank, so temporarily just passing the default selector */ + /* sycl::device syclDev = data->device; */ + /* sycl::context syclctxt = sycl::context(syclDev, sycl_asynchandler); */ + /* stream = new sycl::queue(syclctxt, syclDev, sycl::property_list{sycl::property::queue::in_order{}}); */ + stream = new sycl::queue(sycl::default_selector{}, sycl::property_list{sycl::property::queue::in_order{}}); data->streams[i] = stream; } #endif @@ -1222,7 +1224,8 @@ hypre_DeviceDataCreate() hypre_DeviceData *data = hypre_CTAlloc(hypre_DeviceData, 1, HYPRE_MEMORY_HOST); #if defined(HYPRE_USING_SYCL) - hypre_DeviceDataDevice(data) = sycl::device(sycl::gpu_selector{}); + /* WM: commenting out for now since I'm having trouble finding the device on frank */ + /* 
 hypre_DeviceDataDevice(data) = sycl::device(sycl::gpu_selector{}); */
 #else
 hypre_DeviceDataDevice(data) = 0;
 #endif
@@ -1466,7 +1469,9 @@ hypre_bind_device( HYPRE_Int myid,
 hypre_MPI_Comm_free(&node_comm);
 
 /* get number of devices on this node */
- hypre_GetDeviceCount(&nDevices);
+ /* WM: doesn't work on frank... commenting out */
+ /* hypre_GetDeviceCount(&nDevices); */
+ nDevices = 1;
 
 /* set device */
 device_id = myNodeid % nDevices;
 
From 5695c978a44b04d649f2deb126dd6143d0542e58 Mon Sep 17 00:00:00 2001
From: Wayne Mitchell
Date: Wed, 29 Sep 2021 16:19:28 -0700
Subject: [PATCH 09/44] boxloop1 running on frank

I have fixed my compilation issues and can now run with my sycl boxloop1 implementation on frank's server machine. The boxloop1 code seems to be giving correct results as well, though it seems somewhere along the line I screwed up the struct solvers tests, which yield a discrepancy in number of iterations for the first solvers.jobs job.
---
 src/struct_mv/_hypre_struct_mv.hpp | 60 ++++++------------------
 src/struct_mv/boxloop_sycl.h | 52 ++++----------------
 src/test/Makefile | 3 +-
 src/test/simple.c | 19 ++++++----
 src/utilities/memory.c | 7 ++++
 5 files changed, 39 insertions(+), 102 deletions(-)

diff --git a/src/struct_mv/_hypre_struct_mv.hpp b/src/struct_mv/_hypre_struct_mv.hpp
index 0cc8ba2619..d1866014f1 100644
--- a/src/struct_mv/_hypre_struct_mv.hpp
+++ b/src/struct_mv/_hypre_struct_mv.hpp
@@ -1209,6 +1209,7 @@ BoxLoopforall( LOOP_BODY loop_body,
 }
 else if (exec_policy == HYPRE_EXEC_DEVICE)
 {
+ /* WM: question - is it better in sycl to launch parallel_for with blocks in this way as we do for cuda?
*/ const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); @@ -1275,7 +1276,7 @@ BoxLoopforall( LOOP_BODY loop_body, /* WM: todo - double check that item.get_local_id(0) is actually what you want below */ #define hypre_newBoxLoopDeclare(box) \ hypre_Index local_idx; \ - size_t idx_local = item.get_local_id(0); \ + HYPRE_Int idx_local = (HYPRE_Int) idx; \ hypre_IndexD(local_idx, 0) = idx_local % box.lsize0; \ idx_local = idx_local / box.lsize0; \ hypre_IndexD(local_idx, 1) = idx_local % box.lsize1; \ @@ -1296,66 +1297,27 @@ BoxLoopforall( LOOP_BODY loop_body, /* BoxLoop 1 */ -/* #define hypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) \ */ -/* { \ */ -/* hypre_newBoxLoopInit(ndim, loop_size); \ */ -/* hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ */ -/* hypre_printf("about to call BoxLoopfoall\n"); \ */ -/* BoxLoopforall( [=] (sycl::nd_item<1> item) \ */ -/* { \ */ -/* hypre_newBoxLoopDeclare(databox1); \ */ -/* hypre_BoxLoopIncK(1, databox1, i1); */ - -/* #define hypre_newBoxLoop1End(i1) \ */ -/* }, hypre__tot); \ */ -/* } */ - - - - - - -/* BoxLoop 1 */ -/* without the extra function call to BoxLoopforall */ #define hypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) \ { \ hypre_newBoxLoopInit(ndim, loop_size); \ hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ - const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); \ - const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(hypre__tot, "thread", bDim); \ - hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) \ + BoxLoopforall( [=] (sycl::nd_item<1> item) \ { \ - cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), [=] (sycl::nd_item<1> item) \ + size_t idx = item.get_global_linear_id(); \ + if (idx < hypre__tot) \ { \ hypre_newBoxLoopDeclare(databox1); \ hypre_BoxLoopIncK(1, databox1, i1); 
#define hypre_newBoxLoop1End(i1) \ - }); \ - }).wait_and_throw(); \ + } \ + }, hypre__tot); \ } -#define my_hypre_BoxLoop1Begin hypre_newBoxLoop1Begin -#define my_hypre_BoxLoop1End hypre_newBoxLoop1End - - - - - - - - - - - - - - - @@ -1593,10 +1555,10 @@ BoxLoopforall( LOOP_BODY loop_body, #define hypre_BoxLoop0Begin zypre_newBoxLoop0Begin #define hypre_BoxLoop0End zypre_newBoxLoop0End /* WM: replacing boxloops one at a time starting with boxloop1 */ -#define hypre_BoxLoop1Begin hypre_newBoxLoop1Begin -#define hypre_BoxLoop1End hypre_newBoxLoop1End -/* #define hypre_BoxLoop1Begin zypre_newBoxLoop1Begin */ -/* #define hypre_BoxLoop1End zypre_newBoxLoop1End */ +/* #define hypre_BoxLoop1Begin hypre_newBoxLoop1Begin */ +/* #define hypre_BoxLoop1End hypre_newBoxLoop1End */ +#define hypre_BoxLoop1Begin zypre_newBoxLoop1Begin +#define hypre_BoxLoop1End zypre_newBoxLoop1End #define hypre_BoxLoop2Begin zypre_newBoxLoop2Begin #define hypre_BoxLoop2End zypre_newBoxLoop2End #define hypre_BoxLoop3Begin zypre_newBoxLoop3Begin diff --git a/src/struct_mv/boxloop_sycl.h b/src/struct_mv/boxloop_sycl.h index f4dbd9eb63..e4ac919d90 100644 --- a/src/struct_mv/boxloop_sycl.h +++ b/src/struct_mv/boxloop_sycl.h @@ -57,6 +57,7 @@ BoxLoopforall( LOOP_BODY loop_body, } else if (exec_policy == HYPRE_EXEC_DEVICE) { + /* WM: question - is it better in sycl to launch parallel_for with blocks in this way as we do for cuda? 
*/ const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); @@ -123,7 +124,7 @@ BoxLoopforall( LOOP_BODY loop_body, /* WM: todo - double check that item.get_local_id(0) is actually what you want below */ #define hypre_newBoxLoopDeclare(box) \ hypre_Index local_idx; \ - size_t idx_local = item.get_local_id(0); \ + HYPRE_Int idx_local = (HYPRE_Int) idx; \ hypre_IndexD(local_idx, 0) = idx_local % box.lsize0; \ idx_local = idx_local / box.lsize0; \ hypre_IndexD(local_idx, 1) = idx_local % box.lsize1; \ @@ -144,66 +145,27 @@ BoxLoopforall( LOOP_BODY loop_body, /* BoxLoop 1 */ -/* #define hypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) \ */ -/* { \ */ -/* hypre_newBoxLoopInit(ndim, loop_size); \ */ -/* hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ */ -/* hypre_printf("about to call BoxLoopfoall\n"); \ */ -/* BoxLoopforall( [=] (sycl::nd_item<1> item) \ */ -/* { \ */ -/* hypre_newBoxLoopDeclare(databox1); \ */ -/* hypre_BoxLoopIncK(1, databox1, i1); */ - -/* #define hypre_newBoxLoop1End(i1) \ */ -/* }, hypre__tot); \ */ -/* } */ - - - - - - -/* BoxLoop 1 */ -/* without the extra function call to BoxLoopforall */ #define hypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) \ { \ hypre_newBoxLoopInit(ndim, loop_size); \ hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ - const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); \ - const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(hypre__tot, "thread", bDim); \ - hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) \ + BoxLoopforall( [=] (sycl::nd_item<1> item) \ { \ - cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), [=] (sycl::nd_item<1> item) \ + size_t idx = item.get_global_linear_id(); \ + if (idx < hypre__tot) \ { \ hypre_newBoxLoopDeclare(databox1); \ hypre_BoxLoopIncK(1, databox1, i1); #define 
hypre_newBoxLoop1End(i1) \ - }); \ - }).wait_and_throw(); \ + } \ + }, hypre__tot); \ } -#define my_hypre_BoxLoop1Begin hypre_newBoxLoop1Begin -#define my_hypre_BoxLoop1End hypre_newBoxLoop1End - - - - - - - - - - - - - - - diff --git a/src/test/Makefile b/src/test/Makefile index 5a4c606193..a05bb46feb 100644 --- a/src/test/Makefile +++ b/src/test/Makefile @@ -40,7 +40,8 @@ F77_COMPILE_FLAGS = \ MPILIBFLAGS = ${MPILIBDIRS} ${MPILIBS} ${MPIFLAGS} LAPACKLIBFLAGS = ${LAPACKLIBDIRS} ${LAPACKLIBS} BLASLIBFLAGS = ${BLASLIBDIRS} ${BLASLIBS} -LIBFLAGS = ${LDFLAGS} ${LIBS} +# WM: had to add the absolute path to libHYPRE.a for successful compilation on frank +LIBFLAGS = ${LDFLAGS} ${LIBS} ${HYPRE_BUILD_DIR}/lib/libHYPRE.a ifeq ($(notdir $(firstword ${LINK_CC})), nvcc) XLINK = -Xlinker=-rpath,${HYPRE_BUILD_DIR}/lib diff --git a/src/test/simple.c b/src/test/simple.c index e8a953beed..a52260e1df 100644 --- a/src/test/simple.c +++ b/src/test/simple.c @@ -15,7 +15,7 @@ HYPRE_Int AddValuesVector( hypre_StructGrid *gridvector, HYPRE_Int -my_hypre_StructVectorSetConstantValues( hypre_StructVector *vector, +cpu_hypre_StructVectorSetConstantValues( hypre_StructVector *vector, HYPRE_Complex values ) { hypre_Box *v_data_box; @@ -49,12 +49,12 @@ my_hypre_StructVectorSetConstantValues( hypre_StructVector *vector, hypre_BoxGetSize(box, loop_size); #define DEVICE_VAR is_device_ptr(vp) - hypre_BoxLoop1Begin(hypre_StructVectorNDim(vector), loop_size, + zypre_newBoxLoop1Begin(hypre_StructVectorNDim(vector), loop_size, v_data_box, start, unit_stride, vi); { vp[vi] = values; } - hypre_BoxLoop1End(vi); + zypre_newBoxLoop1End(vi); #undef DEVICE_VAR } @@ -96,7 +96,6 @@ my_hypre_StructAxpy( HYPRE_Complex alpha, hypre_BoxGetSize(box, loop_size); -/* WM: what is the DEVICE_VAR thing? 
*/ #define DEVICE_VAR is_device_ptr(yp,xp) /* WM: todo */ /* my_hypre_BoxLoop2Begin(hypre_StructVectorNDim(x), loop_size, */ @@ -183,11 +182,11 @@ main( hypre_int argc, HYPRE_StructVector x; HYPRE_Int num_ghost[6] = {0, 0, 0, 0, 0, 0}; - dim = 2; + dim = 3; sym = 1; nx = 10; ny = 10; - nz = 1; + nz = 10; bx = 1; by = 1; bz = 1; @@ -398,14 +397,20 @@ main( hypre_int argc, AddValuesVector(grid,x,periodic,1.0); HYPRE_StructVectorAssemble(x); + hypre_StructVector *y = hypre_StructVectorClone(x); + hypre_StructVectorPrint("before", x, 1); /* call set const */ - my_hypre_StructVectorSetConstantValues(x, 5.0); + cpu_hypre_StructVectorSetConstantValues(y, 5.0); hypre_printf("my_hypre_StructVectorSetConstantValues() success!\n"); + hypre_StructVectorPrint("after_cpu", y, 1); + hypre_StructVectorSetConstantValues(x, 5.0); hypre_printf("hypre_StructVectorSetConstantValues() success!\n"); + hypre_StructVectorPrint("after_gpu", x, 1); + /* call axpy */ /* my_hypre_StructAxpy(1.0, x, b); */ diff --git a/src/utilities/memory.c b/src/utilities/memory.c index ece79f5d68..55df0f6aa3 100644 --- a/src/utilities/memory.c +++ b/src/utilities/memory.c @@ -109,6 +109,7 @@ hypre_UnifiedMemset(void *ptr, HYPRE_Int value, size_t num) static inline void hypre_UnifiedMemPrefetch(void *ptr, size_t size, hypre_MemoryLocation location) { + /* hypre_printf("WM: debug - inside UnifiedMemPrefetch\n"); */ #if defined(HYPRE_USING_GPU) #ifdef HYPRE_DEBUG hypre_MemoryLocation tmp; @@ -244,6 +245,7 @@ hypre_DeviceMalloc(size_t size, HYPRE_Int zeroinit) static inline void * hypre_UnifiedMalloc(size_t size, HYPRE_Int zeroinit) { + /* hypre_printf("WM: debug - inside UnifiedMalloc\n"); */ void *ptr = NULL; #if defined(HYPRE_USING_UMPIRE_UM) @@ -268,6 +270,7 @@ hypre_UnifiedMalloc(size_t size, HYPRE_Int zeroinit) #if defined(HYPRE_USING_SYCL) HYPRE_SYCL_CALL( ptr = (void *)sycl::malloc_shared(size, *(hypre_HandleComputeStream(hypre_handle()))) ); + /* hypre_printf("WM: debug - did the sycl shared 
allocation\n"); */ #endif #endif /* #if defined(HYPRE_USING_UMPIRE_UM) */ @@ -275,6 +278,7 @@ hypre_UnifiedMalloc(size_t size, HYPRE_Int zeroinit) /* prefecth to device */ if (ptr) { + /* hypre_printf("WM: debug - about to prefetch\n"); */ hypre_UnifiedMemPrefetch(ptr, size, hypre_MEMORY_DEVICE); } @@ -969,6 +973,7 @@ hypre_GetExecPolicy2(HYPRE_MemoryLocation location1, HYPRE_Int hypre_GetPointerLocation(const void *ptr, hypre_MemoryLocation *memory_location) { + /* hypre_printf("WM: debug - inside GetPointerLocation\n"); */ HYPRE_Int ierr = 0; #if defined(HYPRE_USING_GPU) @@ -1071,6 +1076,7 @@ hypre_GetPointerLocation(const void *ptr, hypre_MemoryLocation *memory_location) sycl::usm::alloc allocType; allocType = sycl::get_pointer_type(ptr, (hypre_HandleComputeStream(hypre_handle()))->get_context()); + /* hypre_printf("WM: debug - checking allocType\n"); */ if (allocType == sycl::usm::alloc::unknown) { *memory_location = hypre_MEMORY_HOST; @@ -1086,6 +1092,7 @@ hypre_GetPointerLocation(const void *ptr, hypre_MemoryLocation *memory_location) else if (allocType == sycl::usm::alloc::shared) { *memory_location = hypre_MEMORY_UNIFIED; + /* hypre_printf("WM: debug - IS UNIFIED MEMORY\n"); */ } #endif //HYPRE_USING_SYCL From f4d9ba405b09ee9c4ce6d1af56fec7c57873e0bc Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Wed, 29 Sep 2021 16:56:32 -0700 Subject: [PATCH 10/44] Resolve further merge conflicts, passes struct tests --- src/utilities/_hypre_utilities.h | 2 +- src/utilities/_hypre_utilities.hpp | 2 +- src/utilities/device_utils.h | 2 +- src/utilities/handle.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/utilities/_hypre_utilities.h b/src/utilities/_hypre_utilities.h index f66d302115..7faf7f9a1d 100644 --- a/src/utilities/_hypre_utilities.h +++ b/src/utilities/_hypre_utilities.h @@ -1285,7 +1285,7 @@ typedef struct #define hypre_HandleStructCommRecvBufferSize(hypre_handle) 
hypre_DeviceDataStructCommRecvBufferSize(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleStructCommSendBufferSize(hypre_handle) hypre_DeviceDataStructCommSendBufferSize(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleSpgemmUseCusparse(hypre_handle) hypre_DeviceDataSpgemmUseCusparse(hypre_HandleDeviceData(hypre_handle)) -#define hypre_HandleSpgemmNumPasses(hypre_handle) hypre_DeviceDataSpgemmNumPasses(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleSpgemmAlgorithm(hypre_handle) hypre_DeviceDataSpgemmAlgorithm(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleSpgemmRownnzEstimateMethod(hypre_handle) hypre_DeviceDataSpgemmRownnzEstimateMethod(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleSpgemmRownnzEstimateNsamples(hypre_handle) hypre_DeviceDataSpgemmRownnzEstimateNsamples(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleSpgemmRownnzEstimateMultFactor(hypre_handle) hypre_DeviceDataSpgemmRownnzEstimateMultFactor(hypre_HandleDeviceData(hypre_handle)) diff --git a/src/utilities/_hypre_utilities.hpp b/src/utilities/_hypre_utilities.hpp index bc609bdc3d..61e8ae0998 100644 --- a/src/utilities/_hypre_utilities.hpp +++ b/src/utilities/_hypre_utilities.hpp @@ -320,7 +320,7 @@ struct hypre_DeviceData #define hypre_DeviceDataStructCommRecvBufferSize(data) ((data) -> struct_comm_recv_buffer_size) #define hypre_DeviceDataStructCommSendBufferSize(data) ((data) -> struct_comm_send_buffer_size) #define hypre_DeviceDataSpgemmUseCusparse(data) ((data) -> spgemm_use_cusparse) -#define hypre_DeviceDataSpgemmAlgorithm(data) ((data) -> spgemm_num_passes) +#define hypre_DeviceDataSpgemmAlgorithm(data) ((data) -> spgemm_algorithm) #define hypre_DeviceDataSpgemmRownnzEstimateMethod(data) ((data) -> spgemm_rownnz_estimate_method) #define hypre_DeviceDataSpgemmRownnzEstimateNsamples(data) ((data) -> spgemm_rownnz_estimate_nsamples) #define hypre_DeviceDataSpgemmRownnzEstimateMultFactor(data) ((data) -> 
spgemm_rownnz_estimate_mult_factor) diff --git a/src/utilities/device_utils.h b/src/utilities/device_utils.h index 41c6c41659..3483361926 100644 --- a/src/utilities/device_utils.h +++ b/src/utilities/device_utils.h @@ -268,7 +268,7 @@ struct hypre_DeviceData #define hypre_DeviceDataStructCommRecvBufferSize(data) ((data) -> struct_comm_recv_buffer_size) #define hypre_DeviceDataStructCommSendBufferSize(data) ((data) -> struct_comm_send_buffer_size) #define hypre_DeviceDataSpgemmUseCusparse(data) ((data) -> spgemm_use_cusparse) -#define hypre_DeviceDataSpgemmAlgorithm(data) ((data) -> spgemm_num_passes) +#define hypre_DeviceDataSpgemmAlgorithm(data) ((data) -> spgemm_algorithm) #define hypre_DeviceDataSpgemmRownnzEstimateMethod(data) ((data) -> spgemm_rownnz_estimate_method) #define hypre_DeviceDataSpgemmRownnzEstimateNsamples(data) ((data) -> spgemm_rownnz_estimate_nsamples) #define hypre_DeviceDataSpgemmRownnzEstimateMultFactor(data) ((data) -> spgemm_rownnz_estimate_mult_factor) diff --git a/src/utilities/handle.h b/src/utilities/handle.h index c49fa5fdf3..8e5979c7a2 100644 --- a/src/utilities/handle.h +++ b/src/utilities/handle.h @@ -74,7 +74,7 @@ typedef struct #define hypre_HandleStructCommRecvBufferSize(hypre_handle) hypre_DeviceDataStructCommRecvBufferSize(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleStructCommSendBufferSize(hypre_handle) hypre_DeviceDataStructCommSendBufferSize(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleSpgemmUseCusparse(hypre_handle) hypre_DeviceDataSpgemmUseCusparse(hypre_HandleDeviceData(hypre_handle)) -#define hypre_HandleSpgemmNumPasses(hypre_handle) hypre_DeviceDataSpgemmNumPasses(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleSpgemmAlgorithm(hypre_handle) hypre_DeviceDataSpgemmAlgorithm(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleSpgemmRownnzEstimateMethod(hypre_handle) hypre_DeviceDataSpgemmRownnzEstimateMethod(hypre_HandleDeviceData(hypre_handle)) #define 
hypre_HandleSpgemmRownnzEstimateNsamples(hypre_handle) hypre_DeviceDataSpgemmRownnzEstimateNsamples(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleSpgemmRownnzEstimateMultFactor(hypre_handle) hypre_DeviceDataSpgemmRownnzEstimateMultFactor(hypre_HandleDeviceData(hypre_handle)) From 845a433bd44bc2fca5c493f625f6098eaf63555c Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Thu, 30 Sep 2021 10:53:29 -0700 Subject: [PATCH 11/44] Non-reduction boxloops done The non-reduction boxloops are all in and pass the struct tests. Performance is VERY slow, but this may just be due to the machine I am running on. Reduction boxloops are in progress. --- src/struct_ls/_hypre_struct_ls.h | 2 +- src/struct_ls/protos.h | 5 + src/struct_mv/_hypre_struct_mv.hpp | 310 +++++++++++++++++++++++++++-- src/struct_mv/boxloop_sycl.h | 306 ++++++++++++++++++++++++++-- 4 files changed, 580 insertions(+), 43 deletions(-) diff --git a/src/struct_ls/_hypre_struct_ls.h b/src/struct_ls/_hypre_struct_ls.h index f8a753e6ee..4078385df0 100644 --- a/src/struct_ls/_hypre_struct_ls.h +++ b/src/struct_ls/_hypre_struct_ls.h @@ -450,9 +450,9 @@ HYPRE_Int hypre_SparseMSGSetupRAPOp ( hypre_StructMatrix *R , hypre_StructMatrix /* sparse_msg_solve.c */ HYPRE_Int hypre_SparseMSGSolve ( void *smsg_vdata , hypre_StructMatrix *A , hypre_StructVector *b , hypre_StructVector *x ); - #ifdef __cplusplus } #endif #endif + diff --git a/src/struct_ls/protos.h b/src/struct_ls/protos.h index 67540ac062..a7187c4016 100644 --- a/src/struct_ls/protos.h +++ b/src/struct_ls/protos.h @@ -5,6 +5,11 @@ * SPDX-License-Identifier: (Apache-2.0 OR MIT) ******************************************************************************/ +/* coarsen.c */ +HYPRE_Int hypre_StructMapFineToCoarse ( hypre_Index findex , hypre_Index index , hypre_Index stride , hypre_Index cindex ); +HYPRE_Int hypre_StructMapCoarseToFine ( hypre_Index cindex , hypre_Index index , hypre_Index stride , hypre_Index findex ); +HYPRE_Int hypre_StructCoarsen ( 
hypre_StructGrid *fgrid , hypre_Index index , hypre_Index stride , HYPRE_Int prune , hypre_StructGrid **cgrid_ptr ); + /* cyclic_reduction.c */ void *hypre_CyclicReductionCreate ( MPI_Comm comm ); hypre_StructMatrix *hypre_CycRedCreateCoarseOp ( hypre_StructMatrix *A , hypre_StructGrid *coarse_grid , HYPRE_Int cdir ); diff --git a/src/struct_mv/_hypre_struct_mv.hpp b/src/struct_mv/_hypre_struct_mv.hpp index d1866014f1..de88e1b0cd 100644 --- a/src/struct_mv/_hypre_struct_mv.hpp +++ b/src/struct_mv/_hypre_struct_mv.hpp @@ -1184,7 +1184,7 @@ extern "C++" { #endif /********************************************************************* - * forall function + * wrapper functions calling sycl parallel_for *********************************************************************/ template @@ -1220,6 +1220,59 @@ BoxLoopforall( LOOP_BODY loop_body, } } +template +void +ReductionBoxLoopforall( HYPRE_Int length, + REDUCER &reducer, + LOOP_BODY loop_body ) +{ + if (length <= 0) + { + return; + } + + /* HYPRE_ExecutionPolicy exec_policy = hypre_HandleStructExecPolicy(hypre_handle()); */ + /* WM: TODO: uncomment above and remove below */ + HYPRE_ExecutionPolicy exec_policy = HYPRE_EXEC_DEVICE; + + if (exec_policy == HYPRE_EXEC_HOST) + { + /* WM: todo - is this really necessary, even? */ + /* for (HYPRE_Int idx = 0; idx < length; idx++) */ + /* { */ + /* loop_body(idx, reducer); */ + /* } */ + } + else if (exec_policy == HYPRE_EXEC_DEVICE) + { + /* WM: question - is it better in sycl to launch parallel_for with blocks in this way as we do for cuda? 
*/ + /* NOTE: in the cuda version, there is further manipulation of bDim and gDim that I don't include here */ + const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); + const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); + + reducer.nblocks = gDim.size(); + + hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) + { + cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), loop_body); + }).wait_and_throw(); + } +} + + + + + + + + + + + + + + + #ifdef __cplusplus } #endif @@ -1272,11 +1325,46 @@ BoxLoopforall( LOOP_BODY loop_body, databox##k.bsize2 = 0; \ } +#define hypre_BasicBoxLoopDataDeclareK(k,ndim,loop_size,stride) \ +hypre_Boxloop databox##k; \ +databox##k.lsize0 = loop_size[0]; \ +databox##k.strides0 = stride[0]; \ +databox##k.bstart0 = 0; \ +databox##k.bsize0 = 0; \ +if (ndim > 1) \ +{ \ + databox##k.lsize1 = loop_size[1]; \ + databox##k.strides1 = stride[1]; \ + databox##k.bstart1 = 0; \ + databox##k.bsize1 = 0; \ +} \ +else \ +{ \ + databox##k.lsize1 = 1; \ + databox##k.strides1 = 0; \ + databox##k.bstart1 = 0; \ + databox##k.bsize1 = 0; \ +} \ +if (ndim == 3) \ +{ \ + databox##k.lsize2 = loop_size[2]; \ + databox##k.strides2 = stride[2]; \ + databox##k.bstart2 = 0; \ + databox##k.bsize2 = 0; \ +} \ +else \ +{ \ + databox##k.lsize2 = 1; \ + databox##k.strides2 = 0; \ + databox##k.bstart2 = 0; \ + databox##k.bsize2 = 0; \ +} + /* Given input 1-D 'idx' in box, get 3-D 'local_idx' in loop_size */ /* WM: todo - double check that item.get_local_id(0) is actually what you want below */ #define hypre_newBoxLoopDeclare(box) \ hypre_Index local_idx; \ - HYPRE_Int idx_local = (HYPRE_Int) idx; \ + HYPRE_Int idx_local = idx; \ hypre_IndexD(local_idx, 0) = idx_local % box.lsize0; \ idx_local = idx_local / box.lsize0; \ hypre_IndexD(local_idx, 1) = idx_local % box.lsize1; \ @@ -1294,8 +1382,30 @@ BoxLoopforall( LOOP_BODY loop_body, hypre__i += (hypre_IndexD(local_idx, 2) * box.strides2 + box.bstart2) * 
hypre_boxD##k; \ hypre_boxD##k *= hypre_max(0, box.bsize2 + 1); +/* get 3-D local_idx into 'index' */ +#define hypre_BoxLoopGetIndex(index) \ + index[0] = hypre_IndexD(local_idx, 0); \ + index[1] = hypre_IndexD(local_idx, 1); \ + index[2] = hypre_IndexD(local_idx, 2); + + + +/********************************************************************* + * Boxloops + *********************************************************************/ +/* BoxLoop 0 */ +#define hypre_newBoxLoop0Begin(ndim, loop_size) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + BoxLoopforall(hypre__tot, HYPRE_LAMBDA (HYPRE_Int idx) \ + { + +#define hypre_newBoxLoop0End() \ + }); \ +} + /* BoxLoop 1 */ #define hypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) \ { \ @@ -1303,7 +1413,7 @@ BoxLoopforall( LOOP_BODY loop_body, hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ BoxLoopforall( [=] (sycl::nd_item<1> item) \ { \ - size_t idx = item.get_global_linear_id(); \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ { \ hypre_newBoxLoopDeclare(databox1); \ @@ -1314,6 +1424,147 @@ BoxLoopforall( LOOP_BODY loop_body, }, hypre__tot); \ } +/* BoxLoop 2 */ +#define hypre_newBoxLoop2Begin(ndim, loop_size, dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ + hypre_BoxLoopDataDeclareK(2, ndim, loop_size, dbox2, start2, stride2); \ + BoxLoopforall( [=] (sycl::nd_item<1> item) \ + { \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + if (idx < hypre__tot) \ + { \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1); \ + hypre_BoxLoopIncK(2, databox2, i2); + +#define hypre_newBoxLoop2End(i1, i2) \ + } \ + }, hypre__tot); \ +} + +/* BoxLoop 3 */ +#define hypre_newBoxLoop3Begin(ndim, loop_size, dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, \ + dbox3, start3, 
stride3, i3) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + hypre_BoxLoopDataDeclareK(1, ndim,loop_size, dbox1, start1, stride1); \ + hypre_BoxLoopDataDeclareK(2, ndim,loop_size, dbox2, start2, stride2); \ + hypre_BoxLoopDataDeclareK(3, ndim,loop_size, dbox3, start3, stride3); \ + BoxLoopforall( [=] (sycl::nd_item<1> item) \ + { \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + if (idx < hypre__tot) \ + { \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1); \ + hypre_BoxLoopIncK(2, databox2, i2); \ + hypre_BoxLoopIncK(3, databox3, i3); + +#define hypre_newBoxLoop3End(i1, i2, i3) \ + } \ + }, hypre__tot); \ +} + +/* BoxLoop 4 */ +#define hypre_newBoxLoop4Begin(ndim, loop_size, dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, \ + dbox3, start3, stride3, i3, \ + dbox4, start4, stride4, i4) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ + hypre_BoxLoopDataDeclareK(2, ndim, loop_size, dbox2, start2, stride2); \ + hypre_BoxLoopDataDeclareK(3, ndim, loop_size, dbox3, start3, stride3); \ + hypre_BoxLoopDataDeclareK(4, ndim, loop_size, dbox4, start4, stride4); \ + BoxLoopforall( [=] (sycl::nd_item<1> item) \ + { \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + if (idx < hypre__tot) \ + { \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1); \ + hypre_BoxLoopIncK(2, databox2, i2); \ + hypre_BoxLoopIncK(3, databox3, i3); \ + hypre_BoxLoopIncK(4, databox4, i4); + +#define hypre_newBoxLoop4End(i1, i2, i3, i4) \ + } \ + }, hypre__tot); \ +} + + +/* Basic BoxLoops have no boxes */ +/* BoxLoop 1 */ +#define hypre_newBasicBoxLoop1Begin(ndim, loop_size, stride1, i1) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + hypre_BasicBoxLoopDataDeclareK(1, ndim, loop_size, stride1); \ + BoxLoopforall( [=] (sycl::nd_item<1> item) \ + { \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + if (idx < 
hypre__tot) \ + { \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1); + +/* BoxLoop 2 */ +#define hypre_newBasicBoxLoop2Begin(ndim, loop_size, stride1, i1, stride2, i2) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + hypre_BasicBoxLoopDataDeclareK(1, ndim, loop_size, stride1); \ + hypre_BasicBoxLoopDataDeclareK(2, ndim, loop_size, stride2); \ + BoxLoopforall( [=] (sycl::nd_item<1> item) \ + { \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + if (idx < hypre__tot) \ + { \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1); \ + hypre_BoxLoopIncK(2, databox2, i2); + + +/* Reduction BoxLoop1 */ +#define hypre_newBoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, reducesum) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ + BoxLoopforall( [=] (sycl::nd_item<1> item, decltype(reducesum) &reducesum) \ + { \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1);\ + const HYPRE_Int thread_id = (HYPRE_Int) item.get_global_linear_id();\ + const HYPRE_Int n_threads = (HYPRE_Int) item.get_global_range().size();\ + for (HYPRE_Int idx = thread_id; idx < length; idx += n_threads)\ + { + +#define hypre_newBoxLoop1ReductionEnd(i1, reducesum) \ + reducer.BlockReduce();\ + } \ + }, hypre__tot); \ +} + +/* Reduction BoxLoop2 */ +#define hypre_newBoxLoop2ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, reducesum) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ + hypre_BoxLoopDataDeclareK(2, ndim, loop_size, dbox2, start2, stride2); \ + ReductionBoxLoopforall(hypre__tot, reducesum, HYPRE_LAMBDA (HYPRE_Int idx, decltype(reducesum) &reducesum) \ + { \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1); \ + hypre_BoxLoopIncK(2, databox2, i2); + +#define 
hypre_newBoxLoop2ReductionEnd(i1, i2, reducesum) \ + }); \ +} + + @@ -1549,26 +1800,41 @@ BoxLoopforall( LOOP_BODY loop_body, } \ } -#define hypre_BoxLoopGetIndex zypre_BoxLoopGetIndex -#define hypre_BoxLoopBlock zypre_BoxLoopBlock -#define hypre_BoxLoop0Begin zypre_newBoxLoop0Begin -#define hypre_BoxLoop0End zypre_newBoxLoop0End -/* WM: replacing boxloops one at a time starting with boxloop1 */ -/* #define hypre_BoxLoop1Begin hypre_newBoxLoop1Begin */ -/* #define hypre_BoxLoop1End hypre_newBoxLoop1End */ -#define hypre_BoxLoop1Begin zypre_newBoxLoop1Begin -#define hypre_BoxLoop1End zypre_newBoxLoop1End -#define hypre_BoxLoop2Begin zypre_newBoxLoop2Begin -#define hypre_BoxLoop2End zypre_newBoxLoop2End -#define hypre_BoxLoop3Begin zypre_newBoxLoop3Begin -#define hypre_BoxLoop3End zypre_newBoxLoop3End -#define hypre_BoxLoop4Begin zypre_newBoxLoop4Begin -#define hypre_BoxLoop4End zypre_newBoxLoop4End -#define hypre_BasicBoxLoop2Begin zypre_newBasicBoxLoop2Begin + + + + + + + + + + + + +/********************************************************************* + * renamings + *********************************************************************/ + +#define hypre_BoxLoopBlock() 0 + +#define hypre_BoxLoop0Begin hypre_newBoxLoop0Begin +#define hypre_BoxLoop0End hypre_newBoxLoop0End +#define hypre_BoxLoop1Begin hypre_newBoxLoop1Begin +#define hypre_BoxLoop1End hypre_newBoxLoop1End +#define hypre_BoxLoop2Begin hypre_newBoxLoop2Begin +#define hypre_BoxLoop2End hypre_newBoxLoop2End +#define hypre_BoxLoop3Begin hypre_newBoxLoop3Begin +#define hypre_BoxLoop3End hypre_newBoxLoop3End +#define hypre_BoxLoop4Begin hypre_newBoxLoop4Begin +#define hypre_BoxLoop4End hypre_newBoxLoop4End + +#define hypre_BasicBoxLoop1Begin hypre_newBasicBoxLoop1Begin +#define hypre_BasicBoxLoop2Begin hypre_newBasicBoxLoop2Begin /* Reduction */ -/* WM: todo */ +/* WM: todo - using CPU version for now */ #define hypre_BoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, reducesum) \ 
zypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) @@ -1577,11 +1843,11 @@ BoxLoopforall( LOOP_BODY loop_body, #define hypre_BoxLoop2ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, \ dbox2, start2, stride2, i2, reducesum) \ - hypre_BoxLoop2Begin(ndim, loop_size, dbox1, start1, stride1, i1, \ + zypre_newBoxLoop2Begin(ndim, loop_size, dbox1, start1, stride1, i1, \ dbox2, start2, stride2, i2) #define hypre_BoxLoop2ReductionEnd(i1, i2, reducesum) \ - hypre_BoxLoop2End(i1, i2) + zypre_newBoxLoop2End(i1, i2) #endif diff --git a/src/struct_mv/boxloop_sycl.h b/src/struct_mv/boxloop_sycl.h index e4ac919d90..b2cb757231 100644 --- a/src/struct_mv/boxloop_sycl.h +++ b/src/struct_mv/boxloop_sycl.h @@ -32,7 +32,7 @@ extern "C++" { #endif /********************************************************************* - * forall function + * wrapper functions calling sycl parallel_for *********************************************************************/ template @@ -68,6 +68,59 @@ BoxLoopforall( LOOP_BODY loop_body, } } +template +void +ReductionBoxLoopforall( HYPRE_Int length, + REDUCER &reducer, + LOOP_BODY loop_body ) +{ + if (length <= 0) + { + return; + } + + /* HYPRE_ExecutionPolicy exec_policy = hypre_HandleStructExecPolicy(hypre_handle()); */ + /* WM: TODO: uncomment above and remove below */ + HYPRE_ExecutionPolicy exec_policy = HYPRE_EXEC_DEVICE; + + if (exec_policy == HYPRE_EXEC_HOST) + { + /* WM: todo - is this really necessary, even? */ + /* for (HYPRE_Int idx = 0; idx < length; idx++) */ + /* { */ + /* loop_body(idx, reducer); */ + /* } */ + } + else if (exec_policy == HYPRE_EXEC_DEVICE) + { + /* WM: question - is it better in sycl to launch parallel_for with blocks in this way as we do for cuda? 
*/ + /* NOTE: in the cuda version, there is further manipulation of bDim and gDim that I don't include here */ + const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); + const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); + + reducer.nblocks = gDim.size(); + + hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) + { + cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), loop_body); + }).wait_and_throw(); + } +} + + + + + + + + + + + + + + + #ifdef __cplusplus } #endif @@ -120,11 +173,46 @@ BoxLoopforall( LOOP_BODY loop_body, databox##k.bsize2 = 0; \ } +#define hypre_BasicBoxLoopDataDeclareK(k,ndim,loop_size,stride) \ +hypre_Boxloop databox##k; \ +databox##k.lsize0 = loop_size[0]; \ +databox##k.strides0 = stride[0]; \ +databox##k.bstart0 = 0; \ +databox##k.bsize0 = 0; \ +if (ndim > 1) \ +{ \ + databox##k.lsize1 = loop_size[1]; \ + databox##k.strides1 = stride[1]; \ + databox##k.bstart1 = 0; \ + databox##k.bsize1 = 0; \ +} \ +else \ +{ \ + databox##k.lsize1 = 1; \ + databox##k.strides1 = 0; \ + databox##k.bstart1 = 0; \ + databox##k.bsize1 = 0; \ +} \ +if (ndim == 3) \ +{ \ + databox##k.lsize2 = loop_size[2]; \ + databox##k.strides2 = stride[2]; \ + databox##k.bstart2 = 0; \ + databox##k.bsize2 = 0; \ +} \ +else \ +{ \ + databox##k.lsize2 = 1; \ + databox##k.strides2 = 0; \ + databox##k.bstart2 = 0; \ + databox##k.bsize2 = 0; \ +} + /* Given input 1-D 'idx' in box, get 3-D 'local_idx' in loop_size */ /* WM: todo - double check that item.get_local_id(0) is actually what you want below */ #define hypre_newBoxLoopDeclare(box) \ hypre_Index local_idx; \ - HYPRE_Int idx_local = (HYPRE_Int) idx; \ + HYPRE_Int idx_local = idx; \ hypre_IndexD(local_idx, 0) = idx_local % box.lsize0; \ idx_local = idx_local / box.lsize0; \ hypre_IndexD(local_idx, 1) = idx_local % box.lsize1; \ @@ -142,7 +230,29 @@ BoxLoopforall( LOOP_BODY loop_body, hypre__i += (hypre_IndexD(local_idx, 2) * box.strides2 + box.bstart2) * 
hypre_boxD##k; \ hypre_boxD##k *= hypre_max(0, box.bsize2 + 1); +/* get 3-D local_idx into 'index' */ +#define hypre_BoxLoopGetIndex(index) \ + index[0] = hypre_IndexD(local_idx, 0); \ + index[1] = hypre_IndexD(local_idx, 1); \ + index[2] = hypre_IndexD(local_idx, 2); + + + +/********************************************************************* + * Boxloops + *********************************************************************/ + + +/* BoxLoop 0 */ +#define hypre_newBoxLoop0Begin(ndim, loop_size) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + BoxLoopforall(hypre__tot, HYPRE_LAMBDA (HYPRE_Int idx) \ + { +#define hypre_newBoxLoop0End() \ + }); \ +} /* BoxLoop 1 */ #define hypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) \ @@ -151,7 +261,7 @@ BoxLoopforall( LOOP_BODY loop_body, hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ BoxLoopforall( [=] (sycl::nd_item<1> item) \ { \ - size_t idx = item.get_global_linear_id(); \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ { \ hypre_newBoxLoopDeclare(databox1); \ @@ -162,6 +272,147 @@ BoxLoopforall( LOOP_BODY loop_body, }, hypre__tot); \ } +/* BoxLoop 2 */ +#define hypre_newBoxLoop2Begin(ndim, loop_size, dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ + hypre_BoxLoopDataDeclareK(2, ndim, loop_size, dbox2, start2, stride2); \ + BoxLoopforall( [=] (sycl::nd_item<1> item) \ + { \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + if (idx < hypre__tot) \ + { \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1); \ + hypre_BoxLoopIncK(2, databox2, i2); + +#define hypre_newBoxLoop2End(i1, i2) \ + } \ + }, hypre__tot); \ +} + +/* BoxLoop 3 */ +#define hypre_newBoxLoop3Begin(ndim, loop_size, dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, \ + dbox3, start3, stride3, 
i3) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + hypre_BoxLoopDataDeclareK(1, ndim,loop_size, dbox1, start1, stride1); \ + hypre_BoxLoopDataDeclareK(2, ndim,loop_size, dbox2, start2, stride2); \ + hypre_BoxLoopDataDeclareK(3, ndim,loop_size, dbox3, start3, stride3); \ + BoxLoopforall( [=] (sycl::nd_item<1> item) \ + { \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + if (idx < hypre__tot) \ + { \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1); \ + hypre_BoxLoopIncK(2, databox2, i2); \ + hypre_BoxLoopIncK(3, databox3, i3); + +#define hypre_newBoxLoop3End(i1, i2, i3) \ + } \ + }, hypre__tot); \ +} + +/* BoxLoop 4 */ +#define hypre_newBoxLoop4Begin(ndim, loop_size, dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, \ + dbox3, start3, stride3, i3, \ + dbox4, start4, stride4, i4) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ + hypre_BoxLoopDataDeclareK(2, ndim, loop_size, dbox2, start2, stride2); \ + hypre_BoxLoopDataDeclareK(3, ndim, loop_size, dbox3, start3, stride3); \ + hypre_BoxLoopDataDeclareK(4, ndim, loop_size, dbox4, start4, stride4); \ + BoxLoopforall( [=] (sycl::nd_item<1> item) \ + { \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + if (idx < hypre__tot) \ + { \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1); \ + hypre_BoxLoopIncK(2, databox2, i2); \ + hypre_BoxLoopIncK(3, databox3, i3); \ + hypre_BoxLoopIncK(4, databox4, i4); + +#define hypre_newBoxLoop4End(i1, i2, i3, i4) \ + } \ + }, hypre__tot); \ +} + + +/* Basic BoxLoops have no boxes */ +/* BoxLoop 1 */ +#define hypre_newBasicBoxLoop1Begin(ndim, loop_size, stride1, i1) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + hypre_BasicBoxLoopDataDeclareK(1, ndim, loop_size, stride1); \ + BoxLoopforall( [=] (sycl::nd_item<1> item) \ + { \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + if (idx < hypre__tot) \ + 
{ \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1); + +/* BoxLoop 2 */ +#define hypre_newBasicBoxLoop2Begin(ndim, loop_size, stride1, i1, stride2, i2) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + hypre_BasicBoxLoopDataDeclareK(1, ndim, loop_size, stride1); \ + hypre_BasicBoxLoopDataDeclareK(2, ndim, loop_size, stride2); \ + BoxLoopforall( [=] (sycl::nd_item<1> item) \ + { \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + if (idx < hypre__tot) \ + { \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1); \ + hypre_BoxLoopIncK(2, databox2, i2); + + +/* Reduction BoxLoop1 */ +#define hypre_newBoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, reducesum) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ + BoxLoopforall( [=] (sycl::nd_item<1> item, decltype(reducesum) &reducesum) \ + { \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1);\ + const HYPRE_Int thread_id = (HYPRE_Int) item.get_global_linear_id();\ + const HYPRE_Int n_threads = (HYPRE_Int) item.get_global_range().size();\ + for (HYPRE_Int idx = thread_id; idx < length; idx += n_threads)\ + { + +#define hypre_newBoxLoop1ReductionEnd(i1, reducesum) \ + reducer.BlockReduce();\ + } \ + }, hypre__tot); \ +} + +/* Reduction BoxLoop2 */ +#define hypre_newBoxLoop2ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, reducesum) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ + hypre_BoxLoopDataDeclareK(2, ndim, loop_size, dbox2, start2, stride2); \ + ReductionBoxLoopforall(hypre__tot, reducesum, HYPRE_LAMBDA (HYPRE_Int idx, decltype(reducesum) &reducesum) \ + { \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1); \ + hypre_BoxLoopIncK(2, databox2, i2); + +#define 
hypre_newBoxLoop2ReductionEnd(i1, i2, reducesum) \ + }); \ +} + + @@ -397,26 +648,41 @@ BoxLoopforall( LOOP_BODY loop_body, } \ } -#define hypre_BoxLoopGetIndex zypre_BoxLoopGetIndex -#define hypre_BoxLoopBlock zypre_BoxLoopBlock -#define hypre_BoxLoop0Begin zypre_newBoxLoop0Begin -#define hypre_BoxLoop0End zypre_newBoxLoop0End -/* WM: replacing boxloops one at a time starting with boxloop1 */ + + + + + + + + + + + + +/********************************************************************* + * renamings + *********************************************************************/ + +#define hypre_BoxLoopBlock() 0 + +#define hypre_BoxLoop0Begin hypre_newBoxLoop0Begin +#define hypre_BoxLoop0End hypre_newBoxLoop0End #define hypre_BoxLoop1Begin hypre_newBoxLoop1Begin #define hypre_BoxLoop1End hypre_newBoxLoop1End -/* #define hypre_BoxLoop1Begin zypre_newBoxLoop1Begin */ -/* #define hypre_BoxLoop1End zypre_newBoxLoop1End */ -#define hypre_BoxLoop2Begin zypre_newBoxLoop2Begin -#define hypre_BoxLoop2End zypre_newBoxLoop2End -#define hypre_BoxLoop3Begin zypre_newBoxLoop3Begin -#define hypre_BoxLoop3End zypre_newBoxLoop3End -#define hypre_BoxLoop4Begin zypre_newBoxLoop4Begin -#define hypre_BoxLoop4End zypre_newBoxLoop4End -#define hypre_BasicBoxLoop2Begin zypre_newBasicBoxLoop2Begin +#define hypre_BoxLoop2Begin hypre_newBoxLoop2Begin +#define hypre_BoxLoop2End hypre_newBoxLoop2End +#define hypre_BoxLoop3Begin hypre_newBoxLoop3Begin +#define hypre_BoxLoop3End hypre_newBoxLoop3End +#define hypre_BoxLoop4Begin hypre_newBoxLoop4Begin +#define hypre_BoxLoop4End hypre_newBoxLoop4End + +#define hypre_BasicBoxLoop1Begin hypre_newBasicBoxLoop1Begin +#define hypre_BasicBoxLoop2Begin hypre_newBasicBoxLoop2Begin /* Reduction */ -/* WM: todo */ +/* WM: todo - using CPU version for now */ #define hypre_BoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, reducesum) \ zypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) @@ -425,10 +691,10 @@ BoxLoopforall( LOOP_BODY 
loop_body, #define hypre_BoxLoop2ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, \ dbox2, start2, stride2, i2, reducesum) \ - hypre_BoxLoop2Begin(ndim, loop_size, dbox1, start1, stride1, i1, \ + zypre_newBoxLoop2Begin(ndim, loop_size, dbox1, start1, stride1, i1, \ dbox2, start2, stride2, i2) #define hypre_BoxLoop2ReductionEnd(i1, i2, reducesum) \ - hypre_BoxLoop2End(i1, i2) + zypre_newBoxLoop2End(i1, i2) #endif From 4ed00c48458dfeca2f5d8738c56a3908fc878139 Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Thu, 30 Sep 2021 17:34:24 -0700 Subject: [PATCH 12/44] First attempt at reduction boxloops, seg faulting right now --- src/struct_ls/pfmg_setup.c | 43 +++++++++++++----- src/struct_mv/_hypre_struct_mv.hpp | 71 ++++++++--------------------- src/struct_mv/boxloop_sycl.h | 73 +++++++++--------------------- src/utilities/device_utils.h | 6 +++ 4 files changed, 79 insertions(+), 114 deletions(-) diff --git a/src/struct_ls/pfmg_setup.c b/src/struct_ls/pfmg_setup.c index b3db006d6e..fbf91d16d3 100644 --- a/src/struct_ls/pfmg_setup.c +++ b/src/struct_ls/pfmg_setup.c @@ -809,18 +809,23 @@ hypre_PFMGComputeDxyz( hypre_StructMatrix *A, switch (stencil_size) { case 5: + hypre_printf("WM: debug - stencil size = 5\n"); hypre_PFMGComputeDxyz_SS5 (i, A, cxyz, sqcxyz); break; case 9: + hypre_printf("WM: debug - stencil size = 9\n"); hypre_PFMGComputeDxyz_SS9 (i, A, cxyz, sqcxyz); break; case 7: + hypre_printf("WM: debug - stencil size = 7\n"); hypre_PFMGComputeDxyz_SS7 (i, A, cxyz, sqcxyz); break; case 19: + hypre_printf("WM: debug - stencil size = 19\n"); hypre_PFMGComputeDxyz_SS19(i, A, cxyz, sqcxyz); break; case 27: + hypre_printf("WM: debug - stencil size = 27\n"); hypre_PFMGComputeDxyz_SS27(i, A, cxyz, sqcxyz); break; default: @@ -1051,50 +1056,66 @@ hypre_PFMGComputeDxyz_SS5( HYPRE_Int bi, hypre_SetIndex3(index, 0, 1, 0); a_cn = hypre_StructMatrixExtractPointerByIndex(A, bi, index); - // FIXME TODO HOW TO DO KOKKOS IN ONE BOXLOOP ? 
-#if defined(HYPRE_USING_KOKKOS) + // FIXME TODO HOW TO DO KOKKOS (WM: and SYCL) IN ONE BOXLOOP ? +#if defined(HYPRE_USING_KOKKOS) || defined(HYPRE_USING_SYCL) HYPRE_Real cxb = cxyz[0]; - hypre_BoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size, + hypre_newBoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size, A_dbox, start, stride, Ai, cxb) { HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcx = -diag * (a_cw[Ai] + a_ce[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcx; +#else cxb += tcx; +#endif } - hypre_BoxLoop1ReductionEnd(Ai, cxb) + hypre_newBoxLoop1ReductionEnd(Ai, cxb) HYPRE_Real cyb = cxyz[1]; - hypre_BoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size, + hypre_newBoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size, A_dbox, start, stride, Ai, cyb) { HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcy = -diag * (a_cn[Ai] + a_cs[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcy; +#else cyb += tcy; +#endif } - hypre_BoxLoop1ReductionEnd(Ai, cyb) + hypre_newBoxLoop1ReductionEnd(Ai, cyb) HYPRE_Real sqcxb = sqcxyz[0]; - hypre_BoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size, + hypre_newBoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size, A_dbox, start, stride, Ai, sqcxb) { HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcx = -diag * (a_cw[Ai] + a_ce[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcx * tcx; +#else sqcxb += tcx * tcx; +#endif } - hypre_BoxLoop1ReductionEnd(Ai, sqcxb) + hypre_newBoxLoop1ReductionEnd(Ai, sqcxb) HYPRE_Real sqcyb = sqcxyz[1]; - hypre_BoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size, + hypre_newBoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size, A_dbox, start, stride, Ai, sqcyb) { HYPRE_Real diag = a_cc[Ai] < 0.0 ? 
-1.0 : 1.0; HYPRE_Real tcy = -diag * (a_cn[Ai] + a_cs[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcy * tcy; +#else sqcyb += tcy * tcy; +#endif } - hypre_BoxLoop1ReductionEnd(Ai, sqcyb) + hypre_newBoxLoop1ReductionEnd(Ai, sqcyb) -#else /* kokkos */ +#else // #if defined(HYPRE_USING_KOKKOS) || defined(HYPRE_USING_SYCL) #if defined(HYPRE_USING_RAJA) ReduceSum cxb(cxyz[0]),cyb(cxyz[1]),sqcxb(sqcxyz[0]),sqcyb(sqcxyz[1]); diff --git a/src/struct_mv/_hypre_struct_mv.hpp b/src/struct_mv/_hypre_struct_mv.hpp index de88e1b0cd..13982b41cb 100644 --- a/src/struct_mv/_hypre_struct_mv.hpp +++ b/src/struct_mv/_hypre_struct_mv.hpp @@ -1220,11 +1220,11 @@ BoxLoopforall( LOOP_BODY loop_body, } } -template +template void -ReductionBoxLoopforall( HYPRE_Int length, - REDUCER &reducer, - LOOP_BODY loop_body ) +ReductionBoxLoopforall( LOOP_BODY loop_body, + HYPRE_Int length, + sycl::buffer sum_buf ) { if (length <= 0) { @@ -1250,11 +1250,10 @@ ReductionBoxLoopforall( HYPRE_Int length, const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); - reducer.nblocks = gDim.size(); - hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) { - cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), loop_body); + sycl::accessor sum_acc(sum_buf, cgh, sycl::read_write); + cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), sycl::ONEAPI::reduction(sum_acc, sycl::ONEAPI::plus<>()), loop_body); }).wait_and_throw(); } } @@ -1394,18 +1393,6 @@ else \ * Boxloops *********************************************************************/ - -/* BoxLoop 0 */ -#define hypre_newBoxLoop0Begin(ndim, loop_size) \ -{ \ - hypre_newBoxLoopInit(ndim, loop_size); \ - BoxLoopforall(hypre__tot, HYPRE_LAMBDA (HYPRE_Int idx) \ - { - -#define hypre_newBoxLoop0End() \ - }); \ -} - /* BoxLoop 1 */ #define hypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) \ { \ @@ -1528,40 +1515,24 @@ else \ /* 
Reduction BoxLoop1 */ -#define hypre_newBoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, reducesum) \ +/* WM: todo - is there a better way to handle the passing of sum_var (the variable where we want the reduction to end up)? */ +/* Right now, it is hardcoded as a HYPRE_Real */ +#define hypre_newBoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, sum_var) \ { \ - hypre_newBoxLoopInit(ndim, loop_size); \ - hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ - BoxLoopforall( [=] (sycl::nd_item<1> item, decltype(reducesum) &reducesum) \ + hypre_newBoxLoopInit(ndim, loop_size); \ + hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ + sycl::buffer sum_buf(&sum_var, 1); \ + ReductionBoxLoopforall( [=] (sycl::nd_item<1> item, auto &sum) \ { \ - hypre_newBoxLoopDeclare(databox1); \ - hypre_BoxLoopIncK(1, databox1, i1);\ - const HYPRE_Int thread_id = (HYPRE_Int) item.get_global_linear_id();\ - const HYPRE_Int n_threads = (HYPRE_Int) item.get_global_range().size();\ - for (HYPRE_Int idx = thread_id; idx < length; idx += n_threads)\ - { + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + if (idx < hypre__tot) \ + { \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1); -#define hypre_newBoxLoop1ReductionEnd(i1, reducesum) \ - reducer.BlockReduce();\ +#define hypre_newBoxLoop1ReductionEnd(i1, sum_var) \ } \ - }, hypre__tot); \ -} - -/* Reduction BoxLoop2 */ -#define hypre_newBoxLoop2ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, \ - dbox2, start2, stride2, i2, reducesum) \ -{ \ - hypre_newBoxLoopInit(ndim, loop_size); \ - hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ - hypre_BoxLoopDataDeclareK(2, ndim, loop_size, dbox2, start2, stride2); \ - ReductionBoxLoopforall(hypre__tot, reducesum, HYPRE_LAMBDA (HYPRE_Int idx, decltype(reducesum) &reducesum) \ - { \ - hypre_newBoxLoopDeclare(databox1); \ - hypre_BoxLoopIncK(1, databox1, 
i1); \ - hypre_BoxLoopIncK(2, databox2, i2); - -#define hypre_newBoxLoop2ReductionEnd(i1, i2, reducesum) \ - }); \ + }, hypre__tot, sum_buf); \ } @@ -1819,8 +1790,6 @@ else \ #define hypre_BoxLoopBlock() 0 -#define hypre_BoxLoop0Begin hypre_newBoxLoop0Begin -#define hypre_BoxLoop0End hypre_newBoxLoop0End #define hypre_BoxLoop1Begin hypre_newBoxLoop1Begin #define hypre_BoxLoop1End hypre_newBoxLoop1End #define hypre_BoxLoop2Begin hypre_newBoxLoop2Begin diff --git a/src/struct_mv/boxloop_sycl.h b/src/struct_mv/boxloop_sycl.h index b2cb757231..0a4fae81f0 100644 --- a/src/struct_mv/boxloop_sycl.h +++ b/src/struct_mv/boxloop_sycl.h @@ -68,11 +68,11 @@ BoxLoopforall( LOOP_BODY loop_body, } } -template +template void -ReductionBoxLoopforall( HYPRE_Int length, - REDUCER &reducer, - LOOP_BODY loop_body ) +ReductionBoxLoopforall( LOOP_BODY loop_body, + HYPRE_Int length, + sycl::buffer sum_buf ) { if (length <= 0) { @@ -98,11 +98,10 @@ ReductionBoxLoopforall( HYPRE_Int length, const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); - reducer.nblocks = gDim.size(); - hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) { - cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), loop_body); + sycl::accessor sum_acc(sum_buf, cgh, sycl::read_write); + cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), sycl::ONEAPI::reduction(sum_acc, sycl::ONEAPI::plus<>()), loop_body); }).wait_and_throw(); } } @@ -242,18 +241,6 @@ else \ * Boxloops *********************************************************************/ - -/* BoxLoop 0 */ -#define hypre_newBoxLoop0Begin(ndim, loop_size) \ -{ \ - hypre_newBoxLoopInit(ndim, loop_size); \ - BoxLoopforall(hypre__tot, HYPRE_LAMBDA (HYPRE_Int idx) \ - { - -#define hypre_newBoxLoop0End() \ - }); \ -} - /* BoxLoop 1 */ #define hypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) \ { \ @@ -376,40 +363,24 @@ else \ /* Reduction 
BoxLoop1 */ -#define hypre_newBoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, reducesum) \ +/* WM: todo - is there a better way to handle the passing of sum_var (the variable where we want the reduction to end up)? */ +/* Right now, it is hardcoded as a HYPRE_Real */ +#define hypre_newBoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, sum_var) \ { \ - hypre_newBoxLoopInit(ndim, loop_size); \ - hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ - BoxLoopforall( [=] (sycl::nd_item<1> item, decltype(reducesum) &reducesum) \ + hypre_newBoxLoopInit(ndim, loop_size); \ + hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ + sycl::buffer sum_buf(&sum_var, 1); \ + ReductionBoxLoopforall( [=] (sycl::nd_item<1> item, auto &sum) \ { \ - hypre_newBoxLoopDeclare(databox1); \ - hypre_BoxLoopIncK(1, databox1, i1);\ - const HYPRE_Int thread_id = (HYPRE_Int) item.get_global_linear_id();\ - const HYPRE_Int n_threads = (HYPRE_Int) item.get_global_range().size();\ - for (HYPRE_Int idx = thread_id; idx < length; idx += n_threads)\ - { - -#define hypre_newBoxLoop1ReductionEnd(i1, reducesum) \ - reducer.BlockReduce();\ - } \ - }, hypre__tot); \ -} + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + if (idx < hypre__tot) \ + { \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1); -/* Reduction BoxLoop2 */ -#define hypre_newBoxLoop2ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, \ - dbox2, start2, stride2, i2, reducesum) \ -{ \ - hypre_newBoxLoopInit(ndim, loop_size); \ - hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ - hypre_BoxLoopDataDeclareK(2, ndim, loop_size, dbox2, start2, stride2); \ - ReductionBoxLoopforall(hypre__tot, reducesum, HYPRE_LAMBDA (HYPRE_Int idx, decltype(reducesum) &reducesum) \ - { \ - hypre_newBoxLoopDeclare(databox1); \ - hypre_BoxLoopIncK(1, databox1, i1); \ - hypre_BoxLoopIncK(2, databox2, i2); - -#define 
hypre_newBoxLoop2ReductionEnd(i1, i2, reducesum) \ - }); \ +#define hypre_newBoxLoop1ReductionEnd(i1, sum_var) \ + } \ + }, hypre__tot, sum_buf); \ } @@ -667,8 +638,6 @@ else \ #define hypre_BoxLoopBlock() 0 -#define hypre_BoxLoop0Begin hypre_newBoxLoop0Begin -#define hypre_BoxLoop0End hypre_newBoxLoop0End #define hypre_BoxLoop1Begin hypre_newBoxLoop1Begin #define hypre_BoxLoop1End hypre_newBoxLoop1End #define hypre_BoxLoop2Begin hypre_newBoxLoop2Begin diff --git a/src/utilities/device_utils.h b/src/utilities/device_utils.h index 3483361926..9d95075c3e 100644 --- a/src/utilities/device_utils.h +++ b/src/utilities/device_utils.h @@ -54,6 +54,7 @@ #elif defined(HYPRE_USING_SYCL) #include +/* WM: todo - include below as necessary */ /* #include */ /* #include */ /* #include */ @@ -226,6 +227,7 @@ struct hypre_DeviceData #endif #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) + /* WM: question - what is the device_allocator? */ hypre_device_allocator device_allocator; #endif #if defined(HYPRE_USING_SYCL) @@ -346,6 +348,8 @@ struct hypre_GpuMatData #endif //#if defined(HYPRE_USING_GPU) +/* WM: todo - is this how I want to integrate the functionality below? Do I really need all this? */ +/* NOTE: It doesn't line up that nicely with the cuda/hip implementation since you need to pass item agrs */ #if defined(HYPRE_USING_SYCL) /* return the number of work-items in current work-group */ template @@ -506,6 +510,8 @@ using namespace thrust::placeholders; #define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); } #elif defined(HYPRE_USING_HIP) #define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_HIP_CALL( hipGetLastError() ); } +#elif defined(HYPRE_USING_SYCL) +/* WM: todo? 
used below in HYPRE_CUDA_LAUNCH2 */ #endif #else // #if defined(HYPRE_DEBUG) #define GPU_LAUNCH_SYNC From 2fb3f27fd3b8766a1bf077d32f4587b3abe84c03 Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Mon, 4 Oct 2021 17:04:05 -0700 Subject: [PATCH 13/44] Reproducing seg fault when trying to launch trivial reduction parallel_for --- src/test/Makefile | 2 +- src/test/simple.c | 49 +++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 44 insertions(+), 7 deletions(-) diff --git a/src/test/Makefile b/src/test/Makefile index f7f5d5431d..b5910211c6 100644 --- a/src/test/Makefile +++ b/src/test/Makefile @@ -140,7 +140,7 @@ ij: ij.o ${LINK_CC} -o $@ $< ${LFLAGS} # WM: TODO: remove -simple: simple.o +simple: simple.obj @echo "Building" $@ "... " ${LINK_CC} -o $@ $< ${LFLAGS} diff --git a/src/test/simple.c b/src/test/simple.c index a52260e1df..e385aefe69 100644 --- a/src/test/simple.c +++ b/src/test/simple.c @@ -1,7 +1,5 @@ /* WM: todo - remove this file from git */ -#include "_hypre_utilities.h" -#include "_hypre_utilities.hpp" #include "HYPRE.h" #include "_hypre_struct_mv.h" #include "_hypre_struct_mv.hpp" @@ -145,10 +143,49 @@ main( hypre_int argc, { /* initialize */ - /* hypre_MPI_Init(&argc, &argv); */ - /* HYPRE_Init(); */ + hypre_MPI_Init(&argc, &argv); + HYPRE_Init(); /* ShowDevice(*hypre_HandleComputeStream(hypre_handle())); */ + HYPRE_Int length = 1000; + const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); + const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); + HYPRE_Real *arr = hypre_CTAlloc(HYPRE_Real, length, HYPRE_MEMORY_DEVICE); + HYPRE_Real sum_var = 0; + sycl::buffer sum_buf(&sum_var, 1); + + /* Reduction parallel_for with accessor */ + std::cout << "Launching parallel_for reduction with accessor" << std::endl; + hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) + { + sycl::accessor sum_acc(sum_buf, cgh, sycl::read_write); + + cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, 
bDim), sycl::ONEAPI::reduction(sum_acc, sycl::ONEAPI::plus<>()), + [=] (sycl::nd_item<1> item, auto &sum) + { + /* trivial kernel */ + }); + }).wait_and_throw(); + + + + + HYPRE_Real *sum_var_usm = hypre_CTAlloc(HYPRE_Real, 1, HYPRE_MEMORY_DEVICE); + + /* Reduction parallel_for with unified memory pointer */ + std::cout << "Launching parallel_for reduction with unified memory pointer" << std::endl; + hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) + { + cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), sycl::ONEAPI::reduction(sum_var_usm, sycl::ONEAPI::plus<>()), + [=] (sycl::nd_item<1> item, auto &sum) + { + /* trivial kernel */ + }); + }).wait_and_throw(); + + + + /* sycl::queue my_queue(sycl::default_selector{}, dpc_common::exception_handler); */ /* ShowDevice(my_queue); */ @@ -159,8 +196,8 @@ main( hypre_int argc, /* hypre_printf("is_cpu = %d\n", gpu.is_cpu()); */ /* hypre_printf("is_cpu = %d\n", dev.is_cpu()); */ /* hypre_printf("is_gpu = %d\n", gpu.is_gpu()); */ - /* hypre_printf("DONE\n"); */ - /* exit(0); */ + hypre_printf("DONE\n"); + exit(0); From 001fb9f5d15258034923aba09a909370e25118c4 Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Tue, 5 Oct 2021 16:40:50 -0700 Subject: [PATCH 14/44] Reduction boxloops done The reduction boxloops are implemented and pass the struct solvers.sh tests. Cleanup of boxloop_sycl.h. 
--- src/struct_ls/HYPRE_struct_int.c | 4 +- src/struct_ls/pfmg_setup.c | 127 ++++++++++-- src/struct_mv/_hypre_struct_mv.hpp | 33 ++- src/struct_mv/boxloop_sycl.h | 320 ++++------------------------- src/struct_mv/struct_innerprod.c | 6 +- src/test/simple.c | 156 +++++++++++--- src/utilities/_hypre_utilities.hpp | 14 +- src/utilities/device_utils.c | 13 +- src/utilities/device_utils.h | 3 +- src/utilities/headers | 5 + 10 files changed, 332 insertions(+), 349 deletions(-) diff --git a/src/struct_ls/HYPRE_struct_int.c b/src/struct_ls/HYPRE_struct_int.c index abb1869fcd..e9048acbf7 100644 --- a/src/struct_ls/HYPRE_struct_int.c +++ b/src/struct_ls/HYPRE_struct_int.c @@ -71,9 +71,7 @@ hypre_StructVectorSetRandomValues( hypre_StructVector *vector, hypre_BoxLoop1Begin(hypre_StructVectorNDim(vector), loop_size, v_data_box, start, unit_stride, vi); { -// WM: TODO: temporary fix... remove after sycl implementation is done -#if defined(HYPRE_USING_SYCL) -#elif defined(HYPRE_USING_GPU) +#if defined(HYPRE_USING_GPU) vp[vi] = rand_device[idx]; #else vp[vi] = 2.0*hypre_Rand() - 1.0; diff --git a/src/struct_ls/pfmg_setup.c b/src/struct_ls/pfmg_setup.c index fbf91d16d3..684824f26a 100644 --- a/src/struct_ls/pfmg_setup.c +++ b/src/struct_ls/pfmg_setup.c @@ -809,23 +809,18 @@ hypre_PFMGComputeDxyz( hypre_StructMatrix *A, switch (stencil_size) { case 5: - hypre_printf("WM: debug - stencil size = 5\n"); hypre_PFMGComputeDxyz_SS5 (i, A, cxyz, sqcxyz); break; case 9: - hypre_printf("WM: debug - stencil size = 9\n"); hypre_PFMGComputeDxyz_SS9 (i, A, cxyz, sqcxyz); break; case 7: - hypre_printf("WM: debug - stencil size = 7\n"); hypre_PFMGComputeDxyz_SS7 (i, A, cxyz, sqcxyz); break; case 19: - hypre_printf("WM: debug - stencil size = 19\n"); hypre_PFMGComputeDxyz_SS19(i, A, cxyz, sqcxyz); break; case 27: - hypre_printf("WM: debug - stencil size = 27\n"); hypre_PFMGComputeDxyz_SS27(i, A, cxyz, sqcxyz); break; default: @@ -1060,7 +1055,7 @@ hypre_PFMGComputeDxyz_SS5( HYPRE_Int bi, #if 
defined(HYPRE_USING_KOKKOS) || defined(HYPRE_USING_SYCL) HYPRE_Real cxb = cxyz[0]; - hypre_newBoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size, + hypre_BoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size, A_dbox, start, stride, Ai, cxb) { HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; @@ -1071,10 +1066,10 @@ hypre_PFMGComputeDxyz_SS5( HYPRE_Int bi, cxb += tcx; #endif } - hypre_newBoxLoop1ReductionEnd(Ai, cxb) + hypre_BoxLoop1ReductionEnd(Ai, cxb) HYPRE_Real cyb = cxyz[1]; - hypre_newBoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size, + hypre_BoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size, A_dbox, start, stride, Ai, cyb) { HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; @@ -1085,10 +1080,10 @@ hypre_PFMGComputeDxyz_SS5( HYPRE_Int bi, cyb += tcy; #endif } - hypre_newBoxLoop1ReductionEnd(Ai, cyb) + hypre_BoxLoop1ReductionEnd(Ai, cyb) HYPRE_Real sqcxb = sqcxyz[0]; - hypre_newBoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size, + hypre_BoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size, A_dbox, start, stride, Ai, sqcxb) { HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; @@ -1099,10 +1094,10 @@ hypre_PFMGComputeDxyz_SS5( HYPRE_Int bi, sqcxb += tcx * tcx; #endif } - hypre_newBoxLoop1ReductionEnd(Ai, sqcxb) + hypre_BoxLoop1ReductionEnd(Ai, sqcxb) HYPRE_Real sqcyb = sqcxyz[1]; - hypre_newBoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size, + hypre_BoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size, A_dbox, start, stride, Ai, sqcyb) { HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; @@ -1113,7 +1108,7 @@ hypre_PFMGComputeDxyz_SS5( HYPRE_Int bi, sqcyb += tcy * tcy; #endif } - hypre_newBoxLoop1ReductionEnd(Ai, sqcyb) + hypre_BoxLoop1ReductionEnd(Ai, sqcyb) #else // #if defined(HYPRE_USING_KOKKOS) || defined(HYPRE_USING_SYCL) @@ -1262,7 +1257,7 @@ hypre_PFMGComputeDxyz_SS9( HYPRE_Int bi, a_cne = hypre_StructMatrixExtractPointerByIndex(A, bi, index); // FIXME TODO HOW TO DO KOKKOS IN ONE BOXLOOP ? 
-#if defined(HYPRE_USING_KOKKOS) +#if defined(HYPRE_USING_KOKKOS) || defined(HYPRE_USING_SYCL) HYPRE_Real cxb = cxyz[0]; hypre_BoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size, @@ -1270,7 +1265,11 @@ hypre_PFMGComputeDxyz_SS9( HYPRE_Int bi, { HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcx = -diag * (a_cw[Ai] + a_ce[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcx; +#else cxb += tcx; +#endif } hypre_BoxLoop1ReductionEnd(Ai, cxb) @@ -1280,7 +1279,11 @@ hypre_PFMGComputeDxyz_SS9( HYPRE_Int bi, { HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcy = -diag * (a_cs[Ai] + a_cn[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcy; +#else cyb += tcy; +#endif } hypre_BoxLoop1ReductionEnd(Ai, cyb) @@ -1290,7 +1293,11 @@ hypre_PFMGComputeDxyz_SS9( HYPRE_Int bi, { HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcx = -diag * (a_cw[Ai] + a_ce[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcx*tcx; +#else sqcxb += tcx*tcx; +#endif } hypre_BoxLoop1ReductionEnd(Ai, sqcxb) @@ -1300,7 +1307,11 @@ hypre_PFMGComputeDxyz_SS9( HYPRE_Int bi, { HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcy = -diag * (a_cs[Ai] + a_cn[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcy*tcy; +#else sqcyb += tcy*tcy; +#endif } hypre_BoxLoop1ReductionEnd(Ai, sqcyb) @@ -1437,7 +1448,7 @@ hypre_PFMGComputeDxyz_SS7( HYPRE_Int bi, a_bc = hypre_StructMatrixExtractPointerByIndex(A, bi, index); // FIXME TODO HOW TO DO KOKKOS IN ONE BOXLOOP ? -#if defined(HYPRE_USING_KOKKOS) +#if defined(HYPRE_USING_KOKKOS) || defined(HYPRE_USING_SYCL) HYPRE_Real cxb = cxyz[0]; hypre_BoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size, @@ -1445,7 +1456,11 @@ hypre_PFMGComputeDxyz_SS7( HYPRE_Int bi, { HYPRE_Real diag = a_cc[Ai] < 0.0 ? 
-1.0 : 1.0; HYPRE_Real tcx = -diag * (a_cw[Ai] + a_ce[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcx; +#else cxb += tcx; +#endif } hypre_BoxLoop1ReductionEnd(Ai, cxb) @@ -1455,7 +1470,11 @@ hypre_PFMGComputeDxyz_SS7( HYPRE_Int bi, { HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcy = -diag * (a_cs[Ai] + a_cn[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcy; +#else cyb += tcy; +#endif } hypre_BoxLoop1ReductionEnd(Ai, cyb) @@ -1465,7 +1484,11 @@ hypre_PFMGComputeDxyz_SS7( HYPRE_Int bi, { HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcz = -diag * (a_ac[Ai] + a_bc[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcz; +#else czb += tcz; +#endif } hypre_BoxLoop1ReductionEnd(Ai, czb) @@ -1475,7 +1498,11 @@ hypre_PFMGComputeDxyz_SS7( HYPRE_Int bi, { HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcx = -diag * (a_cw[Ai] + a_ce[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcx*tcx; +#else sqcxb += tcx*tcx; +#endif } hypre_BoxLoop1ReductionEnd(Ai, sqcxb) @@ -1485,7 +1512,11 @@ hypre_PFMGComputeDxyz_SS7( HYPRE_Int bi, { HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcy = -diag * (a_cs[Ai] + a_cn[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcy*tcy; +#else sqcyb += tcy*tcy; +#endif } hypre_BoxLoop1ReductionEnd(Ai, sqcyb) @@ -1495,7 +1526,11 @@ hypre_PFMGComputeDxyz_SS7( HYPRE_Int bi, { HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcz = -diag * (a_ac[Ai] + a_bc[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcz*tcz; +#else sqczb += tcz*tcz; +#endif } hypre_BoxLoop1ReductionEnd(Ai, sqczb) @@ -1692,7 +1727,7 @@ hypre_PFMGComputeDxyz_SS19( HYPRE_Int bi, a_cne = hypre_StructMatrixExtractPointerByIndex(A, bi, index); // FIXME TODO HOW TO DO KOKKOS IN ONE BOXLOOP ? 
-#if defined(HYPRE_USING_KOKKOS) +#if defined(HYPRE_USING_KOKKOS) || defined(HYPRE_USING_SYCL) HYPRE_Real cxb = cxyz[0]; hypre_BoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size, @@ -1700,7 +1735,11 @@ hypre_PFMGComputeDxyz_SS19( HYPRE_Int bi, { HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcx = -diag * (a_cw[Ai] + a_ce[Ai] + a_aw[Ai] + a_ae[Ai] + a_bw[Ai] + a_be[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcx; +#else cxb += tcx; +#endif } hypre_BoxLoop1ReductionEnd(Ai, cxb) @@ -1710,7 +1749,11 @@ hypre_PFMGComputeDxyz_SS19( HYPRE_Int bi, { HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcy = -diag * (a_cs[Ai] + a_cn[Ai] + a_an[Ai] + a_as[Ai] + a_bn[Ai] + a_bs[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcy; +#else cyb += tcy; +#endif } hypre_BoxLoop1ReductionEnd(Ai, cyb) @@ -1720,7 +1763,11 @@ hypre_PFMGComputeDxyz_SS19( HYPRE_Int bi, { HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcz = -diag * (a_ac[Ai] + a_bc[Ai] + a_aw[Ai] + a_ae[Ai] + a_an[Ai] + a_as[Ai] + a_bw[Ai] + a_be[Ai] + a_bn[Ai] + a_bs[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcz; +#else czb += tcz; +#endif } hypre_BoxLoop1ReductionEnd(Ai, czb) @@ -1730,7 +1777,11 @@ hypre_PFMGComputeDxyz_SS19( HYPRE_Int bi, { HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcx = -diag * (a_cw[Ai] + a_ce[Ai] + a_aw[Ai] + a_ae[Ai] + a_bw[Ai] + a_be[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcx*tcx; +#else sqcxb += tcx*tcx; +#endif } hypre_BoxLoop1ReductionEnd(Ai, sqcxb) @@ -1740,7 +1791,11 @@ hypre_PFMGComputeDxyz_SS19( HYPRE_Int bi, { HYPRE_Real diag = a_cc[Ai] < 0.0 ? 
-1.0 : 1.0; HYPRE_Real tcy = -diag * (a_cs[Ai] + a_cn[Ai] + a_an[Ai] + a_as[Ai] + a_bn[Ai] + a_bs[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcy*tcy; +#else sqcyb += tcy*tcy; +#endif } hypre_BoxLoop1ReductionEnd(Ai, sqcyb) @@ -1750,7 +1805,11 @@ hypre_PFMGComputeDxyz_SS19( HYPRE_Int bi, { HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcz = -diag * (a_ac[Ai] + a_bc[Ai] + a_aw[Ai] + a_ae[Ai] + a_an[Ai] + a_as[Ai] + a_bw[Ai] + a_be[Ai] + a_bn[Ai] + a_bs[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcz*tcz; +#else sqczb += tcz*tcz; +#endif } hypre_BoxLoop1ReductionEnd(Ai, sqczb) @@ -1988,7 +2047,7 @@ hypre_PFMGComputeDxyz_SS27( HYPRE_Int bi, a_bne = hypre_StructMatrixExtractPointerByIndex(A, bi, index); // FIXME TODO HOW TO DO KOKKOS IN ONE BOXLOOP ? -#if defined(HYPRE_USING_KOKKOS) +#if defined(HYPRE_USING_KOKKOS) || defined(HYPRE_USING_SYCL) HYPRE_Real cxb = cxyz[0]; hypre_BoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size, @@ -1998,7 +2057,11 @@ hypre_PFMGComputeDxyz_SS27( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; tcx -= diag * (a_cw[Ai] + a_ce[Ai] + a_aw[Ai] + a_ae[Ai] + a_bw[Ai] + a_be[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); tcx -= diag * (a_asw[Ai] + a_ase[Ai] + a_anw[Ai] + a_ane[Ai] + a_bsw[Ai] + a_bse[Ai] + a_bnw[Ai] + a_bne[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcx; +#else cxb += tcx; +#endif } hypre_BoxLoop1ReductionEnd(Ai, cxb) @@ -2010,7 +2073,11 @@ hypre_PFMGComputeDxyz_SS27( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? 
-1.0 : 1.0; tcy -= diag * (a_cs[Ai] + a_cn[Ai] + a_an[Ai] + a_as[Ai] + a_bn[Ai] + a_bs[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); tcy -= diag * (a_asw[Ai] + a_ase[Ai] + a_anw[Ai] + a_ane[Ai] + a_bsw[Ai] + a_bse[Ai] + a_bnw[Ai] + a_bne[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcy; +#else cyb += tcy; +#endif } hypre_BoxLoop1ReductionEnd(Ai, cyb) @@ -2022,7 +2089,11 @@ hypre_PFMGComputeDxyz_SS27( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; tcz -= diag * (a_ac[Ai] + a_bc[Ai] + a_aw[Ai] + a_ae[Ai] + a_an[Ai] + a_as[Ai] + a_bw[Ai] + a_be[Ai] + a_bn[Ai] + a_bs[Ai]); tcz -= diag * (a_asw[Ai] + a_ase[Ai] + a_anw[Ai] + a_ane[Ai] + a_bsw[Ai] + a_bse[Ai] + a_bnw[Ai] + a_bne[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcz; +#else czb += tcz; +#endif } hypre_BoxLoop1ReductionEnd(Ai, czb) @@ -2034,7 +2105,11 @@ hypre_PFMGComputeDxyz_SS27( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; tcx -= diag * (a_cw[Ai] + a_ce[Ai] + a_aw[Ai] + a_ae[Ai] + a_bw[Ai] + a_be[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); tcx -= diag * (a_asw[Ai] + a_ase[Ai] + a_anw[Ai] + a_ane[Ai] + a_bsw[Ai] + a_bse[Ai] + a_bnw[Ai] + a_bne[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcx*tcx; +#else sqcxb += tcx*tcx; +#endif } hypre_BoxLoop1ReductionEnd(Ai, sqcxb) @@ -2046,7 +2121,11 @@ hypre_PFMGComputeDxyz_SS27( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; tcy -= diag * (a_cs[Ai] + a_cn[Ai] + a_an[Ai] + a_as[Ai] + a_bn[Ai] + a_bs[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); tcy -= diag * (a_asw[Ai] + a_ase[Ai] + a_anw[Ai] + a_ane[Ai] + a_bsw[Ai] + a_bse[Ai] + a_bnw[Ai] + a_bne[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcy*tcy; +#else sqcyb += tcy*tcy; +#endif } hypre_BoxLoop1ReductionEnd(Ai, sqcyb); @@ -2058,7 +2137,11 @@ hypre_PFMGComputeDxyz_SS27( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? 
-1.0 : 1.0; tcz -= diag * (a_ac[Ai] + a_bc[Ai] + a_aw[Ai] + a_ae[Ai] + a_an[Ai] + a_as[Ai] + a_bw[Ai] + a_be[Ai] + a_bn[Ai] + a_bs[Ai]); tcz -= diag * (a_asw[Ai] + a_ase[Ai] + a_anw[Ai] + a_ane[Ai] + a_bsw[Ai] + a_bse[Ai] + a_bnw[Ai] + a_bne[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcz*tcz; +#else sqczb += tcz*tcz; +#endif } hypre_BoxLoop1ReductionEnd(Ai, sqczb) @@ -2198,7 +2281,7 @@ hypre_ZeroDiagonal( hypre_StructMatrix *A ) } else { -#if defined(HYPRE_USING_KOKKOS) +#if defined(HYPRE_USING_KOKKOS) || defined(HYPRE_USING_SYCL) HYPRE_Real diag_product_local = diag_product; #elif defined(HYPRE_USING_RAJA) ReduceSum diag_product_local(diag_product); @@ -2226,11 +2309,19 @@ hypre_ZeroDiagonal( hypre_StructMatrix *A ) HYPRE_Real zero = 0.0; if (Ap[Ai] == 0.0) { +#if defined(HYPRE_USING_SYCL) + sum += one; +#else diag_product_local += one; +#endif } else { +#if defined(HYPRE_USING_SYCL) + sum += zero; +#else diag_product_local += zero; +#endif } } hypre_BoxLoop1ReductionEnd(Ai, diag_product_local); diff --git a/src/struct_mv/_hypre_struct_mv.hpp b/src/struct_mv/_hypre_struct_mv.hpp index 13982b41cb..86aded1b93 100644 --- a/src/struct_mv/_hypre_struct_mv.hpp +++ b/src/struct_mv/_hypre_struct_mv.hpp @@ -1535,8 +1535,29 @@ else \ }, hypre__tot, sum_buf); \ } +/* Reduction BoxLoop2 */ +/* WM: todo - is there a better way to handle the passing of sum_var (the variable where we want the reduction to end up)? 
*/ +/* Right now, it is hardcoded as a HYPRE_Real */ +#define hypre_newBoxLoop2ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, sum_var) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ + hypre_BoxLoopDataDeclareK(2, ndim, loop_size, dbox2, start2, stride2); \ + sycl::buffer sum_buf(&sum_var, 1); \ + ReductionBoxLoopforall( [=] (sycl::nd_item<1> item, auto &sum) \ + { \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + if (idx < hypre__tot) \ + { \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1); \ + hypre_BoxLoopIncK(2, databox2, i2); - +#define hypre_newBoxLoop2ReductionEnd(i1, i2, sum_var) \ + } \ + }, hypre__tot, sum_buf); \ +} @@ -1805,18 +1826,18 @@ else \ /* Reduction */ /* WM: todo - using CPU version for now */ #define hypre_BoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, reducesum) \ - zypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) + hypre_newBoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, reducesum) #define hypre_BoxLoop1ReductionEnd(i1, reducesum) \ - zypre_newBoxLoop1End(i1) + hypre_newBoxLoop1ReductionEnd(i1, reducesum) #define hypre_BoxLoop2ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, \ dbox2, start2, stride2, i2, reducesum) \ - zypre_newBoxLoop2Begin(ndim, loop_size, dbox1, start1, stride1, i1, \ - dbox2, start2, stride2, i2) + hypre_newBoxLoop2ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, reducesum) #define hypre_BoxLoop2ReductionEnd(i1, i2, reducesum) \ - zypre_newBoxLoop2End(i1, i2) + hypre_newBoxLoop2ReductionEnd(i1, i2, reducesum) #endif diff --git a/src/struct_mv/boxloop_sycl.h b/src/struct_mv/boxloop_sycl.h index 0a4fae81f0..1c44ee3e08 100644 --- a/src/struct_mv/boxloop_sycl.h +++ b/src/struct_mv/boxloop_sycl.h @@ -106,24 +106,11 @@ ReductionBoxLoopforall( LOOP_BODY 
loop_body, } } - - - - - - - - - - - - - - #ifdef __cplusplus } #endif + /********************************************************************* * Init/Declare/IncK etc. *********************************************************************/ @@ -211,7 +198,7 @@ else \ /* WM: todo - double check that item.get_local_id(0) is actually what you want below */ #define hypre_newBoxLoopDeclare(box) \ hypre_Index local_idx; \ - HYPRE_Int idx_local = idx; \ + HYPRE_Int idx_local = idx; \ hypre_IndexD(local_idx, 0) = idx_local % box.lsize0; \ idx_local = idx_local / box.lsize0; \ hypre_IndexD(local_idx, 1) = idx_local % box.lsize1; \ @@ -236,7 +223,6 @@ else \ index[2] = hypre_IndexD(local_idx, 2); - /********************************************************************* * Boxloops *********************************************************************/ @@ -248,7 +234,7 @@ else \ hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ BoxLoopforall( [=] (sycl::nd_item<1> item) \ { \ - HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ { \ hypre_newBoxLoopDeclare(databox1); \ @@ -268,7 +254,7 @@ else \ hypre_BoxLoopDataDeclareK(2, ndim, loop_size, dbox2, start2, stride2); \ BoxLoopforall( [=] (sycl::nd_item<1> item) \ { \ - HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ { \ hypre_newBoxLoopDeclare(databox1); \ @@ -291,7 +277,7 @@ else \ hypre_BoxLoopDataDeclareK(3, ndim,loop_size, dbox3, start3, stride3); \ BoxLoopforall( [=] (sycl::nd_item<1> item) \ { \ - HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ { \ hypre_newBoxLoopDeclare(databox1); \ @@ -317,7 +303,7 @@ else \ hypre_BoxLoopDataDeclareK(4, ndim, loop_size, dbox4, start4, stride4); \ BoxLoopforall( [=] (sycl::nd_item<1> item) 
\ { \ - HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ { \ hypre_newBoxLoopDeclare(databox1); \ @@ -340,7 +326,7 @@ else \ hypre_BasicBoxLoopDataDeclareK(1, ndim, loop_size, stride1); \ BoxLoopforall( [=] (sycl::nd_item<1> item) \ { \ - HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ { \ hypre_newBoxLoopDeclare(databox1); \ @@ -354,7 +340,7 @@ else \ hypre_BasicBoxLoopDataDeclareK(2, ndim, loop_size, stride2); \ BoxLoopforall( [=] (sycl::nd_item<1> item) \ { \ - HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ { \ hypre_newBoxLoopDeclare(databox1); \ @@ -365,273 +351,49 @@ else \ /* Reduction BoxLoop1 */ /* WM: todo - is there a better way to handle the passing of sum_var (the variable where we want the reduction to end up)? 
*/ /* Right now, it is hardcoded as a HYPRE_Real */ -#define hypre_newBoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, sum_var) \ -{ \ +#define hypre_newBoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, sum_var) \ +{ \ hypre_newBoxLoopInit(ndim, loop_size); \ hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ - sycl::buffer sum_buf(&sum_var, 1); \ - ReductionBoxLoopforall( [=] (sycl::nd_item<1> item, auto &sum) \ + sycl::buffer sum_buf(&sum_var, 1); \ + ReductionBoxLoopforall( [=] (sycl::nd_item<1> item, auto &sum) \ { \ - HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ { \ hypre_newBoxLoopDeclare(databox1); \ hypre_BoxLoopIncK(1, databox1, i1); -#define hypre_newBoxLoop1ReductionEnd(i1, sum_var) \ +#define hypre_newBoxLoop1ReductionEnd(i1, sum_var) \ } \ - }, hypre__tot, sum_buf); \ + }, hypre__tot, sum_buf); \ } +/* Reduction BoxLoop2 */ +/* WM: todo - is there a better way to handle the passing of sum_var (the variable where we want the reduction to end up)? 
*/ +/* Right now, it is hardcoded as a HYPRE_Real */ +#define hypre_newBoxLoop2ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, sum_var) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ + hypre_BoxLoopDataDeclareK(2, ndim, loop_size, dbox2, start2, stride2); \ + sycl::buffer sum_buf(&sum_var, 1); \ + ReductionBoxLoopforall( [=] (sycl::nd_item<1> item, auto &sum) \ + { \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + if (idx < hypre__tot) \ + { \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1); \ + hypre_BoxLoopIncK(2, databox2, i2); - - - - - - - - -/********************************************************************* - * HOST IMPLEMENTATION - *********************************************************************/ - -#ifdef HYPRE_USING_OPENMP -#define HYPRE_BOX_REDUCTION -#if defined(WIN32) && defined(_MSC_VER) -#define Pragma(x) __pragma(HYPRE_XSTR(x)) -#else -#define Pragma(x) _Pragma(HYPRE_XSTR(x)) -#endif -#define OMP0 Pragma(omp parallel for HYPRE_BOX_REDUCTION HYPRE_SMP_SCHEDULE) -#define OMP1 Pragma(omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_BOX_REDUCTION HYPRE_SMP_SCHEDULE) -#else /* #ifdef HYPRE_USING_OPENMP */ -#define OMP0 -#define OMP1 -#endif /* #ifdef HYPRE_USING_OPENMP */ - -#define zypre_newBoxLoop0Begin(ndim, loop_size) \ -{ \ - zypre_BoxLoopDeclare(); \ - zypre_BoxLoopInit(ndim, loop_size); \ - OMP1 \ - for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ - { \ - zypre_BoxLoopSet(); \ - for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ - { \ - for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ - { - -#define zypre_newBoxLoop0End() \ - } \ - zypre_BoxLoopInc1(); \ - zypre_BoxLoopInc2(); \ - } \ - } \ -} - -#define zypre_newBoxLoop1Begin(ndim, loop_size, \ - dbox1, start1, stride1, i1) \ -{ \ - HYPRE_Int i1; \ - zypre_BoxLoopDeclare(); \ - 
zypre_BoxLoopDeclareK(1); \ - zypre_BoxLoopInit(ndim, loop_size); \ - zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1); \ - OMP1 \ - for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ - { \ - HYPRE_Int i1; \ - zypre_BoxLoopSet(); \ - zypre_BoxLoopSetK(1, i1); \ - for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ - { \ - for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ - { - -#define zypre_newBoxLoop1End(i1) \ - i1 += hypre__i0inc1; \ - } \ - zypre_BoxLoopInc1(); \ - i1 += hypre__ikinc1[hypre__d]; \ - zypre_BoxLoopInc2(); \ - } \ - } \ -} - - -#define zypre_newBoxLoop2Begin(ndim, loop_size, \ - dbox1, start1, stride1, i1, \ - dbox2, start2, stride2, i2) \ -{ \ - HYPRE_Int i1, i2; \ - zypre_BoxLoopDeclare(); \ - zypre_BoxLoopDeclareK(1); \ - zypre_BoxLoopDeclareK(2); \ - zypre_BoxLoopInit(ndim, loop_size); \ - zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1); \ - zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2); \ - OMP1 \ - for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ - { \ - HYPRE_Int i1, i2; \ - zypre_BoxLoopSet(); \ - zypre_BoxLoopSetK(1, i1); \ - zypre_BoxLoopSetK(2, i2); \ - for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ - { \ - for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ - { - -#define zypre_newBoxLoop2End(i1, i2) \ - i1 += hypre__i0inc1; \ - i2 += hypre__i0inc2; \ - } \ - zypre_BoxLoopInc1(); \ - i1 += hypre__ikinc1[hypre__d]; \ - i2 += hypre__ikinc2[hypre__d]; \ - zypre_BoxLoopInc2(); \ - } \ - } \ -} - - -#define zypre_newBoxLoop3Begin(ndim, loop_size, \ - dbox1, start1, stride1, i1, \ - dbox2, start2, stride2, i2, \ - dbox3, start3, stride3, i3) \ -{ \ - HYPRE_Int i1, i2, i3; \ - zypre_BoxLoopDeclare(); \ - zypre_BoxLoopDeclareK(1); \ - zypre_BoxLoopDeclareK(2); \ - zypre_BoxLoopDeclareK(3); \ - zypre_BoxLoopInit(ndim, loop_size); \ - zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1); \ - zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2); \ - zypre_BoxLoopInitK(3, dbox3, 
start3, stride3, i3); \ - OMP1 \ - for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ - { \ - HYPRE_Int i1, i2, i3; \ - zypre_BoxLoopSet(); \ - zypre_BoxLoopSetK(1, i1); \ - zypre_BoxLoopSetK(2, i2); \ - zypre_BoxLoopSetK(3, i3); \ - for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ - { \ - for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ - { - -#define zypre_newBoxLoop3End(i1, i2, i3) \ - i1 += hypre__i0inc1; \ - i2 += hypre__i0inc2; \ - i3 += hypre__i0inc3; \ - } \ - zypre_BoxLoopInc1(); \ - i1 += hypre__ikinc1[hypre__d]; \ - i2 += hypre__ikinc2[hypre__d]; \ - i3 += hypre__ikinc3[hypre__d]; \ - zypre_BoxLoopInc2(); \ - } \ - } \ -} - -#define zypre_newBoxLoop4Begin(ndim, loop_size, \ - dbox1, start1, stride1, i1, \ - dbox2, start2, stride2, i2, \ - dbox3, start3, stride3, i3, \ - dbox4, start4, stride4, i4) \ -{ \ - HYPRE_Int i1, i2, i3, i4; \ - zypre_BoxLoopDeclare(); \ - zypre_BoxLoopDeclareK(1); \ - zypre_BoxLoopDeclareK(2); \ - zypre_BoxLoopDeclareK(3); \ - zypre_BoxLoopDeclareK(4); \ - zypre_BoxLoopInit(ndim, loop_size); \ - zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1); \ - zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2); \ - zypre_BoxLoopInitK(3, dbox3, start3, stride3, i3); \ - zypre_BoxLoopInitK(4, dbox4, start4, stride4, i4); \ - OMP1 \ - for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ - { \ - HYPRE_Int i1, i2, i3, i4; \ - zypre_BoxLoopSet(); \ - zypre_BoxLoopSetK(1, i1); \ - zypre_BoxLoopSetK(2, i2); \ - zypre_BoxLoopSetK(3, i3); \ - zypre_BoxLoopSetK(4, i4); \ - for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ - { \ - for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ - { - -#define zypre_newBoxLoop4End(i1, i2, i3, i4) \ - i1 += hypre__i0inc1; \ - i2 += hypre__i0inc2; \ - i3 += hypre__i0inc3; \ - i4 += hypre__i0inc4; \ - } \ - zypre_BoxLoopInc1(); \ - i1 += hypre__ikinc1[hypre__d]; \ - i2 += hypre__ikinc2[hypre__d]; \ - i3 += hypre__ikinc3[hypre__d]; \ - i4 += 
hypre__ikinc4[hypre__d]; \ - zypre_BoxLoopInc2(); \ - } \ - } \ -} - -#define zypre_newBasicBoxLoop2Begin(ndim, loop_size, \ - stride1, i1, \ - stride2, i2) \ -{ \ - zypre_BoxLoopDeclare(); \ - zypre_BoxLoopDeclareK(1); \ - zypre_BoxLoopDeclareK(2); \ - zypre_BoxLoopInit(ndim, loop_size); \ - zypre_BasicBoxLoopInitK(1, stride1); \ - zypre_BasicBoxLoopInitK(2, stride2); \ - OMP1 \ - for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ - { \ - HYPRE_Int i1, i2; \ - zypre_BoxLoopSet(); \ - zypre_BoxLoopSetK(1, i1); \ - zypre_BoxLoopSetK(2, i2); \ - for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ - { \ - for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ - { - - -#define hypre_LoopBegin(size, idx) \ -{ \ - HYPRE_Int idx; \ - OMP0 \ - for (idx = 0; idx < size; idx ++) \ - { - -#define hypre_LoopEnd() \ - } \ +#define hypre_newBoxLoop2ReductionEnd(i1, i2, sum_var) \ + } \ + }, hypre__tot, sum_buf); \ } - - - - - - - - - - - - /********************************************************************* * renamings *********************************************************************/ @@ -653,17 +415,17 @@ else \ /* Reduction */ /* WM: todo - using CPU version for now */ #define hypre_BoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, reducesum) \ - zypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) + hypre_newBoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, reducesum) #define hypre_BoxLoop1ReductionEnd(i1, reducesum) \ - zypre_newBoxLoop1End(i1) + hypre_newBoxLoop1ReductionEnd(i1, reducesum) #define hypre_BoxLoop2ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, \ dbox2, start2, stride2, i2, reducesum) \ - zypre_newBoxLoop2Begin(ndim, loop_size, dbox1, start1, stride1, i1, \ - dbox2, start2, stride2, i2) + hypre_newBoxLoop2ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, reducesum) #define hypre_BoxLoop2ReductionEnd(i1, i2, reducesum) \ - 
zypre_newBoxLoop2End(i1, i2) + hypre_newBoxLoop2ReductionEnd(i1, i2, reducesum) #endif diff --git a/src/struct_mv/struct_innerprod.c b/src/struct_mv/struct_innerprod.c index 497cd4280a..cfef661cb0 100644 --- a/src/struct_mv/struct_innerprod.c +++ b/src/struct_mv/struct_innerprod.c @@ -62,7 +62,7 @@ hypre_StructInnerProd( hypre_StructVector *x, hypre_BoxGetSize(box, loop_size); -#if defined(HYPRE_USING_KOKKOS) +#if defined(HYPRE_USING_KOKKOS) || defined(HYPRE_USING_SYCL) HYPRE_Real box_sum = 0.0; #elif defined(HYPRE_USING_RAJA) ReduceSum box_sum(0.0); @@ -89,7 +89,11 @@ hypre_StructInnerProd( hypre_StructVector *x, box_sum) { HYPRE_Real tmp = xp[xi] * hypre_conj(yp[yi]); +#if defined(HYPRE_USING_SYCL) + sum += tmp; +#else box_sum += tmp; +#endif } hypre_BoxLoop2ReductionEnd(xi, yi, box_sum); diff --git a/src/test/simple.c b/src/test/simple.c index e385aefe69..ff5e40b103 100644 --- a/src/test/simple.c +++ b/src/test/simple.c @@ -141,47 +141,135 @@ hypre_int main( hypre_int argc, char *argv[] ) { + /* hypre_MPI_Init(&argc, &argv); */ + /* HYPRE_Init(); */ + /* ShowDevice(*hypre_HandleComputeStream(hypre_handle())); */ + + + /* return 0; */ + +/******************************************************************************/ +/******************************************************************************/ + + /* Get device */ + /* sycl::device syclDev = sycl::device(sycl::default_selector{}); */ + + /* /1* Get asynchandler *1/ */ + /* auto sycl_asynchandler = [] (sycl::exception_list exceptions) */ + /* { */ + /* for (std::exception_ptr const& e : exceptions) */ + /* { */ + /* try */ + /* { */ + /* std::rethrow_exception(e); */ + /* } */ + /* catch (sycl::exception const& ex) */ + /* { */ + /* std::cout << "Caught asynchronous SYCL exception:" << std::endl */ + /* << ex.what() << ", OpenCL code: " << ex.get_cl_code() << std::endl; */ + /* } */ + /* } */ + /* }; */ + + /* /1* Setup sycl context *1/ */ + /* sycl::context syclctxt = sycl::context(syclDev, 
sycl_asynchandler); */ + + /* /1* Setup queue *1/ */ + /* sycl::queue *my_queue = new sycl::queue(syclctxt, syclDev, sycl::property_list{sycl::property::queue::in_order{}}); */ + + /* /1* Show device associated with queue *1/ */ + /* ShowDevice(*my_queue); */ + + /* return 0; */ + + + +/******************************************************************************/ +/******************************************************************************/ + + int length = 1024; + + sycl::default_selector selector; + sycl::queue myq(selector); + std::cout<<"Running on: "<()<<"\n"; + + auto A = sycl::malloc_shared(length, myq); + + auto gr = sycl::range<1>(length); + auto lr = sycl::range<1>(32); //change me, too small? + + + for(int i=0;i(i+1); //initialize + + //MAKE SURE I"M HOST & DEVICE ACCESSIBLE! + auto fsum = sycl::malloc_shared(1, myq); + + { + myq.submit( [&](auto &h) { + /* auto properties = sycl::property::reduction::initialize_to_identity{}; */ + h.parallel_for(sycl::nd_range<1>(gr,lr), + sycl::ONEAPI::reduction(fsum, std::plus<>()), + [=](sycl::nd_item<1> it, auto &sum){ + int i = it.get_global_id(0); + sum += A[i]; + }); + }).wait_and_throw(); + } + + printf("sum: %f\n",fsum[0]); + return 0; + + +/******************************************************************************/ +/******************************************************************************/ + + + /* initialize */ - hypre_MPI_Init(&argc, &argv); - HYPRE_Init(); - /* ShowDevice(*hypre_HandleComputeStream(hypre_handle())); */ + /* hypre_MPI_Init(&argc, &argv); */ + /* HYPRE_Init(); */ + /* /1* ShowDevice(*hypre_HandleComputeStream(hypre_handle())); *1/ */ - HYPRE_Int length = 1000; - const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); - const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); - HYPRE_Real *arr = hypre_CTAlloc(HYPRE_Real, length, HYPRE_MEMORY_DEVICE); - HYPRE_Real sum_var = 0; - sycl::buffer sum_buf(&sum_var, 1); + /* HYPRE_Int length = 
1000; */ + /* const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); */ + /* const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); */ + /* HYPRE_Real *arr = hypre_CTAlloc(HYPRE_Real, length, HYPRE_MEMORY_DEVICE); */ + /* HYPRE_Real sum_var = 0; */ + /* /1* sycl::buffer sum_buf(&sum_var, 1); *1/ */ + /* sycl::buffer sum_buf{&sum_var, 1}; */ - /* Reduction parallel_for with accessor */ - std::cout << "Launching parallel_for reduction with accessor" << std::endl; - hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) - { - sycl::accessor sum_acc(sum_buf, cgh, sycl::read_write); + /* /1* Reduction parallel_for with accessor *1/ */ + /* std::cout << "Launching parallel_for reduction with accessor" << std::endl; */ + /* hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) */ + /* { */ + /* sycl::accessor sum_acc(sum_buf, cgh, sycl::read_write); */ + /* /1* auto sumReduction = sycl::reduction(sum_buf, cgh, sycl::plus<>()); *1/ */ - cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), sycl::ONEAPI::reduction(sum_acc, sycl::ONEAPI::plus<>()), - [=] (sycl::nd_item<1> item, auto &sum) - { - /* trivial kernel */ - }); - }).wait_and_throw(); + /* /1* WM: NOTE - on JLSE, ONEAPI is marked as deprecated to be replaced by ext::oneapi *1/ */ + /* cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), sycl::ONEAPI::reduction(sum_acc, sycl::ONEAPI::plus<>()), */ + /* /1* cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), sumReduction, *1/ */ + /* [=] (sycl::nd_item<1> item, auto &sum) */ + /* { */ + /* /1* trivial kernel *1/ */ + /* }); */ + /* }).wait_and_throw(); */ - HYPRE_Real *sum_var_usm = hypre_CTAlloc(HYPRE_Real, 1, HYPRE_MEMORY_DEVICE); +/* HYPRE_Real *sum_var_usm = hypre_CTAlloc(HYPRE_Real, 1, HYPRE_MEMORY_DEVICE); */ - /* Reduction parallel_for with unified memory pointer */ - std::cout << "Launching parallel_for reduction with unified memory pointer" << std::endl; - 
hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) - { - cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), sycl::ONEAPI::reduction(sum_var_usm, sycl::ONEAPI::plus<>()), - [=] (sycl::nd_item<1> item, auto &sum) - { - /* trivial kernel */ - }); - }).wait_and_throw(); +/* /1* Reduction parallel_for with unified memory pointer *1/ */ +/* std::cout << "Launching parallel_for reduction with unified memory pointer" << std::endl; */ +/* hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) */ +/* { */ +/* cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), sycl::ONEAPI::reduction(sum_var_usm, sycl::ONEAPI::plus<>()), */ +/* [=] (sycl::nd_item<1> item, auto &sum) */ +/* { */ +/* /1* trivial kernel *1/ */ +/* }); */ +/* }).wait_and_throw(); */ @@ -196,9 +284,11 @@ main( hypre_int argc, /* hypre_printf("is_cpu = %d\n", gpu.is_cpu()); */ /* hypre_printf("is_cpu = %d\n", dev.is_cpu()); */ /* hypre_printf("is_gpu = %d\n", gpu.is_gpu()); */ - hypre_printf("DONE\n"); - exit(0); + /* hypre_printf("DONE\n"); */ + /* exit(0); */ +/******************************************************************************/ +/******************************************************************************/ /* variables */ diff --git a/src/utilities/_hypre_utilities.hpp b/src/utilities/_hypre_utilities.hpp index 61e8ae0998..6fe8451f0f 100644 --- a/src/utilities/_hypre_utilities.hpp +++ b/src/utilities/_hypre_utilities.hpp @@ -4,6 +4,11 @@ #ifndef hypre_UTILITIES_HPP #define hypre_UTILITIES_HPP +/* WM: todo - I have a problem where I need to include this outside the extern "C++" {} block, so I'm doing this manually here for now */ +#if defined(HYPRE_USING_SYCL) +#include +#endif + #ifdef __cplusplus extern "C++" { #endif @@ -105,7 +110,9 @@ struct hypre_device_allocator #elif defined(HYPRE_USING_SYCL) -#include +/* WM: todo - if the include for CL/sycl.hpp is inside extern "C++" {}, I get problems with sycl reductions... 
totally strange, but true */ +/* #include */ +/* WM: todo - include below as necessary */ /* #include */ /* #include */ /* #include */ @@ -278,6 +285,7 @@ struct hypre_DeviceData #endif #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) + /* WM: question - what is the device_allocator? */ hypre_device_allocator device_allocator; #endif #if defined(HYPRE_USING_SYCL) @@ -398,6 +406,8 @@ struct hypre_GpuMatData #endif //#if defined(HYPRE_USING_GPU) +/* WM: todo - is this how I want to integrate the functionality below? Do I really need all this? */ +/* NOTE: It doesn't line up that nicely with the cuda/hip implementation since you need to pass item agrs */ #if defined(HYPRE_USING_SYCL) /* return the number of work-items in current work-group */ template @@ -558,6 +568,8 @@ using namespace thrust::placeholders; #define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); } #elif defined(HYPRE_USING_HIP) #define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_HIP_CALL( hipGetLastError() ); } +#elif defined(HYPRE_USING_SYCL) +/* WM: todo? 
used below in HYPRE_CUDA_LAUNCH2 */ #endif #else // #if defined(HYPRE_DEBUG) #define GPU_LAUNCH_SYNC diff --git a/src/utilities/device_utils.c b/src/utilities/device_utils.c index 5b028a0ca1..0437d65175 100644 --- a/src/utilities/device_utils.c +++ b/src/utilities/device_utils.c @@ -9,7 +9,6 @@ #include "_hypre_utilities.hpp" #if defined(HYPRE_USING_SYCL) -#include // WM: TODO: verify sycl::range<1> hypre_GetDefaultCUDABlockDimension() { @@ -975,10 +974,10 @@ hypre_DeviceDataStream(hypre_DeviceData *data, HYPRE_Int i) }; /* WM: having trouble with getting the device on frank, so temporarily just passing the default selector */ - /* sycl::device syclDev = data->device; */ - /* sycl::context syclctxt = sycl::context(syclDev, sycl_asynchandler); */ - /* stream = new sycl::queue(syclctxt, syclDev, sycl::property_list{sycl::property::queue::in_order{}}); */ - stream = new sycl::queue(sycl::default_selector{}, sycl::property_list{sycl::property::queue::in_order{}}); + sycl::device syclDev = data->device; + sycl::context syclctxt = sycl::context(syclDev, sycl_asynchandler); + stream = new sycl::queue(syclctxt, syclDev, sycl::property_list{sycl::property::queue::in_order{}}); + /* stream = new sycl::queue(sycl::default_selector{}, sycl::property_list{sycl::property::queue::in_order{}}); */ data->streams[i] = stream; } #endif @@ -1235,8 +1234,8 @@ hypre_DeviceDataCreate() hypre_DeviceData *data = hypre_CTAlloc(hypre_DeviceData, 1, HYPRE_MEMORY_HOST); #if defined(HYPRE_USING_SYCL) - /* WM: commenting out for now since I'm having trouble finding the device on frank */ - /* hypre_DeviceDataDevice(data) = sycl::device(sycl::gpu_selector{}); */ + /* WM: does the default selector get a GPU if available? 
*/ + hypre_DeviceDataDevice(data) = sycl::device(sycl::default_selector{}); #else hypre_DeviceDataDevice(data) = 0; #endif diff --git a/src/utilities/device_utils.h b/src/utilities/device_utils.h index 9d95075c3e..a442f2229f 100644 --- a/src/utilities/device_utils.h +++ b/src/utilities/device_utils.h @@ -53,7 +53,8 @@ #elif defined(HYPRE_USING_SYCL) -#include +/* WM: todo - if the include for CL/sycl.hpp is inside extern "C++" {}, I get problems with sycl reductions... totally strange, but true */ +/* #include */ /* WM: todo - include below as necessary */ /* #include */ /* #include */ diff --git a/src/utilities/headers b/src/utilities/headers index 6d54d6d434..d3a0e28dba 100755 --- a/src/utilities/headers +++ b/src/utilities/headers @@ -79,6 +79,11 @@ cat > $INTERNAL_HEADER <<@ #ifndef hypre_UTILITIES_HPP #define hypre_UTILITIES_HPP +/* WM: todo - I have a problem where I need to include this outside the extern "C++" {} block, so I'm doing this manually here for now */ +#if defined(HYPRE_USING_SYCL) +#include +#endif + #ifdef __cplusplus extern "C++" { #endif From df301df9bcc5c197f221e334089125a803d410cc Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Tue, 5 Oct 2021 17:00:13 -0700 Subject: [PATCH 15/44] Cleanup --- src/configure | 1 - src/struct_mv/_hypre_struct_mv.hpp | 317 ++------------ src/struct_mv/boxloop_sycl.h | 2 - src/test/Makefile | 5 - src/test/TEST_ij/solvers.jobs | 145 ++++--- src/test/simple.c | 642 ----------------------------- src/utilities/device_utils.c | 7 +- src/utilities/general.c | 40 +- src/utilities/memory.c | 7 - 9 files changed, 122 insertions(+), 1044 deletions(-) delete mode 100644 src/test/simple.c diff --git a/src/configure b/src/configure index 7b8443595b..bb48dbdf9b 100755 --- a/src/configure +++ b/src/configure @@ -9081,7 +9081,6 @@ fi if test x"$hypre_using_sycl" == x"yes"; then : -# WM: not setting this with sycl for now since it is giving me problems $as_echo "#define HYPRE_USING_GPU 1" >>confdefs.h $as_echo "#define 
HYPRE_USING_SYCL 1" >>confdefs.h diff --git a/src/struct_mv/_hypre_struct_mv.hpp b/src/struct_mv/_hypre_struct_mv.hpp index 86aded1b93..e4824ec744 100644 --- a/src/struct_mv/_hypre_struct_mv.hpp +++ b/src/struct_mv/_hypre_struct_mv.hpp @@ -1258,24 +1258,11 @@ ReductionBoxLoopforall( LOOP_BODY loop_body, } } - - - - - - - - - - - - - - #ifdef __cplusplus } #endif + /********************************************************************* * Init/Declare/IncK etc. *********************************************************************/ @@ -1360,10 +1347,9 @@ else \ } /* Given input 1-D 'idx' in box, get 3-D 'local_idx' in loop_size */ -/* WM: todo - double check that item.get_local_id(0) is actually what you want below */ #define hypre_newBoxLoopDeclare(box) \ hypre_Index local_idx; \ - HYPRE_Int idx_local = idx; \ + HYPRE_Int idx_local = idx; \ hypre_IndexD(local_idx, 0) = idx_local % box.lsize0; \ idx_local = idx_local / box.lsize0; \ hypre_IndexD(local_idx, 1) = idx_local % box.lsize1; \ @@ -1388,7 +1374,6 @@ else \ index[2] = hypre_IndexD(local_idx, 2); - /********************************************************************* * Boxloops *********************************************************************/ @@ -1400,7 +1385,7 @@ else \ hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ BoxLoopforall( [=] (sycl::nd_item<1> item) \ { \ - HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ { \ hypre_newBoxLoopDeclare(databox1); \ @@ -1420,7 +1405,7 @@ else \ hypre_BoxLoopDataDeclareK(2, ndim, loop_size, dbox2, start2, stride2); \ BoxLoopforall( [=] (sycl::nd_item<1> item) \ { \ - HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ { \ hypre_newBoxLoopDeclare(databox1); \ @@ -1443,7 +1428,7 @@ else \ hypre_BoxLoopDataDeclareK(3, ndim,loop_size, dbox3, start3, stride3); \ 
BoxLoopforall( [=] (sycl::nd_item<1> item) \ { \ - HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ { \ hypre_newBoxLoopDeclare(databox1); \ @@ -1469,7 +1454,7 @@ else \ hypre_BoxLoopDataDeclareK(4, ndim, loop_size, dbox4, start4, stride4); \ BoxLoopforall( [=] (sycl::nd_item<1> item) \ { \ - HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ { \ hypre_newBoxLoopDeclare(databox1); \ @@ -1492,7 +1477,7 @@ else \ hypre_BasicBoxLoopDataDeclareK(1, ndim, loop_size, stride1); \ BoxLoopforall( [=] (sycl::nd_item<1> item) \ { \ - HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ { \ hypre_newBoxLoopDeclare(databox1); \ @@ -1506,7 +1491,7 @@ else \ hypre_BasicBoxLoopDataDeclareK(2, ndim, loop_size, stride2); \ BoxLoopforall( [=] (sycl::nd_item<1> item) \ { \ - HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ { \ hypre_newBoxLoopDeclare(databox1); \ @@ -1517,294 +1502,49 @@ else \ /* Reduction BoxLoop1 */ /* WM: todo - is there a better way to handle the passing of sum_var (the variable where we want the reduction to end up)? 
*/ /* Right now, it is hardcoded as a HYPRE_Real */ -#define hypre_newBoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, sum_var) \ -{ \ +#define hypre_newBoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, sum_var) \ +{ \ hypre_newBoxLoopInit(ndim, loop_size); \ hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ - sycl::buffer sum_buf(&sum_var, 1); \ - ReductionBoxLoopforall( [=] (sycl::nd_item<1> item, auto &sum) \ + sycl::buffer sum_buf(&sum_var, 1); \ + ReductionBoxLoopforall( [=] (sycl::nd_item<1> item, auto &sum) \ { \ - HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ { \ hypre_newBoxLoopDeclare(databox1); \ hypre_BoxLoopIncK(1, databox1, i1); -#define hypre_newBoxLoop1ReductionEnd(i1, sum_var) \ +#define hypre_newBoxLoop1ReductionEnd(i1, sum_var) \ } \ - }, hypre__tot, sum_buf); \ + }, hypre__tot, sum_buf); \ } /* Reduction BoxLoop2 */ /* WM: todo - is there a better way to handle the passing of sum_var (the variable where we want the reduction to end up)? 
*/ /* Right now, it is hardcoded as a HYPRE_Real */ -#define hypre_newBoxLoop2ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, \ - dbox2, start2, stride2, i2, sum_var) \ -{ \ - hypre_newBoxLoopInit(ndim, loop_size); \ - hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ - hypre_BoxLoopDataDeclareK(2, ndim, loop_size, dbox2, start2, stride2); \ - sycl::buffer sum_buf(&sum_var, 1); \ - ReductionBoxLoopforall( [=] (sycl::nd_item<1> item, auto &sum) \ +#define hypre_newBoxLoop2ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, sum_var) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ + hypre_BoxLoopDataDeclareK(2, ndim, loop_size, dbox2, start2, stride2); \ + sycl::buffer sum_buf(&sum_var, 1); \ + ReductionBoxLoopforall( [=] (sycl::nd_item<1> item, auto &sum) \ { \ - HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ { \ - hypre_newBoxLoopDeclare(databox1); \ - hypre_BoxLoopIncK(1, databox1, i1); \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1); \ hypre_BoxLoopIncK(2, databox2, i2); -#define hypre_newBoxLoop2ReductionEnd(i1, i2, sum_var) \ +#define hypre_newBoxLoop2ReductionEnd(i1, i2, sum_var) \ } \ - }, hypre__tot, sum_buf); \ -} - - - - - - - -/********************************************************************* - * HOST IMPLEMENTATION - *********************************************************************/ - -#ifdef HYPRE_USING_OPENMP -#define HYPRE_BOX_REDUCTION -#if defined(WIN32) && defined(_MSC_VER) -#define Pragma(x) __pragma(HYPRE_XSTR(x)) -#else -#define Pragma(x) _Pragma(HYPRE_XSTR(x)) -#endif -#define OMP0 Pragma(omp parallel for HYPRE_BOX_REDUCTION HYPRE_SMP_SCHEDULE) -#define OMP1 Pragma(omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_BOX_REDUCTION HYPRE_SMP_SCHEDULE) -#else /* #ifdef 
HYPRE_USING_OPENMP */ -#define OMP0 -#define OMP1 -#endif /* #ifdef HYPRE_USING_OPENMP */ - -#define zypre_newBoxLoop0Begin(ndim, loop_size) \ -{ \ - zypre_BoxLoopDeclare(); \ - zypre_BoxLoopInit(ndim, loop_size); \ - OMP1 \ - for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ - { \ - zypre_BoxLoopSet(); \ - for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ - { \ - for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ - { - -#define zypre_newBoxLoop0End() \ - } \ - zypre_BoxLoopInc1(); \ - zypre_BoxLoopInc2(); \ - } \ - } \ -} - -#define zypre_newBoxLoop1Begin(ndim, loop_size, \ - dbox1, start1, stride1, i1) \ -{ \ - HYPRE_Int i1; \ - zypre_BoxLoopDeclare(); \ - zypre_BoxLoopDeclareK(1); \ - zypre_BoxLoopInit(ndim, loop_size); \ - zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1); \ - OMP1 \ - for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ - { \ - HYPRE_Int i1; \ - zypre_BoxLoopSet(); \ - zypre_BoxLoopSetK(1, i1); \ - for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ - { \ - for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ - { - -#define zypre_newBoxLoop1End(i1) \ - i1 += hypre__i0inc1; \ - } \ - zypre_BoxLoopInc1(); \ - i1 += hypre__ikinc1[hypre__d]; \ - zypre_BoxLoopInc2(); \ - } \ - } \ -} - - -#define zypre_newBoxLoop2Begin(ndim, loop_size, \ - dbox1, start1, stride1, i1, \ - dbox2, start2, stride2, i2) \ -{ \ - HYPRE_Int i1, i2; \ - zypre_BoxLoopDeclare(); \ - zypre_BoxLoopDeclareK(1); \ - zypre_BoxLoopDeclareK(2); \ - zypre_BoxLoopInit(ndim, loop_size); \ - zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1); \ - zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2); \ - OMP1 \ - for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ - { \ - HYPRE_Int i1, i2; \ - zypre_BoxLoopSet(); \ - zypre_BoxLoopSetK(1, i1); \ - zypre_BoxLoopSetK(2, i2); \ - for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ - { \ - for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ - { - -#define 
zypre_newBoxLoop2End(i1, i2) \ - i1 += hypre__i0inc1; \ - i2 += hypre__i0inc2; \ - } \ - zypre_BoxLoopInc1(); \ - i1 += hypre__ikinc1[hypre__d]; \ - i2 += hypre__ikinc2[hypre__d]; \ - zypre_BoxLoopInc2(); \ - } \ - } \ + }, hypre__tot, sum_buf); \ } -#define zypre_newBoxLoop3Begin(ndim, loop_size, \ - dbox1, start1, stride1, i1, \ - dbox2, start2, stride2, i2, \ - dbox3, start3, stride3, i3) \ -{ \ - HYPRE_Int i1, i2, i3; \ - zypre_BoxLoopDeclare(); \ - zypre_BoxLoopDeclareK(1); \ - zypre_BoxLoopDeclareK(2); \ - zypre_BoxLoopDeclareK(3); \ - zypre_BoxLoopInit(ndim, loop_size); \ - zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1); \ - zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2); \ - zypre_BoxLoopInitK(3, dbox3, start3, stride3, i3); \ - OMP1 \ - for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ - { \ - HYPRE_Int i1, i2, i3; \ - zypre_BoxLoopSet(); \ - zypre_BoxLoopSetK(1, i1); \ - zypre_BoxLoopSetK(2, i2); \ - zypre_BoxLoopSetK(3, i3); \ - for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ - { \ - for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ - { - -#define zypre_newBoxLoop3End(i1, i2, i3) \ - i1 += hypre__i0inc1; \ - i2 += hypre__i0inc2; \ - i3 += hypre__i0inc3; \ - } \ - zypre_BoxLoopInc1(); \ - i1 += hypre__ikinc1[hypre__d]; \ - i2 += hypre__ikinc2[hypre__d]; \ - i3 += hypre__ikinc3[hypre__d]; \ - zypre_BoxLoopInc2(); \ - } \ - } \ -} - -#define zypre_newBoxLoop4Begin(ndim, loop_size, \ - dbox1, start1, stride1, i1, \ - dbox2, start2, stride2, i2, \ - dbox3, start3, stride3, i3, \ - dbox4, start4, stride4, i4) \ -{ \ - HYPRE_Int i1, i2, i3, i4; \ - zypre_BoxLoopDeclare(); \ - zypre_BoxLoopDeclareK(1); \ - zypre_BoxLoopDeclareK(2); \ - zypre_BoxLoopDeclareK(3); \ - zypre_BoxLoopDeclareK(4); \ - zypre_BoxLoopInit(ndim, loop_size); \ - zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1); \ - zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2); \ - zypre_BoxLoopInitK(3, dbox3, start3, stride3, i3); \ - zypre_BoxLoopInitK(4, 
dbox4, start4, stride4, i4); \ - OMP1 \ - for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ - { \ - HYPRE_Int i1, i2, i3, i4; \ - zypre_BoxLoopSet(); \ - zypre_BoxLoopSetK(1, i1); \ - zypre_BoxLoopSetK(2, i2); \ - zypre_BoxLoopSetK(3, i3); \ - zypre_BoxLoopSetK(4, i4); \ - for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ - { \ - for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ - { - -#define zypre_newBoxLoop4End(i1, i2, i3, i4) \ - i1 += hypre__i0inc1; \ - i2 += hypre__i0inc2; \ - i3 += hypre__i0inc3; \ - i4 += hypre__i0inc4; \ - } \ - zypre_BoxLoopInc1(); \ - i1 += hypre__ikinc1[hypre__d]; \ - i2 += hypre__ikinc2[hypre__d]; \ - i3 += hypre__ikinc3[hypre__d]; \ - i4 += hypre__ikinc4[hypre__d]; \ - zypre_BoxLoopInc2(); \ - } \ - } \ -} - -#define zypre_newBasicBoxLoop2Begin(ndim, loop_size, \ - stride1, i1, \ - stride2, i2) \ -{ \ - zypre_BoxLoopDeclare(); \ - zypre_BoxLoopDeclareK(1); \ - zypre_BoxLoopDeclareK(2); \ - zypre_BoxLoopInit(ndim, loop_size); \ - zypre_BasicBoxLoopInitK(1, stride1); \ - zypre_BasicBoxLoopInitK(2, stride2); \ - OMP1 \ - for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ - { \ - HYPRE_Int i1, i2; \ - zypre_BoxLoopSet(); \ - zypre_BoxLoopSetK(1, i1); \ - zypre_BoxLoopSetK(2, i2); \ - for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ - { \ - for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ - { - - -#define hypre_LoopBegin(size, idx) \ -{ \ - HYPRE_Int idx; \ - OMP0 \ - for (idx = 0; idx < size; idx ++) \ - { - -#define hypre_LoopEnd() \ - } \ -} - - - - - - - - - - - - - - /********************************************************************* * renamings *********************************************************************/ @@ -1824,7 +1564,6 @@ else \ #define hypre_BasicBoxLoop2Begin hypre_newBasicBoxLoop2Begin /* Reduction */ -/* WM: todo - using CPU version for now */ #define hypre_BoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, reducesum) \ 
hypre_newBoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, reducesum) diff --git a/src/struct_mv/boxloop_sycl.h b/src/struct_mv/boxloop_sycl.h index 1c44ee3e08..311c235567 100644 --- a/src/struct_mv/boxloop_sycl.h +++ b/src/struct_mv/boxloop_sycl.h @@ -195,7 +195,6 @@ else \ } /* Given input 1-D 'idx' in box, get 3-D 'local_idx' in loop_size */ -/* WM: todo - double check that item.get_local_id(0) is actually what you want below */ #define hypre_newBoxLoopDeclare(box) \ hypre_Index local_idx; \ HYPRE_Int idx_local = idx; \ @@ -413,7 +412,6 @@ else \ #define hypre_BasicBoxLoop2Begin hypre_newBasicBoxLoop2Begin /* Reduction */ -/* WM: todo - using CPU version for now */ #define hypre_BoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, reducesum) \ hypre_newBoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, reducesum) diff --git a/src/test/Makefile b/src/test/Makefile index b5910211c6..975e702290 100644 --- a/src/test/Makefile +++ b/src/test/Makefile @@ -139,11 +139,6 @@ ij: ij.o @echo "Building" $@ "... " ${LINK_CC} -o $@ $< ${LFLAGS} -# WM: TODO: remove -simple: simple.obj - @echo "Building" $@ "... " - ${LINK_CC} -o $@ $< ${LFLAGS} - ij_assembly: ij_assembly.o @echo "Building" $@ "... 
" ${LINK_CC} -o $@ $< ${LFLAGS} diff --git a/src/test/TEST_ij/solvers.jobs b/src/test/TEST_ij/solvers.jobs index d2f69b045e..f1c37d82ca 100755 --- a/src/test/TEST_ij/solvers.jobs +++ b/src/test/TEST_ij/solvers.jobs @@ -29,46 +29,45 @@ # 60: DS_FlexGMRES # #============================================================================= -# WM: TODO remove -exec_host -mpirun -np 2 ./ij -exec_host -solver 1 -rhsrand > solvers.out.0 -mpirun -np 2 ./ij -exec_host -solver 2 -rhsrand > solvers.out.1 -mpirun -np 2 ./ij -exec_host -solver 3 -rhsrand > solvers.out.2 -mpirun -np 2 ./ij -exec_host -solver 4 -rhsrand > solvers.out.3 -mpirun -np 2 ./ij -exec_host -solver 5 -rhsrand -w 0.67 -ns 2 > solvers.out.4 -mpirun -np 2 ./ij -exec_host -solver 6 -rhsrand > solvers.out.5 -#mpirun -np 2 ./ij -exec_host -solver 7 -rhsrand > solvers.out.6 -#mpirun -np 2 ./ij -exec_host -solver 8 -rhsrand > solvers.out.7 -mpirun -np 2 ./ij -exec_host -solver 20 -rhsrand > solvers.out.8 -mpirun -np 2 ./ij -exec_host -solver 20 -cf 0.5 -rhsrand > solvers.out.9 -mpirun -np 2 ./ij -exec_host -solver 20 -cf 0.5 -rhsrand -solver_type 2 > solvers.out.10 -mpirun -np 2 ./ij -exec_host -solver 20 -cf 0.5 -rhsrand -solver_type 3 > solvers.out.11 -mpirun -np 2 ./ij -exec_host -solver 16 -rhsrand > solvers.out.12 -mpirun -np 2 ./ij -exec_host -solver 17 -rhsrand > solvers.out.13 -mpirun -np 2 ./ij -exec_host -solver 17 -rhsrand -cgs 2 > solvers.out.14 -mpirun -np 2 ./ij -exec_host -solver 17 -rhsrand -cgs 2 -unroll 8 > solvers.out.15 -mpirun -np 2 ./ij -exec_host -solver 17 -rhsrand -unroll 4 > solvers.out.16 -mpirun -np 2 ./ij -exec_host -solver 3 -rhsrand -check_residual > solvers.out.17 -mpirun -np 2 ./ij -exec_host -solver 4 -rhsrand -check_residual > solvers.out.18 +mpirun -np 2 ./ij -solver 1 -rhsrand > solvers.out.0 +mpirun -np 2 ./ij -solver 2 -rhsrand > solvers.out.1 +mpirun -np 2 ./ij -solver 3 -rhsrand > solvers.out.2 +mpirun -np 2 ./ij -solver 4 -rhsrand > solvers.out.3 +mpirun -np 2 ./ij -solver 5 
-rhsrand -w 0.67 -ns 2 > solvers.out.4 +mpirun -np 2 ./ij -solver 6 -rhsrand > solvers.out.5 +#mpirun -np 2 ./ij -solver 7 -rhsrand > solvers.out.6 +#mpirun -np 2 ./ij -solver 8 -rhsrand > solvers.out.7 +mpirun -np 2 ./ij -solver 20 -rhsrand > solvers.out.8 +mpirun -np 2 ./ij -solver 20 -cf 0.5 -rhsrand > solvers.out.9 +mpirun -np 2 ./ij -solver 20 -cf 0.5 -rhsrand -solver_type 2 > solvers.out.10 +mpirun -np 2 ./ij -solver 20 -cf 0.5 -rhsrand -solver_type 3 > solvers.out.11 +mpirun -np 2 ./ij -solver 16 -rhsrand > solvers.out.12 +mpirun -np 2 ./ij -solver 17 -rhsrand > solvers.out.13 +mpirun -np 2 ./ij -solver 17 -rhsrand -cgs 2 > solvers.out.14 +mpirun -np 2 ./ij -solver 17 -rhsrand -cgs 2 -unroll 8 > solvers.out.15 +mpirun -np 2 ./ij -solver 17 -rhsrand -unroll 4 > solvers.out.16 +mpirun -np 2 ./ij -solver 3 -rhsrand -check_residual > solvers.out.17 +mpirun -np 2 ./ij -solver 4 -rhsrand -check_residual > solvers.out.18 #systems AMG run ...unknown approach, hybrid approach, nodal approach -mpirun -np 2 ./ij -exec_host -n 20 20 20 -sysL 2 -nf 2 > solvers.out.sysu -mpirun -np 2 ./ij -exec_host -n 20 20 20 -sysL 2 -nf 2 -nodal 1 -smtype 6 -smlv 10 -dom 1 -ov 0 > solvers.out.sysh -mpirun -np 2 ./ij -exec_host -n 20 20 20 -sysL 2 -nf 2 -interptype 10 -Pmx 6 > solvers.out.sysn +mpirun -np 2 ./ij -n 20 20 20 -sysL 2 -nf 2 > solvers.out.sysu +mpirun -np 2 ./ij -n 20 20 20 -sysL 2 -nf 2 -nodal 1 -smtype 6 -smlv 10 -dom 1 -ov 0 > solvers.out.sysh +mpirun -np 2 ./ij -n 20 20 20 -sysL 2 -nf 2 -interptype 10 -Pmx 6 > solvers.out.sysn #LGMRS and FlexGMRES -mpirun -np 2 ./ij -exec_host -solver 50 -rhsrand > solvers.out.101 -mpirun -np 2 ./ij -exec_host -solver 51 -rhsrand > solvers.out.102 -mpirun -np 2 ./ij -exec_host -solver 60 -rhsrand > solvers.out.103 -mpirun -np 2 ./ij -exec_host -solver 61 -rhsrand > solvers.out.104 +mpirun -np 2 ./ij -solver 50 -rhsrand > solvers.out.101 +mpirun -np 2 ./ij -solver 51 -rhsrand > solvers.out.102 +mpirun -np 2 ./ij -solver 60 -rhsrand > 
solvers.out.103 +mpirun -np 2 ./ij -solver 61 -rhsrand > solvers.out.104 #agglomerated coarse grid solve -mpirun -np 8 ./ij -exec_host -n 80 80 80 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 > solvers.out.105 -mpirun -np 8 ./ij -exec_host -n 40 40 40 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -sysL 3 -nf 3 > solvers.out.107 +mpirun -np 8 ./ij -n 80 80 80 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 > solvers.out.105 +mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -sysL 3 -nf 3 > solvers.out.107 #redundant coarse grid solve -mpirun -np 8 ./ij -exec_host -n 80 80 80 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -red 1 > solvers.out.106 -mpirun -np 8 ./ij -exec_host -n 40 40 40 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -sysL 3 -nf 3 -red 1 > solvers.out.108 +mpirun -np 8 ./ij -n 80 80 80 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -red 1 > solvers.out.106 +mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -sysL 3 -nf 3 -red 1 > solvers.out.108 #additive cycles mpirun -np 2 ./ij -n 20 20 20 -P 2 1 1 -agg_nl 1 -solver 1 -CF 0 -rlx 0 -w 0.7 -rlx_coarse 0 -ns_coarse 2 > solvers.out.109 @@ -83,12 +82,12 @@ mpirun -np 8 ./ij -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -mult_add 0 -ns 2 -ra mpirun -np 8 ./ij -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -rlx 18 -ns 2 -rlx_coarse 18 -ns_coarse 2 > solvers.out.120 #nonGalerkin version -mpirun -np 8 ./ij -exec_host -n 40 40 40 -P 2 2 2 -nongalerk_tol 1 0.03 > solvers.out.114 -mpirun -np 8 ./ij -exec_host -n 40 40 40 -P 2 2 2 -nongalerk_tol 3 0.0 0.01 0.05 > solvers.out.115 +mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -nongalerk_tol 1 0.03 > solvers.out.114 +mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -nongalerk_tol 3 0.0 0.01 0.05 > solvers.out.115 #RAP options -mpirun -np 8 ./ij -exec_host -n 40 40 40 -P 2 2 2 -solver 3 -rap 0 > solvers.out.116 -mpirun -np 8 ./ij -exec_host -n 40 40 40 -P 2 2 2 -solver 3 -rap 1 > solvers.out.117 +mpirun -np 8 ./ij -n 40 40 40 -P 
2 2 2 -solver 3 -rap 0 > solvers.out.116 +mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -solver 3 -rap 1 > solvers.out.117 # # MGR and MGR-PCG @@ -96,26 +95,26 @@ mpirun -np 8 ./ij -exec_host -n 40 40 40 -P 2 2 2 -solver 3 -rap 1 > solvers.out # coarse grid solver checks (1-level MGR == AMG (or coarse grid solver)) # Also checks for keeping coarse nodes to coarsest level # coarse grid size in output should be ~ mgr_num_reserved_nodes -mpirun -np 2 ./ij -exec_host -solver 70 -mgr_nlevels 0 -mgr_bsize 2 -mgr_num_reserved_nodes 0 > solvers.out.200 -mpirun -np 2 ./ij -exec_host -solver 70 -mgr_nlevels 0 -mgr_bsize 2 -mgr_num_reserved_nodes 100 > solvers.out.201 -mpirun -np 2 ./ij -exec_host -solver 70 -mgr_nlevels 1 -mgr_bsize 1 -mgr_num_reserved_nodes 0 > solvers.out.202 -mpirun -np 2 ./ij -exec_host -solver 70 -mgr_nlevels 1 -mgr_bsize 1 -mgr_num_reserved_nodes 100 > solvers.out.203 +mpirun -np 2 ./ij -solver 70 -mgr_nlevels 0 -mgr_bsize 2 -mgr_num_reserved_nodes 0 > solvers.out.200 +mpirun -np 2 ./ij -solver 70 -mgr_nlevels 0 -mgr_bsize 2 -mgr_num_reserved_nodes 100 > solvers.out.201 +mpirun -np 2 ./ij -solver 70 -mgr_nlevels 1 -mgr_bsize 1 -mgr_num_reserved_nodes 0 > solvers.out.202 +mpirun -np 2 ./ij -solver 70 -mgr_nlevels 1 -mgr_bsize 1 -mgr_num_reserved_nodes 100 > solvers.out.203 # multi level MGR tests with different coarse grid type strategies # Fix non C points to F points with different F-relaxation methods (single/multilevel F-relaxation) # with/ without reserved coarse nodes -mpirun -np 2 ./ij -exec_host -solver 70 -mgr_nlevels 1 -mgr_bsize 2 -mgr_non_c_to_f 1 -mgr_frelax_method 0 -mgr_num_reserved_nodes 0 > solvers.out.204 -mpirun -np 2 ./ij -exec_host -solver 70 -mgr_nlevels 1 -mgr_bsize 2 -mgr_non_c_to_f 1 -mgr_frelax_method 0 -mgr_num_reserved_nodes 100 > solvers.out.205 -mpirun -np 2 ./ij -exec_host -solver 70 -mgr_nlevels 1 -mgr_bsize 2 -mgr_non_c_to_f 1 -mgr_frelax_method 1 -mgr_num_reserved_nodes 0 > solvers.out.206 -mpirun -np 2 ./ij -exec_host -solver 
70 -mgr_nlevels 1 -mgr_bsize 2 -mgr_non_c_to_f 1 -mgr_frelax_method 1 -mgr_num_reserved_nodes 100 > solvers.out.207 +mpirun -np 2 ./ij -solver 70 -mgr_nlevels 1 -mgr_bsize 2 -mgr_non_c_to_f 1 -mgr_frelax_method 0 -mgr_num_reserved_nodes 0 > solvers.out.204 +mpirun -np 2 ./ij -solver 70 -mgr_nlevels 1 -mgr_bsize 2 -mgr_non_c_to_f 1 -mgr_frelax_method 0 -mgr_num_reserved_nodes 100 > solvers.out.205 +mpirun -np 2 ./ij -solver 70 -mgr_nlevels 1 -mgr_bsize 2 -mgr_non_c_to_f 1 -mgr_frelax_method 1 -mgr_num_reserved_nodes 0 > solvers.out.206 +mpirun -np 2 ./ij -solver 70 -mgr_nlevels 1 -mgr_bsize 2 -mgr_non_c_to_f 1 -mgr_frelax_method 1 -mgr_num_reserved_nodes 100 > solvers.out.207 # Not fixed non C points to F points with different F-relaxation methods (single/multilevel F-relaxation) # with/ without reserved coarse nodes -mpirun -np 2 ./ij -exec_host -solver 70 -mgr_nlevels 5 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_frelax_method 0 -mgr_num_reserved_nodes 0 > solvers.out.208 -mpirun -np 2 ./ij -exec_host -solver 70 -mgr_nlevels 5 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_frelax_method 0 -mgr_num_reserved_nodes 100 > solvers.out.209 -mpirun -np 2 ./ij -exec_host -solver 70 -mgr_nlevels 5 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_frelax_method 1 -mgr_num_reserved_nodes 0 > solvers.out.210 -mpirun -np 2 ./ij -exec_host -solver 70 -mgr_nlevels 5 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_frelax_method 1 -mgr_num_reserved_nodes 100 > solvers.out.211 +mpirun -np 2 ./ij -solver 70 -mgr_nlevels 5 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_frelax_method 0 -mgr_num_reserved_nodes 0 > solvers.out.208 +mpirun -np 2 ./ij -solver 70 -mgr_nlevels 5 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_frelax_method 0 -mgr_num_reserved_nodes 100 > solvers.out.209 +mpirun -np 2 ./ij -solver 70 -mgr_nlevels 5 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_frelax_method 1 -mgr_num_reserved_nodes 0 > solvers.out.210 +mpirun -np 2 ./ij -solver 70 -mgr_nlevels 5 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_frelax_method 1 -mgr_num_reserved_nodes 100 > 
solvers.out.211 # MGR-PCG tests -mpirun -np 2 ./ij -exec_host -solver 71 -mgr_nlevels 0 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_frelax_method 0 -mgr_num_reserved_nodes 0 > solvers.out.212 -mpirun -np 2 ./ij -exec_host -solver 71 -mgr_nlevels 1 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_frelax_method 0 -mgr_num_reserved_nodes 0 > solvers.out.213 +mpirun -np 2 ./ij -solver 71 -mgr_nlevels 0 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_frelax_method 0 -mgr_num_reserved_nodes 0 > solvers.out.212 +mpirun -np 2 ./ij -solver 71 -mgr_nlevels 1 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_frelax_method 0 -mgr_num_reserved_nodes 0 > solvers.out.213 # # hypre_ILU tests @@ -124,39 +123,39 @@ mpirun -np 2 ./ij -exec_host -solver 71 -mgr_nlevels 1 -mgr_bsize 2 -mgr_non_c_t # Tests ILU-(Flex)GMRES # Test AMG with ILU as a complex smoother # -mpirun -np 1 ./ij -exec_host -solver 80 -ilu_type 0 -ilu_lfil 0 > solvers.out.300 -mpirun -np 1 ./ij -exec_host -solver 80 -ilu_type 0 -ilu_lfil 1 > solvers.out.301 -mpirun -np 1 ./ij -exec_host -solver 80 -ilu_type 1 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 > solvers.out.302 +mpirun -np 1 ./ij -solver 80 -ilu_type 0 -ilu_lfil 0 > solvers.out.300 +mpirun -np 1 ./ij -solver 80 -ilu_type 0 -ilu_lfil 1 > solvers.out.301 +mpirun -np 1 ./ij -solver 80 -ilu_type 1 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 > solvers.out.302 # parallel ILU # BJ -mpirun -np 2 ./ij -exec_host -solver 80 -ilu_type 0 -ilu_lfil 1 > solvers.out.303 -mpirun -np 2 ./ij -exec_host -solver 80 -ilu_type 1 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 > solvers.out.304 +mpirun -np 2 ./ij -solver 80 -ilu_type 0 -ilu_lfil 1 > solvers.out.303 +mpirun -np 2 ./ij -solver 80 -ilu_type 1 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 > solvers.out.304 # GMRES+ILU -mpirun -np 2 ./ij -exec_host -solver 80 -ilu_type 10 -ilu_lfil 1 -ilu_schur_max_iter 5 > solvers.out.305 -mpirun -np 2 ./ij -exec_host -solver 80 -ilu_type 11 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.306 +mpirun -np 2 ./ij 
-solver 80 -ilu_type 10 -ilu_lfil 1 -ilu_schur_max_iter 5 > solvers.out.305 +mpirun -np 2 ./ij -solver 80 -ilu_type 11 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.306 # NSH+ILU -mpirun -np 2 ./ij -exec_host -solver 80 -ilu_type 20 -ilu_lfil 1 -ilu_schur_max_iter 5 > solvers.out.307 -mpirun -np 2 ./ij -exec_host -solver 80 -ilu_type 21 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.308 +mpirun -np 2 ./ij -solver 80 -ilu_type 20 -ilu_lfil 1 -ilu_schur_max_iter 5 > solvers.out.307 +mpirun -np 2 ./ij -solver 80 -ilu_type 21 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.308 # RAS+ILU -mpirun -np 2 ./ij -exec_host -solver 80 -ilu_type 30 -ilu_lfil 1 > solvers.out.309 -mpirun -np 2 ./ij -exec_host -solver 80 -ilu_type 31 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 > solvers.out.310 +mpirun -np 2 ./ij -solver 80 -ilu_type 30 -ilu_lfil 1 > solvers.out.309 +mpirun -np 2 ./ij -solver 80 -ilu_type 31 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 > solvers.out.310 # ddPQ-GMRES+ILU -mpirun -np 2 ./ij -exec_host -solver 80 -ilu_type 40 -ilu_lfil 1 -ilu_schur_max_iter 5 > solvers.out.311 -mpirun -np 2 ./ij -exec_host -solver 80 -ilu_type 41 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.312 +mpirun -np 2 ./ij -solver 80 -ilu_type 40 -ilu_lfil 1 -ilu_schur_max_iter 5 > solvers.out.311 +mpirun -np 2 ./ij -solver 80 -ilu_type 41 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.312 ## ILU-GMRES -mpirun -np 2 ./ij -exec_host -solver 81 -ilu_type 0 -ilu_lfil 0 > solvers.out.313 -mpirun -np 2 ./ij -exec_host -solver 81 -ilu_type 1 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 > solvers.out.314 -mpirun -np 2 ./ij -exec_host -solver 81 -ilu_type 30 -ilu_lfil 0 > solvers.out.315 -mpirun -np 2 ./ij -exec_host -solver 81 -ilu_type 31 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 > solvers.out.316 +mpirun -np 2 ./ij -solver 81 -ilu_type 0 -ilu_lfil 0 > 
solvers.out.313 +mpirun -np 2 ./ij -solver 81 -ilu_type 1 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 > solvers.out.314 +mpirun -np 2 ./ij -solver 81 -ilu_type 30 -ilu_lfil 0 > solvers.out.315 +mpirun -np 2 ./ij -solver 81 -ilu_type 31 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 > solvers.out.316 ## ILU-FlexGMRES -mpirun -np 2 ./ij -exec_host -solver 82 -ilu_type 10 -ilu_lfil 0 -ilu_schur_max_iter 5 > solvers.out.317 -mpirun -np 2 ./ij -exec_host -solver 82 -ilu_type 11 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.318 -mpirun -np 2 ./ij -exec_host -solver 82 -ilu_type 20 -ilu_lfil 0 -ilu_schur_max_iter 5 > solvers.out.319 -mpirun -np 2 ./ij -exec_host -solver 82 -ilu_type 21 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.320 -mpirun -np 2 ./ij -exec_host -solver 82 -ilu_type 40 -ilu_lfil 0 -ilu_schur_max_iter 5 > solvers.out.321 -mpirun -np 2 ./ij -exec_host -solver 82 -ilu_type 41 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.322 +mpirun -np 2 ./ij -solver 82 -ilu_type 10 -ilu_lfil 0 -ilu_schur_max_iter 5 > solvers.out.317 +mpirun -np 2 ./ij -solver 82 -ilu_type 11 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.318 +mpirun -np 2 ./ij -solver 82 -ilu_type 20 -ilu_lfil 0 -ilu_schur_max_iter 5 > solvers.out.319 +mpirun -np 2 ./ij -solver 82 -ilu_type 21 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.320 +mpirun -np 2 ./ij -solver 82 -ilu_type 40 -ilu_lfil 0 -ilu_schur_max_iter 5 > solvers.out.321 +mpirun -np 2 ./ij -solver 82 -ilu_type 41 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.322 ## RAP-ILU -mpirun -np 2 ./ij -exec_host -solver 82 -ilu_type 50 -ilu_lfil 0 > solvers.out.323 +mpirun -np 2 ./ij -solver 82 -ilu_type 50 -ilu_lfil 0 > solvers.out.323 ## ILU smoother for AMG mpirun -np 2 ./ij -solver 0 -smtype 5 -smlv 1 -ilu_type 30 > solvers.out.324 mpirun -np 2 ./ij -solver 0 
-smtype 15 -smlv 1 -ilu_type 30 > solvers.out.325 diff --git a/src/test/simple.c b/src/test/simple.c deleted file mode 100644 index ff5e40b103..0000000000 --- a/src/test/simple.c +++ /dev/null @@ -1,642 +0,0 @@ -/* WM: todo - remove this file from git */ - -#include "HYPRE.h" -#include "_hypre_struct_mv.h" -#include "_hypre_struct_mv.hpp" - -HYPRE_Int AddValuesVector( hypre_StructGrid *gridvector, - hypre_StructVector *zvector, - HYPRE_Int *period, - HYPRE_Real value ) ; - - - - -HYPRE_Int -cpu_hypre_StructVectorSetConstantValues( hypre_StructVector *vector, - HYPRE_Complex values ) -{ - hypre_Box *v_data_box; - - HYPRE_Complex *vp; - - hypre_BoxArray *boxes; - hypre_Box *box; - hypre_Index loop_size; - hypre_IndexRef start; - hypre_Index unit_stride; - - HYPRE_Int i; - - /*----------------------------------------------------------------------- - * Set the vector coefficients - *-----------------------------------------------------------------------*/ - - hypre_SetIndex(unit_stride, 1); - - boxes = hypre_StructGridBoxes(hypre_StructVectorGrid(vector)); - hypre_ForBoxI(i, boxes) - { - box = hypre_BoxArrayBox(boxes, i); - start = hypre_BoxIMin(box); - - v_data_box = - hypre_BoxArrayBox(hypre_StructVectorDataSpace(vector), i); - vp = hypre_StructVectorBoxData(vector, i); - - hypre_BoxGetSize(box, loop_size); - -#define DEVICE_VAR is_device_ptr(vp) - zypre_newBoxLoop1Begin(hypre_StructVectorNDim(vector), loop_size, - v_data_box, start, unit_stride, vi); - { - vp[vi] = values; - } - zypre_newBoxLoop1End(vi); -#undef DEVICE_VAR - } - - return hypre_error_flag; -} - -HYPRE_Int -my_hypre_StructAxpy( HYPRE_Complex alpha, - hypre_StructVector *x, - hypre_StructVector *y ) -{ - hypre_Box *x_data_box; - hypre_Box *y_data_box; - - HYPRE_Complex *xp; - HYPRE_Complex *yp; - - hypre_BoxArray *boxes; - hypre_Box *box; - hypre_Index loop_size; - hypre_IndexRef start; - hypre_Index unit_stride; - - HYPRE_Int i; - - hypre_SetIndex(unit_stride, 1); - - boxes = 
hypre_StructGridBoxes(hypre_StructVectorGrid(y)); - hypre_ForBoxI(i, boxes) - { - box = hypre_BoxArrayBox(boxes, i); - start = hypre_BoxIMin(box); - - x_data_box = hypre_BoxArrayBox(hypre_StructVectorDataSpace(x), i); - y_data_box = hypre_BoxArrayBox(hypre_StructVectorDataSpace(y), i); - - xp = hypre_StructVectorBoxData(x, i); - yp = hypre_StructVectorBoxData(y, i); - - hypre_BoxGetSize(box, loop_size); - -#define DEVICE_VAR is_device_ptr(yp,xp) - /* WM: todo */ - /* my_hypre_BoxLoop2Begin(hypre_StructVectorNDim(x), loop_size, */ - /* x_data_box, start, unit_stride, xi, */ - /* y_data_box, start, unit_stride, yi); */ - /* { */ - /* yp[yi] += alpha * xp[xi]; */ - /* } */ - /* my_hypre_BoxLoop2End(xi, yi); */ -#undef DEVICE_VAR - } - - return hypre_error_flag; -} - - -/**************************** - * show device function copied from oneAPI examples - ****************************/ -#include -#include "dpc_common.hpp" - -void ShowDevice(sycl::queue &q) { - using namespace std; - using namespace sycl; - // Output platform and device information. 
- auto device = q.get_device(); - auto p_name = device.get_platform().get_info(); - cout << std::setw(20) << "Platform Name: " << p_name << "\n"; - auto p_version = device.get_platform().get_info(); - cout << std::setw(20) << "Platform Version: " << p_version << "\n"; - auto d_name = device.get_info(); - cout << std::setw(20) << "Device Name: " << d_name << "\n"; - auto max_work_group = device.get_info(); - cout << std::setw(20) << "Max Work Group: " << max_work_group << "\n"; - auto max_compute_units = device.get_info(); - cout << std::setw(20) << "Max Compute Units: " << max_compute_units << "\n\n"; -} - -/**************************** - * main - ****************************/ - -hypre_int -main( hypre_int argc, - char *argv[] ) -{ - /* hypre_MPI_Init(&argc, &argv); */ - /* HYPRE_Init(); */ - /* ShowDevice(*hypre_HandleComputeStream(hypre_handle())); */ - - - /* return 0; */ - -/******************************************************************************/ -/******************************************************************************/ - - /* Get device */ - /* sycl::device syclDev = sycl::device(sycl::default_selector{}); */ - - /* /1* Get asynchandler *1/ */ - /* auto sycl_asynchandler = [] (sycl::exception_list exceptions) */ - /* { */ - /* for (std::exception_ptr const& e : exceptions) */ - /* { */ - /* try */ - /* { */ - /* std::rethrow_exception(e); */ - /* } */ - /* catch (sycl::exception const& ex) */ - /* { */ - /* std::cout << "Caught asynchronous SYCL exception:" << std::endl */ - /* << ex.what() << ", OpenCL code: " << ex.get_cl_code() << std::endl; */ - /* } */ - /* } */ - /* }; */ - - /* /1* Setup sycl context *1/ */ - /* sycl::context syclctxt = sycl::context(syclDev, sycl_asynchandler); */ - - /* /1* Setup queue *1/ */ - /* sycl::queue *my_queue = new sycl::queue(syclctxt, syclDev, sycl::property_list{sycl::property::queue::in_order{}}); */ - - /* /1* Show device associated with queue *1/ */ - /* ShowDevice(*my_queue); */ - - /* return 0; */ - - - 
-/******************************************************************************/ -/******************************************************************************/ - - int length = 1024; - - sycl::default_selector selector; - sycl::queue myq(selector); - std::cout<<"Running on: "<()<<"\n"; - - auto A = sycl::malloc_shared(length, myq); - - auto gr = sycl::range<1>(length); - auto lr = sycl::range<1>(32); //change me, too small? - - - for(int i=0;i(i+1); //initialize - - //MAKE SURE I"M HOST & DEVICE ACCESSIBLE! - auto fsum = sycl::malloc_shared(1, myq); - - { - myq.submit( [&](auto &h) { - /* auto properties = sycl::property::reduction::initialize_to_identity{}; */ - h.parallel_for(sycl::nd_range<1>(gr,lr), - sycl::ONEAPI::reduction(fsum, std::plus<>()), - [=](sycl::nd_item<1> it, auto &sum){ - int i = it.get_global_id(0); - sum += A[i]; - }); - }).wait_and_throw(); - } - - printf("sum: %f\n",fsum[0]); - return 0; - - -/******************************************************************************/ -/******************************************************************************/ - - - - - /* initialize */ - /* hypre_MPI_Init(&argc, &argv); */ - /* HYPRE_Init(); */ - /* /1* ShowDevice(*hypre_HandleComputeStream(hypre_handle())); *1/ */ - - /* HYPRE_Int length = 1000; */ - /* const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); */ - /* const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); */ - /* HYPRE_Real *arr = hypre_CTAlloc(HYPRE_Real, length, HYPRE_MEMORY_DEVICE); */ - /* HYPRE_Real sum_var = 0; */ - /* /1* sycl::buffer sum_buf(&sum_var, 1); *1/ */ - /* sycl::buffer sum_buf{&sum_var, 1}; */ - - /* /1* Reduction parallel_for with accessor *1/ */ - /* std::cout << "Launching parallel_for reduction with accessor" << std::endl; */ - /* hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) */ - /* { */ - /* sycl::accessor sum_acc(sum_buf, cgh, sycl::read_write); */ - /* /1* auto sumReduction = 
sycl::reduction(sum_buf, cgh, sycl::plus<>()); *1/ */ - - /* /1* WM: NOTE - on JLSE, ONEAPI is marked as deprecated to be replaced by ext::oneapi *1/ */ - /* cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), sycl::ONEAPI::reduction(sum_acc, sycl::ONEAPI::plus<>()), */ - /* /1* cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), sumReduction, *1/ */ - /* [=] (sycl::nd_item<1> item, auto &sum) */ - /* { */ - /* /1* trivial kernel *1/ */ - /* }); */ - /* }).wait_and_throw(); */ - - - - -/* HYPRE_Real *sum_var_usm = hypre_CTAlloc(HYPRE_Real, 1, HYPRE_MEMORY_DEVICE); */ - -/* /1* Reduction parallel_for with unified memory pointer *1/ */ -/* std::cout << "Launching parallel_for reduction with unified memory pointer" << std::endl; */ -/* hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) */ -/* { */ -/* cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), sycl::ONEAPI::reduction(sum_var_usm, sycl::ONEAPI::plus<>()), */ -/* [=] (sycl::nd_item<1> item, auto &sum) */ -/* { */ -/* /1* trivial kernel *1/ */ -/* }); */ -/* }).wait_and_throw(); */ - - - - - - /* sycl::queue my_queue(sycl::default_selector{}, dpc_common::exception_handler); */ - /* ShowDevice(my_queue); */ - - /* sycl::device gpu = sycl::device(sycl::cpu_selector{}); */ - /* sycl::device dev; */ - /* hypre_printf("is_host = %d\n", gpu.is_host()); */ - /* hypre_printf("is_cpu = %d\n", gpu.is_cpu()); */ - /* hypre_printf("is_cpu = %d\n", dev.is_cpu()); */ - /* hypre_printf("is_gpu = %d\n", gpu.is_gpu()); */ - /* hypre_printf("DONE\n"); */ - /* exit(0); */ - -/******************************************************************************/ -/******************************************************************************/ - - - /* variables */ - HYPRE_Int i, ix, iy, iz, ib; - HYPRE_Int p, q, r; - HYPRE_Int nx, ny, nz; - HYPRE_Int bx, by, bz; - HYPRE_Int nblocks; - HYPRE_Int dim; - HYPRE_Int sym; - HYPRE_Int **offsets; - HYPRE_Int **iupper; - HYPRE_Int **ilower; - HYPRE_Int periodic[3]; - 
HYPRE_Int istart[3]; - HYPRE_StructGrid grid; - HYPRE_StructVector b; - HYPRE_StructVector x; - HYPRE_Int num_ghost[6] = {0, 0, 0, 0, 0, 0}; - - dim = 3; - sym = 1; - nx = 10; - ny = 10; - nz = 10; - bx = 1; - by = 1; - bz = 1; - p = 1; - q = 1; - r = 1; - periodic[0] = 0; - periodic[1] = 0; - periodic[2] = 0; - istart[0] = -3; - istart[1] = -3; - istart[2] = -3; - - for (i = 0; i < 2*dim; i++) - { - num_ghost[i] = 1; - } - - switch (dim) - { - case 1: - nblocks = bx; - if(sym) - { - offsets = hypre_CTAlloc(HYPRE_Int*, 2, HYPRE_MEMORY_HOST); - offsets[0] = hypre_CTAlloc(HYPRE_Int, 1, HYPRE_MEMORY_HOST); - offsets[0][0] = -1; - offsets[1] = hypre_CTAlloc(HYPRE_Int, 1, HYPRE_MEMORY_HOST); - offsets[1][0] = 0; - } - else - { - offsets = hypre_CTAlloc(HYPRE_Int*, 3, HYPRE_MEMORY_HOST); - offsets[0] = hypre_CTAlloc(HYPRE_Int, 1, HYPRE_MEMORY_HOST); - offsets[0][0] = -1; - offsets[1] = hypre_CTAlloc(HYPRE_Int, 1, HYPRE_MEMORY_HOST); - offsets[1][0] = 0; - offsets[2] = hypre_CTAlloc(HYPRE_Int, 1, HYPRE_MEMORY_HOST); - offsets[2][0] = 1; - } - break; - - case 2: - nblocks = bx*by; - if(sym) - { - offsets = hypre_CTAlloc(HYPRE_Int*, 3, HYPRE_MEMORY_HOST); - offsets[0] = hypre_CTAlloc(HYPRE_Int, 2, HYPRE_MEMORY_HOST); - offsets[0][0] = -1; - offsets[0][1] = 0; - offsets[1] = hypre_CTAlloc(HYPRE_Int, 2, HYPRE_MEMORY_HOST); - offsets[1][0] = 0; - offsets[1][1] = -1; - offsets[2] = hypre_CTAlloc(HYPRE_Int, 2, HYPRE_MEMORY_HOST); - offsets[2][0] = 0; - offsets[2][1] = 0; - } - else - { - offsets = hypre_CTAlloc(HYPRE_Int*, 5, HYPRE_MEMORY_HOST); - offsets[0] = hypre_CTAlloc(HYPRE_Int, 2, HYPRE_MEMORY_HOST); - offsets[0][0] = -1; - offsets[0][1] = 0; - offsets[1] = hypre_CTAlloc(HYPRE_Int, 2, HYPRE_MEMORY_HOST); - offsets[1][0] = 0; - offsets[1][1] = -1; - offsets[2] = hypre_CTAlloc(HYPRE_Int, 2, HYPRE_MEMORY_HOST); - offsets[2][0] = 0; - offsets[2][1] = 0; - offsets[3] = hypre_CTAlloc(HYPRE_Int, 2, HYPRE_MEMORY_HOST); - offsets[3][0] = 1; - offsets[3][1] = 0; - offsets[4] = 
hypre_CTAlloc(HYPRE_Int, 2, HYPRE_MEMORY_HOST); - offsets[4][0] = 0; - offsets[4][1] = 1; - } - break; - - case 3: - nblocks = bx*by*bz; - if(sym) - { - offsets = hypre_CTAlloc(HYPRE_Int*, 4, HYPRE_MEMORY_HOST); - offsets[0] = hypre_CTAlloc(HYPRE_Int, 3, HYPRE_MEMORY_HOST); - offsets[0][0] = -1; - offsets[0][1] = 0; - offsets[0][2] = 0; - offsets[1] = hypre_CTAlloc(HYPRE_Int, 3, HYPRE_MEMORY_HOST); - offsets[1][0] = 0; - offsets[1][1] = -1; - offsets[1][2] = 0; - offsets[2] = hypre_CTAlloc(HYPRE_Int, 3, HYPRE_MEMORY_HOST); - offsets[2][0] = 0; - offsets[2][1] = 0; - offsets[2][2] = -1; - offsets[3] = hypre_CTAlloc(HYPRE_Int, 3, HYPRE_MEMORY_HOST); - offsets[3][0] = 0; - offsets[3][1] = 0; - offsets[3][2] = 0; - } - else - { - offsets = hypre_CTAlloc(HYPRE_Int*, 7, HYPRE_MEMORY_HOST); - offsets[0] = hypre_CTAlloc(HYPRE_Int, 3, HYPRE_MEMORY_HOST); - offsets[0][0] = -1; - offsets[0][1] = 0; - offsets[0][2] = 0; - offsets[1] = hypre_CTAlloc(HYPRE_Int, 3, HYPRE_MEMORY_HOST); - offsets[1][0] = 0; - offsets[1][1] = -1; - offsets[1][2] = 0; - offsets[2] = hypre_CTAlloc(HYPRE_Int, 3, HYPRE_MEMORY_HOST); - offsets[2][0] = 0; - offsets[2][1] = 0; - offsets[2][2] = -1; - offsets[3] = hypre_CTAlloc(HYPRE_Int, 3, HYPRE_MEMORY_HOST); - offsets[3][0] = 0; - offsets[3][1] = 0; - offsets[3][2] = 0; - offsets[4] = hypre_CTAlloc(HYPRE_Int, 3, HYPRE_MEMORY_HOST); - offsets[4][0] = 1; - offsets[4][1] = 0; - offsets[4][2] = 0; - offsets[5] = hypre_CTAlloc(HYPRE_Int, 3, HYPRE_MEMORY_HOST); - offsets[5][0] = 0; - offsets[5][1] = 1; - offsets[5][2] = 0; - offsets[6] = hypre_CTAlloc(HYPRE_Int, 3, HYPRE_MEMORY_HOST); - offsets[6][0] = 0; - offsets[6][1] = 0; - offsets[6][2] = 1; - } - break; - } - - - - /* initialize */ - hypre_MPI_Init(&argc, &argv); - HYPRE_Init(); - - /* prepare space for the extents */ - ilower = hypre_CTAlloc(HYPRE_Int*, nblocks, HYPRE_MEMORY_HOST); - iupper = hypre_CTAlloc(HYPRE_Int*, nblocks, HYPRE_MEMORY_HOST); - for (i = 0; i < nblocks; i++) - { - ilower[i] = 
hypre_CTAlloc(HYPRE_Int, dim, HYPRE_MEMORY_HOST); - iupper[i] = hypre_CTAlloc(HYPRE_Int, dim, HYPRE_MEMORY_HOST); - } - - /* compute ilower and iupper from (p,q,r), (bx,by,bz), and (nx,ny,nz) */ - ib = 0; - switch (dim) - { - case 1: - for (ix = 0; ix < bx; ix++) - { - ilower[ib][0] = istart[0]+ nx*(bx*p+ix); - iupper[ib][0] = istart[0]+ nx*(bx*p+ix+1) - 1; - ib++; - } - break; - case 2: - for (iy = 0; iy < by; iy++) - for (ix = 0; ix < bx; ix++) - { - ilower[ib][0] = istart[0]+ nx*(bx*p+ix); - iupper[ib][0] = istart[0]+ nx*(bx*p+ix+1) - 1; - ilower[ib][1] = istart[1]+ ny*(by*q+iy); - iupper[ib][1] = istart[1]+ ny*(by*q+iy+1) - 1; - ib++; - } - break; - case 3: - for (iz = 0; iz < bz; iz++) - for (iy = 0; iy < by; iy++) - for (ix = 0; ix < bx; ix++) - { - ilower[ib][0] = istart[0]+ nx*(bx*p+ix); - iupper[ib][0] = istart[0]+ nx*(bx*p+ix+1) - 1; - ilower[ib][1] = istart[1]+ ny*(by*q+iy); - iupper[ib][1] = istart[1]+ ny*(by*q+iy+1) - 1; - ilower[ib][2] = istart[2]+ nz*(bz*r+iz); - iupper[ib][2] = istart[2]+ nz*(bz*r+iz+1) - 1; - ib++; - } - break; - } - /* create grid */ - HYPRE_StructGridCreate(hypre_MPI_COMM_WORLD, dim, &grid); - for (ib = 0; ib < nblocks; ib++) - { - /* Add to the grid a new box defined by ilower[ib], iupper[ib]...*/ - HYPRE_StructGridSetExtents(grid, ilower[ib], iupper[ib]); - } - HYPRE_StructGridSetPeriodic(grid, periodic); - HYPRE_StructGridSetNumGhost(grid, num_ghost); - HYPRE_StructGridAssemble(grid); - - /* create struct vectors */ - HYPRE_StructVectorCreate(hypre_MPI_COMM_WORLD, grid, &b); - HYPRE_StructVectorInitialize(b); - AddValuesVector(grid,b,periodic,1.0); - HYPRE_StructVectorAssemble(b); - - HYPRE_StructVectorCreate(hypre_MPI_COMM_WORLD, grid, &x); - HYPRE_StructVectorInitialize(x); - AddValuesVector(grid,x,periodic,1.0); - HYPRE_StructVectorAssemble(x); - - hypre_StructVector *y = hypre_StructVectorClone(x); - hypre_StructVectorPrint("before", x, 1); - - /* call set const */ - cpu_hypre_StructVectorSetConstantValues(y, 5.0); - 
hypre_printf("my_hypre_StructVectorSetConstantValues() success!\n"); - - hypre_StructVectorPrint("after_cpu", y, 1); - - hypre_StructVectorSetConstantValues(x, 5.0); - hypre_printf("hypre_StructVectorSetConstantValues() success!\n"); - - hypre_StructVectorPrint("after_gpu", x, 1); - - /* call axpy */ - /* my_hypre_StructAxpy(1.0, x, b); */ - - - - - - - - hypre_printf("DONE\n"); - return 0; -} - -HYPRE_Int -AddValuesVector( hypre_StructGrid *gridvector, - hypre_StructVector *zvector, - HYPRE_Int *period, - HYPRE_Real value ) -{ -/* #include "_hypre_struct_mv.h" */ - HYPRE_Int ierr = 0; - hypre_BoxArray *gridboxes; - HYPRE_Int ib; - hypre_IndexRef ilower; - hypre_IndexRef iupper; - hypre_Box *box; - HYPRE_Real *values; - HYPRE_Int volume,dim; -#if 0 //defined(HYPRE_USING_CUDA) - HYPRE_Int data_location = hypre_StructGridDataLocation(hypre_StructVectorGrid(zvector)); -#endif - - gridboxes = hypre_StructGridBoxes(gridvector); - dim = hypre_StructGridNDim(gridvector); - - ib=0; - hypre_ForBoxI(ib, gridboxes) - { - box = hypre_BoxArrayBox(gridboxes, ib); - volume = hypre_BoxVolume(box); -#if 0 //defined(HYPRE_USING_CUDA) - if (data_location != HYPRE_MEMORY_HOST) - { - values = hypre_CTAlloc(HYPRE_Real, volume,HYPRE_MEMORY_DEVICE); - } - else - { - values = hypre_CTAlloc(HYPRE_Real, volume,HYPRE_MEMORY_HOST); - } -#else - values = hypre_CTAlloc(HYPRE_Real, volume,HYPRE_MEMORY_DEVICE); -#endif - /*----------------------------------------------------------- - * For periodic b.c. in all directions, need rhs to satisfy - * compatibility condition. Achieved by setting a source and - * sink of equal strength. All other problems have rhs = 1. 
- *-----------------------------------------------------------*/ - -#define DEVICE_VAR is_device_ptr(values) - if ((dim == 2 && period[0] != 0 && period[1] != 0) || - (dim == 3 && period[0] != 0 && period[1] != 0 && period[2] != 0)) - { - hypre_LoopBegin(volume,i) - { - values[i] = 0.0; - values[0] = value; - values[volume - 1] = -value; - - } - hypre_LoopEnd() - } - else - { - hypre_LoopBegin(volume,i) - { - values[i] = value; - } - hypre_LoopEnd() - } -#undef DEVICE_VAR - - ilower = hypre_BoxIMin(box); - iupper = hypre_BoxIMax(box); - - HYPRE_StructVectorSetBoxValues(zvector, ilower, iupper, values); - -#if 0 //defined(HYPRE_USING_CUDA) - if (data_location != HYPRE_MEMORY_HOST) - { - hypre_TFree(values,HYPRE_MEMORY_DEVICE); - } - else - { - hypre_TFree(values,HYPRE_MEMORY_HOST); - } -#else - hypre_TFree(values,HYPRE_MEMORY_DEVICE); -#endif - } - - return ierr; -} diff --git a/src/utilities/device_utils.c b/src/utilities/device_utils.c index 0437d65175..3ff5aab39b 100644 --- a/src/utilities/device_utils.c +++ b/src/utilities/device_utils.c @@ -973,11 +973,9 @@ hypre_DeviceDataStream(hypre_DeviceData *data, HYPRE_Int i) } }; - /* WM: having trouble with getting the device on frank, so temporarily just passing the default selector */ sycl::device syclDev = data->device; sycl::context syclctxt = sycl::context(syclDev, sycl_asynchandler); stream = new sycl::queue(syclctxt, syclDev, sycl::property_list{sycl::property::queue::in_order{}}); - /* stream = new sycl::queue(sycl::default_selector{}, sycl::property_list{sycl::property::queue::in_order{}}); */ data->streams[i] = stream; } #endif @@ -1234,7 +1232,7 @@ hypre_DeviceDataCreate() hypre_DeviceData *data = hypre_CTAlloc(hypre_DeviceData, 1, HYPRE_MEMORY_HOST); #if defined(HYPRE_USING_SYCL) - /* WM: does the default selector get a GPU if available? */ + /* WM: does the default selector get a GPU if available? 
Having trouble with getting the device on frank, so temporarily just passing the default selector */ hypre_DeviceDataDevice(data) = sycl::device(sycl::default_selector{}); #else hypre_DeviceDataDevice(data) = 0; @@ -1491,8 +1489,7 @@ hypre_bind_device( HYPRE_Int myid, hypre_MPI_Comm_free(&node_comm); /* get number of devices on this node */ - /* WM: doesn't work on frank... commenting out */ - /* hypre_GetDeviceCount(&nDevices); */ + hypre_GetDeviceCount(&nDevices); nDevices = 1; /* set device */ diff --git a/src/utilities/general.c b/src/utilities/general.c index 2f332dcac8..8ec1e818e1 100644 --- a/src/utilities/general.c +++ b/src/utilities/general.c @@ -71,7 +71,7 @@ hypre_HandleDestroy(hypre_Handle *hypre_handle_) hypre_DeviceDataDestroy(hypre_HandleDeviceData(hypre_handle_)); #endif -// WM: in debug mode, hypre_TFree() checks the pointer location, which requires the +// In debug mode, hypre_TFree() checks the pointer location, which requires the // hypre_handle_'s compute queue if using sycl. But this was just destroyed above. #if defined(HYPRE_DEBUG) && defined(HYPRE_USING_SYCL) free(hypre_handle_); @@ -152,25 +152,25 @@ hypre_GetDeviceCount(hypre_int *device_count) #endif #if defined(HYPRE_USING_SYCL) - // WM: TODO - verify - sycl::platform platform(sycl::gpu_selector{}); - auto const& gpu_devices = platform.get_devices(); - for (int i = 0; i < gpu_devices.size(); i++) - { - if (gpu_devices[i].is_gpu()) - { - if(gpu_devices[i].get_info() > 0) - { - auto subDevicesDomainNuma = gpu_devices[i].create_sub_devices( - sycl::info::partition_affinity_domain::numa); - (*device_count) += subDevicesDomainNuma.size(); - } - else - { - (*device_count)++; - } - } - } + /* WM: todo - doesn't work on frank... 
commenting out */ + /* sycl::platform platform(sycl::gpu_selector{}); */ + /* auto const& gpu_devices = platform.get_devices(); */ + /* for (int i = 0; i < gpu_devices.size(); i++) */ + /* { */ + /* if (gpu_devices[i].is_gpu()) */ + /* { */ + /* if(gpu_devices[i].get_info() > 0) */ + /* { */ + /* auto subDevicesDomainNuma = gpu_devices[i].create_sub_devices( */ + /* sycl::info::partition_affinity_domain::numa); */ + /* (*device_count) += subDevicesDomainNuma.size(); */ + /* } */ + /* else */ + /* { */ + /* (*device_count)++; */ + /* } */ + /* } */ + /* } */ #endif return hypre_error_flag; diff --git a/src/utilities/memory.c b/src/utilities/memory.c index 37248e3033..a9941e1917 100644 --- a/src/utilities/memory.c +++ b/src/utilities/memory.c @@ -109,7 +109,6 @@ hypre_UnifiedMemset(void *ptr, HYPRE_Int value, size_t num) static inline void hypre_UnifiedMemPrefetch(void *ptr, size_t size, hypre_MemoryLocation location) { - /* hypre_printf("WM: debug - inside UnifiedMemPrefetch\n"); */ #if defined(HYPRE_USING_GPU) #ifdef HYPRE_DEBUG hypre_MemoryLocation tmp; @@ -252,7 +251,6 @@ hypre_DeviceMalloc(size_t size, HYPRE_Int zeroinit) static inline void * hypre_UnifiedMalloc(size_t size, HYPRE_Int zeroinit) { - /* hypre_printf("WM: debug - inside UnifiedMalloc\n"); */ void *ptr = NULL; #if defined(HYPRE_USING_UMPIRE_UM) @@ -277,7 +275,6 @@ hypre_UnifiedMalloc(size_t size, HYPRE_Int zeroinit) #if defined(HYPRE_USING_SYCL) HYPRE_SYCL_CALL( ptr = (void *)sycl::malloc_shared(size, *(hypre_HandleComputeStream(hypre_handle()))) ); - /* hypre_printf("WM: debug - did the sycl shared allocation\n"); */ #endif #endif /* #if defined(HYPRE_USING_UMPIRE_UM) */ @@ -285,7 +282,6 @@ hypre_UnifiedMalloc(size_t size, HYPRE_Int zeroinit) /* prefecth to device */ if (ptr) { - /* hypre_printf("WM: debug - about to prefetch\n"); */ hypre_UnifiedMemPrefetch(ptr, size, hypre_MEMORY_DEVICE); } @@ -987,7 +983,6 @@ hypre_GetExecPolicy2(HYPRE_MemoryLocation location1, HYPRE_Int 
hypre_GetPointerLocation(const void *ptr, hypre_MemoryLocation *memory_location) { - /* hypre_printf("WM: debug - inside GetPointerLocation\n"); */ HYPRE_Int ierr = 0; #if defined(HYPRE_USING_GPU) @@ -1090,7 +1085,6 @@ hypre_GetPointerLocation(const void *ptr, hypre_MemoryLocation *memory_location) sycl::usm::alloc allocType; allocType = sycl::get_pointer_type(ptr, (hypre_HandleComputeStream(hypre_handle()))->get_context()); - /* hypre_printf("WM: debug - checking allocType\n"); */ if (allocType == sycl::usm::alloc::unknown) { *memory_location = hypre_MEMORY_HOST; @@ -1106,7 +1100,6 @@ hypre_GetPointerLocation(const void *ptr, hypre_MemoryLocation *memory_location) else if (allocType == sycl::usm::alloc::shared) { *memory_location = hypre_MEMORY_UNIFIED; - /* hypre_printf("WM: debug - IS UNIFIED MEMORY\n"); */ } #endif //HYPRE_USING_SYCL From 4e54d486037b0e5ec00e452d0722ada7dcd2c796 Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Tue, 5 Oct 2021 17:29:04 -0700 Subject: [PATCH 16/44] Added hypreLoopBegin/End --- src/struct_mv/_hypre_struct_mv.hpp | 38 ++++++++++++++++++++---------- src/struct_mv/boxloop_sycl.h | 38 ++++++++++++++++++++---------- 2 files changed, 52 insertions(+), 24 deletions(-) diff --git a/src/struct_mv/_hypre_struct_mv.hpp b/src/struct_mv/_hypre_struct_mv.hpp index e4824ec744..8c111ed729 100644 --- a/src/struct_mv/_hypre_struct_mv.hpp +++ b/src/struct_mv/_hypre_struct_mv.hpp @@ -1189,8 +1189,8 @@ extern "C++" { template void -BoxLoopforall( LOOP_BODY loop_body, - HYPRE_Int length ) +BoxLoopforall( HYPRE_Int length, + LOOP_BODY loop_body) { /* HYPRE_ExecutionPolicy exec_policy = hypre_HandleStructExecPolicy(hypre_handle()); */ /* WM: TODO: uncomment above and remove below */ @@ -1383,7 +1383,7 @@ else \ { \ hypre_newBoxLoopInit(ndim, loop_size); \ hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ - BoxLoopforall( [=] (sycl::nd_item<1> item) \ + BoxLoopforall(hypre__tot, [=] (sycl::nd_item<1> item) \ { \ HYPRE_Int idx = 
(HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ @@ -1393,7 +1393,7 @@ else \ #define hypre_newBoxLoop1End(i1) \ } \ - }, hypre__tot); \ + }); \ } /* BoxLoop 2 */ @@ -1403,7 +1403,7 @@ else \ hypre_newBoxLoopInit(ndim, loop_size); \ hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ hypre_BoxLoopDataDeclareK(2, ndim, loop_size, dbox2, start2, stride2); \ - BoxLoopforall( [=] (sycl::nd_item<1> item) \ + BoxLoopforall(hypre__tot, [=] (sycl::nd_item<1> item) \ { \ HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ @@ -1414,7 +1414,7 @@ else \ #define hypre_newBoxLoop2End(i1, i2) \ } \ - }, hypre__tot); \ + }); \ } /* BoxLoop 3 */ @@ -1426,7 +1426,7 @@ else \ hypre_BoxLoopDataDeclareK(1, ndim,loop_size, dbox1, start1, stride1); \ hypre_BoxLoopDataDeclareK(2, ndim,loop_size, dbox2, start2, stride2); \ hypre_BoxLoopDataDeclareK(3, ndim,loop_size, dbox3, start3, stride3); \ - BoxLoopforall( [=] (sycl::nd_item<1> item) \ + BoxLoopforall(hypre__tot, [=] (sycl::nd_item<1> item) \ { \ HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ @@ -1438,7 +1438,7 @@ else \ #define hypre_newBoxLoop3End(i1, i2, i3) \ } \ - }, hypre__tot); \ + }); \ } /* BoxLoop 4 */ @@ -1452,7 +1452,7 @@ else \ hypre_BoxLoopDataDeclareK(2, ndim, loop_size, dbox2, start2, stride2); \ hypre_BoxLoopDataDeclareK(3, ndim, loop_size, dbox3, start3, stride3); \ hypre_BoxLoopDataDeclareK(4, ndim, loop_size, dbox4, start4, stride4); \ - BoxLoopforall( [=] (sycl::nd_item<1> item) \ + BoxLoopforall(hypre__tot, [=] (sycl::nd_item<1> item) \ { \ HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ @@ -1465,7 +1465,7 @@ else \ #define hypre_newBoxLoop4End(i1, i2, i3, i4) \ } \ - }, hypre__tot); \ + }); \ } @@ -1475,7 +1475,7 @@ else \ { \ hypre_newBoxLoopInit(ndim, loop_size); \ hypre_BasicBoxLoopDataDeclareK(1, ndim, loop_size, stride1); \ - BoxLoopforall( [=] (sycl::nd_item<1> item) \ 
+ BoxLoopforall(hypre__tot, [=] (sycl::nd_item<1> item) \ { \ HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ @@ -1489,7 +1489,7 @@ else \ hypre_newBoxLoopInit(ndim, loop_size); \ hypre_BasicBoxLoopDataDeclareK(1, ndim, loop_size, stride1); \ hypre_BasicBoxLoopDataDeclareK(2, ndim, loop_size, stride2); \ - BoxLoopforall( [=] (sycl::nd_item<1> item) \ + BoxLoopforall(hypre__tot, [=] (sycl::nd_item<1> item) \ { \ HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ @@ -1544,6 +1544,20 @@ else \ }, hypre__tot, sum_buf); \ } +/* Plain parallel_for loop */ +#define hypre_LoopBegin(size, idx) \ +{ \ + BoxLoopforall(size, [=] (sycl::nd_item<1> item) \ + { \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + if (idx < size) \ + { \ + +#define hypre_LoopEnd() \ + } \ + }); \ +} + /********************************************************************* * renamings diff --git a/src/struct_mv/boxloop_sycl.h b/src/struct_mv/boxloop_sycl.h index 311c235567..dd8f910562 100644 --- a/src/struct_mv/boxloop_sycl.h +++ b/src/struct_mv/boxloop_sycl.h @@ -37,8 +37,8 @@ extern "C++" { template void -BoxLoopforall( LOOP_BODY loop_body, - HYPRE_Int length ) +BoxLoopforall( HYPRE_Int length, + LOOP_BODY loop_body) { /* HYPRE_ExecutionPolicy exec_policy = hypre_HandleStructExecPolicy(hypre_handle()); */ /* WM: TODO: uncomment above and remove below */ @@ -231,7 +231,7 @@ else \ { \ hypre_newBoxLoopInit(ndim, loop_size); \ hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ - BoxLoopforall( [=] (sycl::nd_item<1> item) \ + BoxLoopforall(hypre__tot, [=] (sycl::nd_item<1> item) \ { \ HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ @@ -241,7 +241,7 @@ else \ #define hypre_newBoxLoop1End(i1) \ } \ - }, hypre__tot); \ + }); \ } /* BoxLoop 2 */ @@ -251,7 +251,7 @@ else \ hypre_newBoxLoopInit(ndim, loop_size); \ hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, 
stride1); \ hypre_BoxLoopDataDeclareK(2, ndim, loop_size, dbox2, start2, stride2); \ - BoxLoopforall( [=] (sycl::nd_item<1> item) \ + BoxLoopforall(hypre__tot, [=] (sycl::nd_item<1> item) \ { \ HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ @@ -262,7 +262,7 @@ else \ #define hypre_newBoxLoop2End(i1, i2) \ } \ - }, hypre__tot); \ + }); \ } /* BoxLoop 3 */ @@ -274,7 +274,7 @@ else \ hypre_BoxLoopDataDeclareK(1, ndim,loop_size, dbox1, start1, stride1); \ hypre_BoxLoopDataDeclareK(2, ndim,loop_size, dbox2, start2, stride2); \ hypre_BoxLoopDataDeclareK(3, ndim,loop_size, dbox3, start3, stride3); \ - BoxLoopforall( [=] (sycl::nd_item<1> item) \ + BoxLoopforall(hypre__tot, [=] (sycl::nd_item<1> item) \ { \ HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ @@ -286,7 +286,7 @@ else \ #define hypre_newBoxLoop3End(i1, i2, i3) \ } \ - }, hypre__tot); \ + }); \ } /* BoxLoop 4 */ @@ -300,7 +300,7 @@ else \ hypre_BoxLoopDataDeclareK(2, ndim, loop_size, dbox2, start2, stride2); \ hypre_BoxLoopDataDeclareK(3, ndim, loop_size, dbox3, start3, stride3); \ hypre_BoxLoopDataDeclareK(4, ndim, loop_size, dbox4, start4, stride4); \ - BoxLoopforall( [=] (sycl::nd_item<1> item) \ + BoxLoopforall(hypre__tot, [=] (sycl::nd_item<1> item) \ { \ HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ @@ -313,7 +313,7 @@ else \ #define hypre_newBoxLoop4End(i1, i2, i3, i4) \ } \ - }, hypre__tot); \ + }); \ } @@ -323,7 +323,7 @@ else \ { \ hypre_newBoxLoopInit(ndim, loop_size); \ hypre_BasicBoxLoopDataDeclareK(1, ndim, loop_size, stride1); \ - BoxLoopforall( [=] (sycl::nd_item<1> item) \ + BoxLoopforall(hypre__tot, [=] (sycl::nd_item<1> item) \ { \ HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ @@ -337,7 +337,7 @@ else \ hypre_newBoxLoopInit(ndim, loop_size); \ hypre_BasicBoxLoopDataDeclareK(1, ndim, loop_size, stride1); \ hypre_BasicBoxLoopDataDeclareK(2, ndim, 
loop_size, stride2); \ - BoxLoopforall( [=] (sycl::nd_item<1> item) \ + BoxLoopforall(hypre__tot, [=] (sycl::nd_item<1> item) \ { \ HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ @@ -392,6 +392,20 @@ else \ }, hypre__tot, sum_buf); \ } +/* Plain parallel_for loop */ +#define hypre_LoopBegin(size, idx) \ +{ \ + BoxLoopforall(size, [=] (sycl::nd_item<1> item) \ + { \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + if (idx < size) \ + { \ + +#define hypre_LoopEnd() \ + } \ + }); \ +} + /********************************************************************* * renamings From a127622baec600b5b2545188211f6afffa35f793 Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Mon, 18 Oct 2021 18:17:42 -0700 Subject: [PATCH 17/44] Bug fix --- src/struct_mv/_hypre_struct_mv.hpp | 5 +++-- src/struct_mv/boxloop_sycl.h | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/struct_mv/_hypre_struct_mv.hpp b/src/struct_mv/_hypre_struct_mv.hpp index 8c111ed729..41c5eb4f51 100644 --- a/src/struct_mv/_hypre_struct_mv.hpp +++ b/src/struct_mv/_hypre_struct_mv.hpp @@ -1547,10 +1547,11 @@ else \ /* Plain parallel_for loop */ #define hypre_LoopBegin(size, idx) \ { \ - BoxLoopforall(size, [=] (sycl::nd_item<1> item) \ + HYPRE_Int hypre__tot = size; \ + BoxLoopforall(hypre__tot, [=] (sycl::nd_item<1> item) \ { \ HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ - if (idx < size) \ + if (idx < hypre__tot) \ { \ #define hypre_LoopEnd() \ diff --git a/src/struct_mv/boxloop_sycl.h b/src/struct_mv/boxloop_sycl.h index dd8f910562..a8812b70a5 100644 --- a/src/struct_mv/boxloop_sycl.h +++ b/src/struct_mv/boxloop_sycl.h @@ -395,10 +395,11 @@ else \ /* Plain parallel_for loop */ #define hypre_LoopBegin(size, idx) \ { \ - BoxLoopforall(size, [=] (sycl::nd_item<1> item) \ + HYPRE_Int hypre__tot = size; \ + BoxLoopforall(hypre__tot, [=] (sycl::nd_item<1> item) \ { \ HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ - if 
(idx < size) \ + if (idx < hypre__tot) \ { \ #define hypre_LoopEnd() \ From 94a269d5a2699cad6bc8ddb6fe7316fa7f150216 Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Mon, 25 Oct 2021 22:37:43 +0000 Subject: [PATCH 18/44] Fix configuration options for non-unified memory --- src/configure | 17 ++++++++++++++++- src/utilities/device_utils.c | 5 ++--- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/src/configure b/src/configure index bb48dbdf9b..c7f941f40e 100755 --- a/src/configure +++ b/src/configure @@ -4910,6 +4910,21 @@ then as_fn_error $? "--with-hip and --with-device-openmp are mutually exclusive" "$LINENO" 5 fi +if test "x$hypre_using_cuda" = "xyes" && test "x$hypre_using_sycl" = "xyes" +then + as_fn_error $? "--with-cuda and --with-sycl are mutually exclusive" "$LINENO" 5 +fi + +if test "x$hypre_using_hip" = "xyes" && test "x$hypre_using_sycl" = "xyes" +then + as_fn_error $? "--with-hip and --with-sycl are mutually exclusive" "$LINENO" 5 +fi + +if test "x$hypre_using_device_openmp" = "xyes" && test "x$hypre_using_sycl" = "xyes" +then + as_fn_error $? 
"--with-device-openmp and --with-sycl are mutually exclusive" "$LINENO" 5 +fi + if test "$hypre_user_chose_cudacompilers" = "no" then @@ -9315,7 +9330,7 @@ then $as_echo "#define HYPRE_USING_UNIFIED_MEMORY 1" >>confdefs.h else - if test "x$hypre_using_cuda" = "xyes" || test "x$hypre_using_device_openmp" = "xyes" || test "x$hypre_using_hip" = "xyes" + if test "x$hypre_using_cuda" = "xyes" || test "x$hypre_using_device_openmp" = "xyes" || test "x$hypre_using_hip" = "xyes" || test "x$hypre_using_sycl" = "xyes" then $as_echo "#define HYPRE_USING_DEVICE_MEMORY 1" >>confdefs.h diff --git a/src/utilities/device_utils.c b/src/utilities/device_utils.c index 3ff5aab39b..e803495e9d 100644 --- a/src/utilities/device_utils.c +++ b/src/utilities/device_utils.c @@ -9,12 +9,11 @@ #include "_hypre_utilities.hpp" #if defined(HYPRE_USING_SYCL) -// WM: TODO: verify sycl::range<1> hypre_GetDefaultCUDABlockDimension() { // 256 - max work group size for Gen9 - // 512 - max work group size for ATS - sycl::range<1> wgDim(64); + // 1024 - max work group size for ATS + sycl::range<1> wgDim(1024); return wgDim; } From 39fbd2db4b34157481f357a953e7bdfaccd7f77a Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Tue, 26 Oct 2021 16:29:02 +0000 Subject: [PATCH 19/44] Update oneapi reduction --- src/struct_mv/_hypre_struct_mv.hpp | 2 +- src/struct_mv/boxloop_sycl.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/struct_mv/_hypre_struct_mv.hpp b/src/struct_mv/_hypre_struct_mv.hpp index 41c5eb4f51..e48daf8bf2 100644 --- a/src/struct_mv/_hypre_struct_mv.hpp +++ b/src/struct_mv/_hypre_struct_mv.hpp @@ -1253,7 +1253,7 @@ ReductionBoxLoopforall( LOOP_BODY loop_body, hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) { sycl::accessor sum_acc(sum_buf, cgh, sycl::read_write); - cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), sycl::ONEAPI::reduction(sum_acc, sycl::ONEAPI::plus<>()), loop_body); + cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), 
sycl::ext::oneapi::reduction(sum_acc, std::plus<>()), loop_body); }).wait_and_throw(); } } diff --git a/src/struct_mv/boxloop_sycl.h b/src/struct_mv/boxloop_sycl.h index a8812b70a5..02c90e6331 100644 --- a/src/struct_mv/boxloop_sycl.h +++ b/src/struct_mv/boxloop_sycl.h @@ -101,7 +101,7 @@ ReductionBoxLoopforall( LOOP_BODY loop_body, hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) { sycl::accessor sum_acc(sum_buf, cgh, sycl::read_write); - cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), sycl::ONEAPI::reduction(sum_acc, sycl::ONEAPI::plus<>()), loop_body); + cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), sycl::ext::oneapi::reduction(sum_acc, std::plus<>()), loop_body); }).wait_and_throw(); } } From 193ee25d224fb67ae5d94a812025e46ca0d61e2d Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Tue, 26 Oct 2021 17:12:34 +0000 Subject: [PATCH 20/44] Bug fix in parallel --- src/struct_mv/struct_communication.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/struct_mv/struct_communication.c b/src/struct_mv/struct_communication.c index 81be1bb7ba..cb321ee9f7 100644 --- a/src/struct_mv/struct_communication.c +++ b/src/struct_mv/struct_communication.c @@ -846,7 +846,7 @@ hypre_InitializeCommunication( hypre_CommPkg *comm_pkg, #if defined(HYPRE_USING_GPU) #if defined(HYPRE_USING_RAJA) || defined(HYPRE_USING_KOKKOS) alloc_dev_buffer = 1; -#elif defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) +#elif defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) || defined(HYPRE_USING_SYCL) alloc_dev_buffer = (hypre_HandleStructExecPolicy(hypre_handle()) == HYPRE_EXEC_DEVICE); #elif defined(HYPRE_USING_DEVICE_OPENMP) alloc_dev_buffer = hypre__global_offload; From 9166c167718cff2006206ddf7a353282da9c9ea7 Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Tue, 26 Oct 2021 18:19:07 +0000 Subject: [PATCH 21/44] Additional macro fixes and implementation of redblack relax --- src/struct_ls/red_black_gs.h | 59 
++++++++++++++++++++++++++++ src/struct_mv/_hypre_struct_mv.h | 2 +- src/struct_mv/box.h | 2 +- src/struct_mv/struct_communication.c | 2 +- 4 files changed, 62 insertions(+), 3 deletions(-) diff --git a/src/struct_ls/red_black_gs.h b/src/struct_ls/red_black_gs.h index afd8eabf7b..f1f95d864f 100644 --- a/src/struct_ls/red_black_gs.h +++ b/src/struct_ls/red_black_gs.h @@ -209,6 +209,65 @@ typedef struct }); \ } +#elif defined(HYPRE_USING_SYCL) + +#define hypre_RedBlackLoopInit() +#define hypre_RedBlackLoopBegin(ni,nj,nk,redblack, \ + Astart,Ani,Anj,Ai, \ + bstart,bni,bnj,bi, \ + xstart,xni,xnj,xi) \ +{ \ + HYPRE_Int hypre__tot = nk*nj*((ni+1)/2); \ + BoxLoopforall(hypre__tot, [=] (sycl::nd_item<1> item) \ + { \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + HYPRE_Int idx_local = idx; \ + HYPRE_Int ii,jj,kk,Ai,bi,xi; \ + HYPRE_Int local_ii; \ + kk = idx_local % nk; \ + idx_local = idx_local / nk; \ + jj = idx_local % nj; \ + idx_local = idx_local / nj; \ + local_ii = (kk + jj + redblack) % 2; \ + ii = 2*idx_local + local_ii; \ + if (ii < ni) \ + { \ + Ai = Astart + kk*Anj*Ani + jj*Ani + ii; \ + bi = bstart + kk*bnj*bni + jj*bni + ii; \ + xi = xstart + kk*xnj*xni + jj*xni + ii; \ + +#define hypre_RedBlackLoopEnd() \ + } \ + }); \ +} + +#define hypre_RedBlackConstantcoefLoopBegin(ni,nj,nk,redblack, \ + bstart,bni,bnj,bi, \ + xstart,xni,xnj,xi) \ +{ \ + HYPRE_Int hypre__tot = nk*nj*((ni+1)/2); \ + BoxLoopforall(hypre__tot, [=] (sycl::nd_item<1> item) \ + { \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + HYPRE_Int idx_local = idx; \ + HYPRE_Int ii,jj,kk,bi,xi; \ + HYPRE_Int local_ii; \ + kk = idx_local % nk; \ + idx_local = idx_local / nk; \ + jj = idx_local % nj; \ + idx_local = idx_local / nj; \ + local_ii = (kk + jj + redblack) % 2; \ + ii = 2*idx_local + local_ii; \ + if (ii < ni) \ + { \ + bi = bstart + kk*bnj*bni + jj*bni + ii; \ + xi = xstart + kk*xnj*xni + jj*xni + ii; \ + +#define hypre_RedBlackConstantcoefLoopEnd() \ + } \ + }); 
\ +} + #elif defined(HYPRE_USING_DEVICE_OPENMP) /* BEGIN OF OMP 4.5 */ diff --git a/src/struct_mv/_hypre_struct_mv.h b/src/struct_mv/_hypre_struct_mv.h index 70dbdf9f41..8567df0cf6 100644 --- a/src/struct_mv/_hypre_struct_mv.h +++ b/src/struct_mv/_hypre_struct_mv.h @@ -35,7 +35,7 @@ extern "C" { #define HYPRE_MAXDIM 3 #endif -#if defined(HYPRE_USING_RAJA) || defined(HYPRE_USING_KOKKOS) || defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) || defined(HYPRE_USING_HIP) +#if defined(HYPRE_USING_RAJA) || defined(HYPRE_USING_KOKKOS) || defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) || defined(HYPRE_USING_HIP) || defined(HYPRE_USING_SYCL) #define hypre_BoxLoopSetOneBlock() #else #define hypre_BoxLoopSetOneBlock zypre_BoxLoopSetOneBlock diff --git a/src/struct_mv/box.h b/src/struct_mv/box.h index eae0061331..8d2ad2db9a 100644 --- a/src/struct_mv/box.h +++ b/src/struct_mv/box.h @@ -18,7 +18,7 @@ #define HYPRE_MAXDIM 3 #endif -#if defined(HYPRE_USING_RAJA) || defined(HYPRE_USING_KOKKOS) || defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) || defined(HYPRE_USING_HIP) +#if defined(HYPRE_USING_RAJA) || defined(HYPRE_USING_KOKKOS) || defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) || defined(HYPRE_USING_HIP) || defined(HYPRE_USING_SYCL) #define hypre_BoxLoopSetOneBlock() #else #define hypre_BoxLoopSetOneBlock zypre_BoxLoopSetOneBlock diff --git a/src/struct_mv/struct_communication.c b/src/struct_mv/struct_communication.c index cb321ee9f7..d80e96620b 100644 --- a/src/struct_mv/struct_communication.c +++ b/src/struct_mv/struct_communication.c @@ -1218,7 +1218,7 @@ hypre_FinalizeCommunication( hypre_CommHandle *comm_handle ) #if defined(HYPRE_USING_GPU) #if defined(HYPRE_USING_RAJA) || defined(HYPRE_USING_KOKKOS) alloc_dev_buffer = 1; -#elif defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) +#elif defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) || defined(HYPRE_USING_SYCL) alloc_dev_buffer = 
(hypre_HandleStructExecPolicy(hypre_handle()) == HYPRE_EXEC_DEVICE); #elif defined(HYPRE_USING_DEVICE_OPENMP) alloc_dev_buffer = hypre__global_offload; From 4fca1be4c19d0c22ea85f47683959f53523a6d2b Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Wed, 27 Oct 2021 21:40:26 +0000 Subject: [PATCH 22/44] Automatic selection of block dimension --- src/utilities/_hypre_utilities.h | 1 + src/utilities/_hypre_utilities.hpp | 2 ++ src/utilities/device_utils.c | 11 +++++------ src/utilities/device_utils.h | 2 ++ src/utilities/handle.h | 1 + 5 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/utilities/_hypre_utilities.h b/src/utilities/_hypre_utilities.h index 7faf7f9a1d..7505d6277e 100644 --- a/src/utilities/_hypre_utilities.h +++ b/src/utilities/_hypre_utilities.h @@ -1278,6 +1278,7 @@ typedef struct #define hypre_HandleCubDevAllocator(hypre_handle) hypre_DeviceDataCubDevAllocator(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleCubUvmAllocator(hypre_handle) hypre_DeviceDataCubUvmAllocator(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleDevice(hypre_handle) hypre_DeviceDataDevice(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleDeviceMaxWorkGroupSize(hypre_handle) hypre_DeviceDataDeviceMaxWorkGroupSize(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleComputeStreamNum(hypre_handle) hypre_DeviceDataComputeStreamNum(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleReduceBuffer(hypre_handle) hypre_DeviceDataReduceBuffer(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleStructCommRecvBuffer(hypre_handle) hypre_DeviceDataStructCommRecvBuffer(hypre_HandleDeviceData(hypre_handle)) diff --git a/src/utilities/_hypre_utilities.hpp b/src/utilities/_hypre_utilities.hpp index 6fe8451f0f..4f062c7c06 100644 --- a/src/utilities/_hypre_utilities.hpp +++ b/src/utilities/_hypre_utilities.hpp @@ -290,6 +290,7 @@ struct hypre_DeviceData #endif #if defined(HYPRE_USING_SYCL) sycl::device device; + HYPRE_Int 
device_max_work_group_size; #else HYPRE_Int device; #endif @@ -321,6 +322,7 @@ struct hypre_DeviceData #define hypre_DeviceDataCubDevAllocator(data) ((data) -> cub_dev_allocator) #define hypre_DeviceDataCubUvmAllocator(data) ((data) -> cub_uvm_allocator) #define hypre_DeviceDataDevice(data) ((data) -> device) +#define hypre_DeviceDataDeviceMaxWorkGroupSize(data) ((data) -> device_max_work_group_size) #define hypre_DeviceDataComputeStreamNum(data) ((data) -> compute_stream_num) #define hypre_DeviceDataReduceBuffer(data) ((data) -> reduce_buffer) #define hypre_DeviceDataStructCommRecvBuffer(data) ((data) -> struct_comm_recv_buffer) diff --git a/src/utilities/device_utils.c b/src/utilities/device_utils.c index e803495e9d..a845fe8303 100644 --- a/src/utilities/device_utils.c +++ b/src/utilities/device_utils.c @@ -11,10 +11,8 @@ #if defined(HYPRE_USING_SYCL) sycl::range<1> hypre_GetDefaultCUDABlockDimension() { - // 256 - max work group size for Gen9 - // 1024 - max work group size for ATS - sycl::range<1> wgDim(1024); - return wgDim; + sycl::range<1> wgDim(hypre_HandleDeviceMaxWorkGroupSize(hypre_handle())); + return wgDim; } // WM: TODO: verify @@ -967,7 +965,7 @@ hypre_DeviceDataStream(hypre_DeviceData *data, HYPRE_Int i) catch (sycl::exception const& ex) { std::cout << "Caught asynchronous SYCL exception:" << std::endl - << ex.what() << ", OpenCL code: " << ex.get_cl_code() << std::endl; + << ex.what() << ", OpenCL code: " << ex.code() << std::endl; } } }; @@ -1232,7 +1230,8 @@ hypre_DeviceDataCreate() #if defined(HYPRE_USING_SYCL) /* WM: does the default selector get a GPU if available? 
Having trouble with getting the device on frank, so temporarily just passing the default selector */ - hypre_DeviceDataDevice(data) = sycl::device(sycl::default_selector{}); + hypre_DeviceDataDevice(data) = sycl::device(sycl::default_selector{}); + hypre_DeviceDataDeviceMaxWorkGroupSize(data) = hypre_DeviceDataDevice(data).get_info(); #else hypre_DeviceDataDevice(data) = 0; #endif diff --git a/src/utilities/device_utils.h b/src/utilities/device_utils.h index a442f2229f..7123aefaaf 100644 --- a/src/utilities/device_utils.h +++ b/src/utilities/device_utils.h @@ -233,6 +233,7 @@ struct hypre_DeviceData #endif #if defined(HYPRE_USING_SYCL) sycl::device device; + HYPRE_Int device_max_work_group_size; #else HYPRE_Int device; #endif @@ -264,6 +265,7 @@ struct hypre_DeviceData #define hypre_DeviceDataCubDevAllocator(data) ((data) -> cub_dev_allocator) #define hypre_DeviceDataCubUvmAllocator(data) ((data) -> cub_uvm_allocator) #define hypre_DeviceDataDevice(data) ((data) -> device) +#define hypre_DeviceDataDeviceMaxWorkGroupSize(data) ((data) -> device_max_work_group_size) #define hypre_DeviceDataComputeStreamNum(data) ((data) -> compute_stream_num) #define hypre_DeviceDataReduceBuffer(data) ((data) -> reduce_buffer) #define hypre_DeviceDataStructCommRecvBuffer(data) ((data) -> struct_comm_recv_buffer) diff --git a/src/utilities/handle.h b/src/utilities/handle.h index 8e5979c7a2..2e3dc3198a 100644 --- a/src/utilities/handle.h +++ b/src/utilities/handle.h @@ -67,6 +67,7 @@ typedef struct #define hypre_HandleCubDevAllocator(hypre_handle) hypre_DeviceDataCubDevAllocator(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleCubUvmAllocator(hypre_handle) hypre_DeviceDataCubUvmAllocator(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleDevice(hypre_handle) hypre_DeviceDataDevice(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleDeviceMaxWorkGroupSize(hypre_handle) hypre_DeviceDataDeviceMaxWorkGroupSize(hypre_HandleDeviceData(hypre_handle)) #define 
hypre_HandleComputeStreamNum(hypre_handle) hypre_DeviceDataComputeStreamNum(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleReduceBuffer(hypre_handle) hypre_DeviceDataReduceBuffer(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleStructCommRecvBuffer(hypre_handle) hypre_DeviceDataStructCommRecvBuffer(hypre_HandleDeviceData(hypre_handle)) From a6383e8cff5bb660eb0df5782e04de4a6c7babc6 Mon Sep 17 00:00:00 2001 From: Ruipeng Li Date: Wed, 27 Oct 2021 16:05:04 -0700 Subject: [PATCH 23/44] zboxloop --- src/test/zboxloop.c | 66 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 50 insertions(+), 16 deletions(-) diff --git a/src/test/zboxloop.c b/src/test/zboxloop.c index f506d7f432..2e4830fe0f 100644 --- a/src/test/zboxloop.c +++ b/src/test/zboxloop.c @@ -20,8 +20,6 @@ * Test driver to time new boxloops and compare to the old ones *--------------------------------------------------------------------------*/ -#define DEVICE_VAR - hypre_int main( hypre_int argc, char *argv[] ) @@ -39,6 +37,7 @@ main( hypre_int argc, //HYPRE_Int xi1, xi2, xi3, xi4; HYPRE_Int xi1; HYPRE_Real *xp1, *xp2, *xp3, *xp4; + HYPRE_Real *d_xp1, *d_xp2, *d_xp3, *d_xp4; hypre_Index loop_size, start, unit_stride, index; /*----------------------------------------------------------- @@ -51,6 +50,8 @@ main( hypre_int argc, hypre_MPI_Comm_size(hypre_MPI_COMM_WORLD, &num_procs ); hypre_MPI_Comm_rank(hypre_MPI_COMM_WORLD, &myid ); + HYPRE_Init(); + /*----------------------------------------------------------- * Set defaults *-----------------------------------------------------------*/ @@ -65,6 +66,8 @@ main( hypre_int argc, Q = 1; R = 1; + reps = -1; + /*----------------------------------------------------------- * Parse command line *-----------------------------------------------------------*/ @@ -92,6 +95,11 @@ main( hypre_int argc, arg_index++; dim = atoi(argv[arg_index++]); } + else if ( strcmp(argv[arg_index], "-reps") == 0 ) + { + arg_index++; + reps = 
atoi(argv[arg_index++]); + } else if ( strcmp(argv[arg_index], "-help") == 0 ) { print_usage = 1; @@ -162,12 +170,20 @@ main( hypre_int argc, hypre_CopyBox(x1_data_box, x4_data_box); size = (nx+2)*(ny+2)*(nz+2); - xp1 = hypre_CTAlloc(HYPRE_Real, size, HYPRE_MEMORY_HOST); - xp2 = hypre_CTAlloc(HYPRE_Real, size, HYPRE_MEMORY_HOST); - xp3 = hypre_CTAlloc(HYPRE_Real, size, HYPRE_MEMORY_HOST); - xp4 = hypre_CTAlloc(HYPRE_Real, size, HYPRE_MEMORY_HOST); + xp1 = hypre_CTAlloc(HYPRE_Real, size, HYPRE_MEMORY_HOST); + xp2 = hypre_CTAlloc(HYPRE_Real, size, HYPRE_MEMORY_HOST); + xp3 = hypre_CTAlloc(HYPRE_Real, size, HYPRE_MEMORY_HOST); + xp4 = hypre_CTAlloc(HYPRE_Real, size, HYPRE_MEMORY_HOST); + + d_xp1 = hypre_CTAlloc(HYPRE_Real, size, HYPRE_MEMORY_DEVICE); + d_xp2 = hypre_CTAlloc(HYPRE_Real, size, HYPRE_MEMORY_DEVICE); + d_xp3 = hypre_CTAlloc(HYPRE_Real, size, HYPRE_MEMORY_DEVICE); + d_xp4 = hypre_CTAlloc(HYPRE_Real, size, HYPRE_MEMORY_DEVICE); - reps = 1000000000/(nx*ny*nz+1000); + if (reps < 0) + { + reps = 1000000000/(nx*ny*nz+1000); + } /*----------------------------------------------------------- * Print driver parameters @@ -230,7 +246,7 @@ main( hypre_int argc, hypre_MPI_Barrier(hypre_MPI_COMM_WORLD); /*----------------------------------------------------------- - * Time old boxloops + * Time old boxloops [Device] *-----------------------------------------------------------*/ /* Time BoxLoop0 */ @@ -239,12 +255,14 @@ main( hypre_int argc, for (rep = 0; rep < reps; rep++) { xi1 = 0; +#define DEVICE_VAR is_device_ptr(d_xp1) hypre_BoxLoop0Begin(3, loop_size); { - xp1[xi1] += xp1[xi1]; + d_xp1[xi1] += d_xp1[xi1]; //xi1++; } hypre_BoxLoop0End(); +#undef DEVICE_VAR } hypre_EndTiming(time_index); @@ -253,12 +271,14 @@ main( hypre_int argc, hypre_BeginTiming(time_index); for (rep = 0; rep < reps; rep++) { +#define DEVICE_VAR is_device_ptr(d_xp1) hypre_BoxLoop1Begin(3, loop_size, x1_data_box, start, unit_stride, xi1); { - xp1[xi1] += xp1[xi1]; + d_xp1[xi1] += d_xp1[xi1]; } 
hypre_BoxLoop1End(xi1); +#undef DEVICE_VAR } hypre_EndTiming(time_index); @@ -267,13 +287,15 @@ main( hypre_int argc, hypre_BeginTiming(time_index); for (rep = 0; rep < reps; rep++) { +#define DEVICE_VAR is_device_ptr(d_xp1,d_xp2) hypre_BoxLoop2Begin(3, loop_size, x1_data_box, start, unit_stride, xi1, x2_data_box, start, unit_stride, xi2); { - xp1[xi1] += xp1[xi1] + xp2[xi2]; + d_xp1[xi1] += d_xp1[xi1] + d_xp2[xi2]; } hypre_BoxLoop2End(xi1, xi2); +#undef DEVICE_VAR } hypre_EndTiming(time_index); @@ -282,14 +304,16 @@ main( hypre_int argc, hypre_BeginTiming(time_index); for (rep = 0; rep < reps; rep++) { +#define DEVICE_VAR is_device_ptr(d_xp1,d_xp2,d_xp3) hypre_BoxLoop3Begin(3, loop_size, x1_data_box, start, unit_stride, xi1, x2_data_box, start, unit_stride, xi2, x3_data_box, start, unit_stride, xi3); { - xp1[xi1] += xp1[xi1] + xp2[xi2] + xp3[xi3]; + d_xp1[xi1] += d_xp1[xi1] + d_xp2[xi2] + d_xp3[xi3]; } hypre_BoxLoop3End(xi1, xi2, xi3); +#undef DEVICE_VAR } hypre_EndTiming(time_index); @@ -298,24 +322,26 @@ main( hypre_int argc, hypre_BeginTiming(time_index); for (rep = 0; rep < reps; rep++) { +#define DEVICE_VAR is_device_ptr(d_xp1,d_xp2,d_xp3,d_xp4) hypre_BoxLoop4Begin(3, loop_size, x1_data_box, start, unit_stride, xi1, x2_data_box, start, unit_stride, xi2, x3_data_box, start, unit_stride, xi3, x4_data_box, start, unit_stride, xi4); { - xp1[xi1] += xp1[xi1] + xp2[xi2] + xp3[xi3] + xp4[xi4]; + d_xp1[xi1] += d_xp1[xi1] + d_xp2[xi2] + d_xp3[xi3] + d_xp4[xi4]; } hypre_BoxLoop4End(xi1, xi2, xi3, xi4); +#undef DEVICE_VAR } hypre_EndTiming(time_index); - hypre_PrintTiming("Old BoxLoop times", hypre_MPI_COMM_WORLD); + hypre_PrintTiming("Old BoxLoop times [DEVICE]", hypre_MPI_COMM_WORLD); hypre_FinalizeTiming(time_index); hypre_ClearTiming(); /*----------------------------------------------------------- - * Time new boxloops + * Time new boxloops [Host] *-----------------------------------------------------------*/ /* Time BoxLoop0 */ @@ -415,7 +441,7 @@ main( hypre_int 
argc, } hypre_EndTiming(time_index); - hypre_PrintTiming("New BoxLoop times", hypre_MPI_COMM_WORLD); + hypre_PrintTiming("New BoxLoop times [HOST]", hypre_MPI_COMM_WORLD); hypre_FinalizeTiming(time_index); hypre_ClearTiming(); @@ -427,11 +453,19 @@ main( hypre_int argc, hypre_BoxDestroy(x2_data_box); hypre_BoxDestroy(x3_data_box); hypre_BoxDestroy(x4_data_box); + hypre_TFree(xp1, HYPRE_MEMORY_HOST); hypre_TFree(xp2, HYPRE_MEMORY_HOST); hypre_TFree(xp3, HYPRE_MEMORY_HOST); hypre_TFree(xp4, HYPRE_MEMORY_HOST); + hypre_TFree(d_xp1, HYPRE_MEMORY_DEVICE); + hypre_TFree(d_xp2, HYPRE_MEMORY_DEVICE); + hypre_TFree(d_xp3, HYPRE_MEMORY_DEVICE); + hypre_TFree(d_xp4, HYPRE_MEMORY_DEVICE); + + HYPRE_Finalize(); + /* Finalize MPI */ hypre_MPI_Finalize(); From 4ddcc4a27e62dc86f9c1484be020c045f833bc0d Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Fri, 29 Oct 2021 17:23:41 +0000 Subject: [PATCH 24/44] Fixes for compiler update on jlse --- src/test/Makefile | 1 - src/utilities/device_utils.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/test/Makefile b/src/test/Makefile index 975e702290..8f5cedba35 100644 --- a/src/test/Makefile +++ b/src/test/Makefile @@ -65,7 +65,6 @@ LFLAGS =\ HYPRE_DRIVERS =\ ij.c\ - simple.c\ ij_assembly.c\ sstruct.c\ struct.c\ diff --git a/src/utilities/device_utils.c b/src/utilities/device_utils.c index a845fe8303..8dd1092508 100644 --- a/src/utilities/device_utils.c +++ b/src/utilities/device_utils.c @@ -965,7 +965,7 @@ hypre_DeviceDataStream(hypre_DeviceData *data, HYPRE_Int i) catch (sycl::exception const& ex) { std::cout << "Caught asynchronous SYCL exception:" << std::endl - << ex.what() << ", OpenCL code: " << ex.code() << std::endl; + << ex.what() << ", OpenCL code: " << ex.get_cl_code() << std::endl; } } }; From 345b0d04a61d9f86faf32bb6e29b406ba48af9d2 Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Fri, 29 Oct 2021 20:29:13 +0000 Subject: [PATCH 25/44] Renamings --- src/IJ_mv/IJMatrix_parcsr_device.c | 8 +-- 
src/IJ_mv/IJVector_parcsr_device.c | 4 +- src/parcsr_ls/ads.c | 20 +++---- src/parcsr_ls/ame.c | 4 +- src/parcsr_ls/ams.c | 72 ++++++++++++------------- src/parcsr_ls/par_2s_interp_device.c | 14 ++--- src/parcsr_ls/par_coarsen_device.c | 8 +-- src/parcsr_ls/par_gauss_elim.c | 2 +- src/parcsr_ls/par_indepset_device.c | 6 +-- src/parcsr_ls/par_interp_device.c | 8 +-- src/parcsr_ls/par_interp_trunc_device.c | 4 +- src/parcsr_ls/par_lr_interp_device.c | 20 +++---- src/parcsr_ls/par_lr_restr_device.c | 4 +- src/parcsr_ls/par_relax_more_device.c | 4 +- src/parcsr_ls/par_strength_device.c | 4 +- src/parcsr_mv/par_csr_matop_device.c | 14 ++--- src/seq_mv/csr_matop_device.c | 28 +++++----- src/seq_mv/csr_spgemm_device_util.c | 6 +-- src/struct_mv/_hypre_struct_mv.hpp | 16 +++--- src/struct_mv/boxloop_cuda.h | 8 +-- src/struct_mv/boxloop_sycl.h | 8 +-- src/utilities/_hypre_utilities.hpp | 8 +-- src/utilities/device_utils.c | 45 ++++++++-------- src/utilities/device_utils.h | 8 +-- 24 files changed, 161 insertions(+), 162 deletions(-) diff --git a/src/IJ_mv/IJMatrix_parcsr_device.c b/src/IJ_mv/IJMatrix_parcsr_device.c index 157701dcee..1760f3f0db 100644 --- a/src/IJ_mv/IJMatrix_parcsr_device.c +++ b/src/IJ_mv/IJMatrix_parcsr_device.c @@ -153,8 +153,8 @@ hypre_IJMatrixSetAddValuesParCSRDevice( hypre_IJMatrix *matrix, HYPRE_Int *indicator = hypre_CTAlloc(HYPRE_Int, len, HYPRE_MEMORY_DEVICE); hypreDevice_CsrRowPtrsToIndices_v2(nrows-1, len1, (HYPRE_Int *) row_indexes, indicator); /* mark unwanted elements as -1 */ - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(len1, "thread", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(len1, "thread", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IJMatrixValues_dev1, gDim, bDim, len1, indicator, (HYPRE_Int *) row_indexes, ncols, indicator ); auto new_end = HYPRE_THRUST_CALL( @@ -216,8 +216,8 @@ 
hypre_IJMatrixAssembleSortAndReduce1(HYPRE_Int N0, HYPRE_BigInt *I0, HYPRE_Big HYPRE_Complex *A = hypre_TAlloc(HYPRE_Complex, N0, HYPRE_MEMORY_DEVICE); /* - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(N0, "thread", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(N0, "thread", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IJMatrixAssembleSortAndReduce1, gDim, bDim, N0, I0, J0, X0, A0 ); */ diff --git a/src/IJ_mv/IJVector_parcsr_device.c b/src/IJ_mv/IJVector_parcsr_device.c index a57bd9362c..b9afa8c67b 100644 --- a/src/IJ_mv/IJVector_parcsr_device.c +++ b/src/IJ_mv/IJVector_parcsr_device.c @@ -231,8 +231,8 @@ hypre_IJVectorAssembleParDevice(hypre_IJVector *vector) hypre_IJVectorAssembleSortAndReduce1(nelms, stack_i, stack_sora, stack_data, &new_nnz, &new_i, &new_sora, &new_data); /* set/add to local vector */ - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(new_nnz, "thread", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(new_nnz, "thread", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IJVectorAssemblePar, gDim, bDim, new_nnz, new_data, new_i, vec_start, new_sora, hypre_VectorData(hypre_ParVectorLocalVector(par_vector)) ); diff --git a/src/parcsr_ls/ads.c b/src/parcsr_ls/ads.c index 63a2c0f32f..e8e87b9047 100644 --- a/src/parcsr_ls/ads.c +++ b/src/parcsr_ls/ads.c @@ -573,13 +573,13 @@ HYPRE_Int hypre_ADSComputePi(hypre_ParCSRMatrix *A, Pi_diag_I, 3 * _1 ); - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(F2V_diag_nnz, "thread", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(F2V_diag_nnz, "thread", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim, F2V_diag_nnz, 3, F2V_diag_J, Pi_diag_J ); - gDim = 
hypre_GetDefaultCUDAGridDimension(F2V_diag_nrows, "warp", bDim); + gDim = hypre_GetDefaultDeviceGridDimension(F2V_diag_nrows, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim, F2V_diag_nrows, 3, F2V_diag_I, NULL, RT100_data, RT010_data, RT001_data, @@ -635,13 +635,13 @@ HYPRE_Int hypre_ADSComputePi(hypre_ParCSRMatrix *A, 3 * _1 ); } - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(F2V_offd_nnz, "thread", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(F2V_offd_nnz, "thread", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim, F2V_offd_nnz, 3, F2V_offd_J, Pi_offd_J ); - gDim = hypre_GetDefaultCUDAGridDimension(F2V_offd_nrows, "warp", bDim); + gDim = hypre_GetDefaultDeviceGridDimension(F2V_offd_nrows, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim, F2V_offd_nrows, 3, F2V_offd_I, NULL, RT100_data, RT010_data, RT001_data, @@ -843,8 +843,8 @@ HYPRE_Int hypre_ADSComputePixyz(hypre_ParCSRMatrix *A, F2V_diag_nnz, thrust::make_zip_iterator(thrust::make_tuple(Pix_diag_J, Piy_diag_J, Piz_diag_J)) ); - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(F2V_diag_nrows, "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(F2V_diag_nrows, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, F2V_diag_nrows, 3, F2V_diag_I, NULL, RT100_data, RT010_data, RT001_data, @@ -923,8 +923,8 @@ HYPRE_Int hypre_ADSComputePixyz(hypre_ParCSRMatrix *A, F2V_offd_nnz, thrust::make_zip_iterator(thrust::make_tuple(Pix_offd_J, Piy_offd_J, Piz_offd_J)) ); - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(F2V_offd_nrows, "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = 
hypre_GetDefaultDeviceGridDimension(F2V_offd_nrows, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, F2V_offd_nrows, 3, F2V_offd_I, NULL, RT100_data, RT010_data, RT001_data, diff --git a/src/parcsr_ls/ame.c b/src/parcsr_ls/ame.c index fd34f4e189..eea0c6f9ae 100644 --- a/src/parcsr_ls/ame.c +++ b/src/parcsr_ls/ame.c @@ -465,8 +465,8 @@ HYPRE_Int hypre_AMESetup(void *esolver) #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) if (exec == HYPRE_EXEC_DEVICE) { - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(nv, "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(nv, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_GtEliminateBoundary, gDim, bDim, nv, GtdI, GtdJ, GtdA, GtoI, GtoJ, GtoA, edge_bc, offd_edge_bc ); } diff --git a/src/parcsr_ls/ams.c b/src/parcsr_ls/ams.c index 7262ae0256..01fe07450d 100644 --- a/src/parcsr_ls/ams.c +++ b/src/parcsr_ls/ams.c @@ -190,8 +190,8 @@ HYPRE_Int hypre_ParVectorBlockSplit(hypre_ParVector *x, #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) if (exec == HYPRE_EXEC_DEVICE) { - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(size_ * dim, "thread", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(size_ * dim, "thread", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ParVectorBlockSplitGather<0>, gDim, bDim, size_, dim, x_data_[0], x_data_[1], x_data_[2], x_data); } @@ -233,8 +233,8 @@ HYPRE_Int hypre_ParVectorBlockGather(hypre_ParVector *x, #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) if (exec == HYPRE_EXEC_DEVICE) { - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(size_ * dim, "thread", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(size_ * 
dim, "thread", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ParVectorBlockSplitGather<1>, gDim, bDim, size_, dim, x_data_[0], x_data_[1], x_data_[2], x_data); } @@ -433,8 +433,8 @@ HYPRE_Int hypre_ParCSRMatrixFixZeroRowsDevice(hypre_ParCSRMatrix *A) HYPRE_Int num_cols_offd = hypre_CSRMatrixNumCols(A_offd); dim3 bDim, gDim; - bDim = hypre_GetDefaultCUDABlockDimension(); - gDim = hypre_GetDefaultCUDAGridDimension(nrows, "warp", bDim); + bDim = hypre_GetDefaultDeviceBlockDimension(); + gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); HYPRE_CUDA_LAUNCH(hypreCUDAKernel_ParCSRMatrixFixZeroRows, gDim, bDim, nrows, A_diag_i, A_diag_j, A_diag_data, A_offd_i, A_offd_data, num_cols_offd); @@ -761,8 +761,8 @@ HYPRE_Int hypre_ParCSRMatrixSetDiagRows(hypre_ParCSRMatrix *A, HYPRE_Real d) HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) ); if (exec == HYPRE_EXEC_DEVICE) { - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(num_rows, "thread", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_rows, "thread", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ParCSRMatrixSetDiagRows, gDim, bDim, num_rows, A_diag_I, A_diag_J, A_diag_data, A_offd_I, num_cols_offd, d); } @@ -1536,13 +1536,13 @@ HYPRE_Int hypre_AMSComputePi(hypre_ParCSRMatrix *A, Pi_diag_I, dim * _1 ); - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(G_diag_nnz, "thread", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nnz, "thread", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim, G_diag_nnz, dim, G_diag_J, Pi_diag_J ); - gDim = hypre_GetDefaultCUDAGridDimension(G_diag_nrows, "warp", bDim); + gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nrows, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, 
bDim, G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, Gy_data, Gz_data, @@ -1601,13 +1601,13 @@ HYPRE_Int hypre_AMSComputePi(hypre_ParCSRMatrix *A, dim * _1 ); } - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(G_offd_nnz, "thread", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nnz, "thread", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim, G_offd_nnz, dim, G_offd_J, Pi_offd_J ); - gDim = hypre_GetDefaultCUDAGridDimension(G_offd_nrows, "warp", bDim); + gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nrows, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim, G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, Gy_data, Gz_data, @@ -1835,8 +1835,8 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A, G_diag_nnz, thrust::make_zip_iterator(thrust::make_tuple(Pix_diag_J, Piy_diag_J, Piz_diag_J)) ); - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(G_diag_nrows, "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nrows, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, Gy_data, Gz_data, @@ -1901,8 +1901,8 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A, G_diag_nnz, thrust::make_zip_iterator(thrust::make_tuple(Pix_diag_J, Piy_diag_J)) ); - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(G_diag_nrows, "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nrows, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, Gy_data, NULL, @@ -1959,8 +1959,8 @@ HYPRE_Int 
hypre_AMSComputePixyz(hypre_ParCSRMatrix *A, G_diag_nnz, Pix_diag_J ); - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(G_diag_nrows, "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nrows, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, NULL, NULL, @@ -2036,8 +2036,8 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A, G_offd_nnz, thrust::make_zip_iterator(thrust::make_tuple(Pix_offd_J, Piy_offd_J, Piz_offd_J)) ); - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(G_offd_nrows, "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nrows, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, Gy_data, Gz_data, @@ -2118,8 +2118,8 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A, G_offd_nnz, thrust::make_zip_iterator(thrust::make_tuple(Pix_offd_J, Piy_offd_J)) ); - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(G_offd_nrows, "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nrows, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, Gy_data, NULL, @@ -2190,8 +2190,8 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A, G_offd_nnz, Pix_offd_J ); - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(G_offd_nrows, "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nrows, "warp", bDim); HYPRE_CUDA_LAUNCH( 
hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, NULL, NULL, @@ -2382,13 +2382,13 @@ HYPRE_Int hypre_AMSComputeGPi(hypre_ParCSRMatrix *A, GPi_diag_I, dim * _1 ); - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(G_diag_nnz, "thread", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nnz, "thread", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim, G_diag_nnz, dim, G_diag_J, GPi_diag_J ); - gDim = hypre_GetDefaultCUDAGridDimension(G_diag_nrows, "warp", bDim); + gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nrows, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputeGPi_copy2, gDim, bDim, G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, Gy_data, Gz_data, @@ -2448,13 +2448,13 @@ HYPRE_Int hypre_AMSComputeGPi(hypre_ParCSRMatrix *A, dim * _1 ); } - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(G_offd_nnz, "thread", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nnz, "thread", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim, G_offd_nnz, dim, G_offd_J, GPi_offd_J ); - gDim = hypre_GetDefaultCUDAGridDimension(G_offd_nrows, "warp", bDim); + gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nrows, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputeGPi_copy2, gDim, bDim, G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, Gy_data, Gz_data, @@ -2679,8 +2679,8 @@ HYPRE_Int hypre_AMSSetup(void *solver, #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) if (exec == HYPRE_EXEC_DEVICE) { - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(nv, "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(nv, "warp", 
bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_FixInterNodes, gDim, bDim, nv, G0tdI, G0tdA, G0toI, G0toA, interior_nodes_data ); } @@ -3244,8 +3244,8 @@ HYPRE_Int hypre_AMSSetup(void *solver, #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) if (exec == HYPRE_EXEC_DEVICE) { - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(Gt_num_rows, "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(Gt_num_rows, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSSetupScaleGGt, gDim, bDim, Gt_num_rows, Gt_diag_I, Gt_diag_J, Gt_diag_data, Gt_offd_I, Gt_offd_data, Gx_data, Gy_data, Gz_data ); diff --git a/src/parcsr_ls/par_2s_interp_device.c b/src/parcsr_ls/par_2s_interp_device.c index eab19cdd7e..15a497a04b 100644 --- a/src/parcsr_ls/par_2s_interp_device.c +++ b/src/parcsr_ls/par_2s_interp_device.c @@ -89,8 +89,8 @@ hypre_BoomerAMGBuildModPartialExtInterpDevice( hypre_ParCSRMatrix *A, /* weak row sum and diagonal, i.e., DF2F2 + Dgamma */ rsWA = hypre_TAlloc(HYPRE_Complex, A_nr_local, HYPRE_MEMORY_DEVICE); - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(A_nr_local, "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(A_nr_local, "warp", bDim); /* only for rows corresponding to F2 (notice flag == -1) */ HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_weak_rowsums, @@ -142,7 +142,7 @@ hypre_BoomerAMGBuildModPartialExtInterpDevice( hypre_ParCSRMatrix *A, /* add to rsW those in AF2F that correspond to Dbeta == 0 * diagnoally scale As_F2F (from both sides) and replace the diagonal */ - gDim = hypre_GetDefaultCUDAGridDimension(AF2F_nr_local, "warp", bDim); + gDim = hypre_GetDefaultDeviceGridDimension(AF2F_nr_local, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_MMInterpScaleAFF, gDim, bDim, @@ -304,8 +304,8 @@ 
hypre_BoomerAMGBuildModPartialExtPEInterpDevice( hypre_ParCSRMatrix *A, hypre_assert(AFC_nr_local == hypre_ParCSRMatrixNumRows(As_FF)); - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(AFC_nr_local, "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(AFC_nr_local, "warp", bDim); /* Generate D_lambda in the paper: D_beta + (row sum of AFF without diagonal elements / row_nnz) */ /* Generate D_tmp, i.e., D_mu / D_lambda */ @@ -364,7 +364,7 @@ hypre_BoomerAMGBuildModPartialExtPEInterpDevice( hypre_ParCSRMatrix *A, /* weak row sum and diagonal, i.e., DFF + Dgamma */ rsWA = hypre_TAlloc(HYPRE_Complex, A_nr_local, HYPRE_MEMORY_DEVICE); - gDim = hypre_GetDefaultCUDAGridDimension(A_nr_local, "warp", bDim); + gDim = hypre_GetDefaultDeviceGridDimension(A_nr_local, "warp", bDim); /* only for rows corresponding to F2 (notice flag == -1) */ HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_weak_rowsums, @@ -415,7 +415,7 @@ hypre_BoomerAMGBuildModPartialExtPEInterpDevice( hypre_ParCSRMatrix *A, /* add to rsW those in AFF that correspond to lam == 0 * diagnoally scale As_F2F (from both sides) and replace the diagonal */ - gDim = hypre_GetDefaultCUDAGridDimension(AF2F_nr_local, "warp", bDim); + gDim = hypre_GetDefaultDeviceGridDimension(AF2F_nr_local, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_MMPEInterpScaleAFF, gDim, bDim, diff --git a/src/parcsr_ls/par_coarsen_device.c b/src/parcsr_ls/par_coarsen_device.c index 38764b619f..6c30741003 100644 --- a/src/parcsr_ls/par_coarsen_device.c +++ b/src/parcsr_ls/par_coarsen_device.c @@ -317,8 +317,8 @@ hypre_PMISCoarseningInitDevice( hypre_ParCSRMatrix *S, /* in */ HYPRE_Int num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg); dim3 bDim, gDim; - bDim = hypre_GetDefaultCUDABlockDimension(); - gDim = hypre_GetDefaultCUDAGridDimension(num_rows_diag, "thread", bDim); + bDim = hypre_GetDefaultDeviceBlockDimension(); + gDim = 
hypre_GetDefaultDeviceGridDimension(num_rows_diag, "thread", bDim); hypre_ParCSRCommHandle *comm_handle; HYPRE_Int *new_end; @@ -484,8 +484,8 @@ hypre_PMISCoarseningUpdateCFDevice( hypre_ParCSRMatrix *S, /* in HYPRE_Int num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg); dim3 bDim, gDim; - bDim = hypre_GetDefaultCUDABlockDimension(); - gDim = hypre_GetDefaultCUDAGridDimension(graph_diag_size, "warp", bDim); + bDim = hypre_GetDefaultDeviceBlockDimension(); + gDim = hypre_GetDefaultDeviceGridDimension(graph_diag_size, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_PMISCoarseningUpdateCF, gDim, bDim, diff --git a/src/parcsr_ls/par_gauss_elim.c b/src/parcsr_ls/par_gauss_elim.c index fa6f58ac19..2a8c9f6189 100644 --- a/src/parcsr_ls/par_gauss_elim.c +++ b/src/parcsr_ls/par_gauss_elim.c @@ -418,7 +418,7 @@ hypreCUDAKernel_dgemv(HYPRE_Int m, HYPRE_Int hypre_dgemv_device(HYPRE_Int m, HYPRE_Int n, HYPRE_Int lda, HYPRE_Real *a, HYPRE_Real *x, HYPRE_Real *y) { dim3 bDim(BLOCK_SIZE, 1, 1); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(m, "thread", bDim); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(m, "thread", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_dgemv, gDim, bDim, m, n, lda, a, x, y ); diff --git a/src/parcsr_ls/par_indepset_device.c b/src/parcsr_ls/par_indepset_device.c index d031b6936b..bfebafebc1 100644 --- a/src/parcsr_ls/par_indepset_device.c +++ b/src/parcsr_ls/par_indepset_device.c @@ -167,8 +167,8 @@ hypre_BoomerAMGIndepSetDevice( hypre_ParCSRMatrix *S, /*------------------------------------------------------- * Remove nodes from the initial independent set *-------------------------------------------------------*/ - bDim = hypre_GetDefaultCUDABlockDimension(); - gDim = hypre_GetDefaultCUDAGridDimension(graph_diag_size, "warp", bDim); + bDim = hypre_GetDefaultDeviceBlockDimension(); + gDim = hypre_GetDefaultDeviceGridDimension(graph_diag_size, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IndepSetMain, gDim, bDim, graph_diag_size, graph_diag, 
measure_diag, measure_offd, @@ -184,7 +184,7 @@ hypre_BoomerAMGIndepSetDevice( hypre_ParCSRMatrix *S, hypre_ParCSRCommHandleDestroy(comm_handle); /* adjust IS_marker_diag from the received */ - gDim = hypre_GetDefaultCUDAGridDimension(num_elmts_send, "thread", bDim); + gDim = hypre_GetDefaultDeviceGridDimension(num_elmts_send, "thread", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IndepSetFixMarker, gDim, bDim, IS_marker_diag, num_elmts_send, send_map_elmts, diff --git a/src/parcsr_ls/par_interp_device.c b/src/parcsr_ls/par_interp_device.c index 714f846674..a7e5476ffb 100644 --- a/src/parcsr_ls/par_interp_device.c +++ b/src/parcsr_ls/par_interp_device.c @@ -175,8 +175,8 @@ hypre_BoomerAMGBuildDirInterpDevice( hypre_ParCSRMatrix *A, P_diag_i = hypre_TAlloc(HYPRE_Int, n_fine+1, memory_location); P_offd_i = hypre_TAlloc(HYPRE_Int, n_fine+1, memory_location); - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(n_fine, "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(n_fine, "warp", bDim); HYPRE_CUDA_LAUNCH( hypre_BoomerAMGBuildDirInterp_getnnz, gDim, bDim, n_fine, S_diag_i, S_diag_j, S_offd_i, S_offd_j, @@ -1124,8 +1124,8 @@ hypre_BoomerAMGBuildInterpOnePntDevice( hypre_ParCSRMatrix *A, P_diag_j_temp = hypre_CTAlloc(HYPRE_Int, n_fine, HYPRE_MEMORY_DEVICE); P_offd_j_temp = hypre_CTAlloc(HYPRE_Int, n_fine, HYPRE_MEMORY_DEVICE); - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(n_fine, "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(n_fine, "warp", bDim); HYPRE_CUDA_LAUNCH( hypre_BoomerAMGBuildInterpOnePnt_getnnz, gDim, bDim, n_fine, A_diag_i, A_strong_diag_j, A_diag_a, A_offd_i, A_strong_offd_j, diff --git a/src/parcsr_ls/par_interp_trunc_device.c b/src/parcsr_ls/par_interp_trunc_device.c index 30fc5147d2..4524f91f9e 100644 --- 
a/src/parcsr_ls/par_interp_trunc_device.c +++ b/src/parcsr_ls/par_interp_trunc_device.c @@ -156,8 +156,8 @@ hypre_BoomerAMGInterpTruncationDevice( hypre_ParCSRMatrix *P, HYPRE_Real trunc_f hypreDevice_CsrRowIndicesToPtrs_v2(nrows, nnz_P, P_i, P_rowptr); /* truncate P, unwanted entries are marked -1 in P_j */ - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(nrows, "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_InterpTruncation, gDim, bDim, nrows, trunc_factor, max_elmts, P_rowptr, P_j, P_a ); diff --git a/src/parcsr_ls/par_lr_interp_device.c b/src/parcsr_ls/par_lr_interp_device.c index 84e1ba4e21..43ac592e95 100644 --- a/src/parcsr_ls/par_lr_interp_device.c +++ b/src/parcsr_ls/par_lr_interp_device.c @@ -66,8 +66,8 @@ hypre_BoomerAMGBuildExtInterpDevice(hypre_ParCSRMatrix *A, /* row sum of A-weak + Diag(A), i.e., (D_gamma + D_alpha) in the notes, only for F-pts */ rsWA = hypre_TAlloc(HYPRE_Complex, A_nr_of_rows, HYPRE_MEMORY_DEVICE); - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(A_nr_of_rows, "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(A_nr_of_rows, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_weak_rowsums, gDim, bDim, @@ -109,7 +109,7 @@ hypre_BoomerAMGBuildExtInterpDevice(hypre_ParCSRMatrix *A, /* 5. Form matrix ~{A_FF}, (return twAFF in AFF data structure ) */ /* 6. 
Form matrix ~{A_FC}, (return twAFC in AFC data structure) */ hypre_GpuProfilingPushRange("Compute interp matrix"); - gDim = hypre_GetDefaultCUDAGridDimension(W_nr_of_rows, "warp", bDim); + gDim = hypre_GetDefaultDeviceGridDimension(W_nr_of_rows, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_aff_afc, gDim, bDim, W_nr_of_rows, @@ -252,8 +252,8 @@ hypre_BoomerAMGBuildExtPIInterpDevice( hypre_ParCSRMatrix *A, /* row sum of A-weak + Diag(A), i.e., (D_gamma + D_alpha) in the notes, only for F-pts */ rsWA = hypre_TAlloc(HYPRE_Complex, A_nr_of_rows, HYPRE_MEMORY_DEVICE); - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(A_nr_of_rows, "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(A_nr_of_rows, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_weak_rowsums, gDim, bDim, @@ -329,7 +329,7 @@ hypre_BoomerAMGBuildExtPIInterpDevice( hypre_ParCSRMatrix *A, AFF_diag_data_old ); hypre_GpuProfilingPushRange("Compute interp matrix"); - gDim = hypre_GetDefaultCUDAGridDimension(W_nr_of_rows, "warp", bDim); + gDim = hypre_GetDefaultDeviceGridDimension(W_nr_of_rows, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_twiaff_w, gDim, bDim, W_nr_of_rows, @@ -477,8 +477,8 @@ hypre_BoomerAMGBuildExtPEInterpDevice(hypre_ParCSRMatrix *A, /* row sum of A-weak + Diag(A), i.e., (D_gamma + D_FF) in the notes, only for F-pts */ rsWA = hypre_TAlloc(HYPRE_Complex, A_nr_of_rows, HYPRE_MEMORY_DEVICE); - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(A_nr_of_rows, "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(A_nr_of_rows, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_weak_rowsums, gDim, bDim, @@ -522,7 +522,7 @@ hypre_BoomerAMGBuildExtPEInterpDevice(hypre_ParCSRMatrix *A, dlam = hypre_TAlloc(HYPRE_Complex, W_nr_of_rows, 
HYPRE_MEMORY_DEVICE); dtmp = hypre_TAlloc(HYPRE_Complex, W_nr_of_rows, HYPRE_MEMORY_DEVICE); hypre_GpuProfilingPushRange("Compute D_tmp"); - gDim = hypre_GetDefaultCUDAGridDimension(W_nr_of_rows, "warp", bDim); + gDim = hypre_GetDefaultDeviceGridDimension(W_nr_of_rows, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_dlam_dtmp, gDim, bDim, W_nr_of_rows, @@ -562,7 +562,7 @@ hypre_BoomerAMGBuildExtPEInterpDevice(hypre_ParCSRMatrix *A, /* 5. Form matrix ~{A_FF}, (return twAFF in AFF data structure ) */ /* 6. Form matrix ~{A_FC}, (return twAFC in AFC data structure) */ hypre_GpuProfilingPushRange("Compute interp matrix"); - gDim = hypre_GetDefaultCUDAGridDimension(W_nr_of_rows, "warp", bDim); + gDim = hypre_GetDefaultDeviceGridDimension(W_nr_of_rows, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_aff_afc_epe, gDim, bDim, W_nr_of_rows, diff --git a/src/parcsr_ls/par_lr_restr_device.c b/src/parcsr_ls/par_lr_restr_device.c index ff5e6450a3..104ec87451 100644 --- a/src/parcsr_ls/par_lr_restr_device.c +++ b/src/parcsr_ls/par_lr_restr_device.c @@ -245,8 +245,8 @@ hypre_BoomerAMGBuildRestrNeumannAIRDevice( hypre_ParCSRMatrix *A, thrust::plus() ); /* assemble the diagonal part of R from Z */ - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(n_fine, "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(n_fine, "warp", bDim); HYPRE_CUDA_LAUNCH( hypre_BoomerAMGBuildRestrNeumannAIR_assembleRdiag, gDim, bDim, n_cpts, Fmap, Cmap, Z_diag_i, Z_diag_j, Z_diag_a, R_diag_i, R_diag_j, R_diag_a); diff --git a/src/parcsr_ls/par_relax_more_device.c b/src/parcsr_ls/par_relax_more_device.c index 6cf1769b62..657905f3d9 100644 --- a/src/parcsr_ls/par_relax_more_device.c +++ b/src/parcsr_ls/par_relax_more_device.c @@ -151,8 +151,8 @@ hypre_ParCSRMaxEigEstimateDevice( hypre_ParCSRMatrix *A, dim3 bDim, gDim; - bDim = hypre_GetDefaultCUDABlockDimension(); - gDim = 
hypre_GetDefaultCUDAGridDimension(A_num_rows, "warp", bDim); + bDim = hypre_GetDefaultDeviceBlockDimension(); + gDim = hypre_GetDefaultDeviceGridDimension(A_num_rows, "warp", bDim); HYPRE_CUDA_LAUNCH(hypreCUDAKernel_CSRMaxEigEstimate, gDim, bDim, diff --git a/src/parcsr_ls/par_strength_device.c b/src/parcsr_ls/par_strength_device.c index 196ebd0051..a2ca43fc8e 100644 --- a/src/parcsr_ls/par_strength_device.c +++ b/src/parcsr_ls/par_strength_device.c @@ -134,8 +134,8 @@ hypre_BoomerAMGCreateSDevice(hypre_ParCSRMatrix *A, } /* count the row nnz of S */ - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(num_variables, "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_variables, "warp", bDim); if (abs_soc) { diff --git a/src/parcsr_mv/par_csr_matop_device.c b/src/parcsr_mv/par_csr_matop_device.c index c0d44f3d2e..251e28d3a6 100644 --- a/src/parcsr_mv/par_csr_matop_device.c +++ b/src/parcsr_mv/par_csr_matop_device.c @@ -617,8 +617,8 @@ hypre_ConcatDiagAndOffdDevice(hypre_ParCSRMatrix *A) hypre_CSRMatrixI(B) + hypre_CSRMatrixNumRows(B) + 1, hypre_CSRMatrixI(B) ); - const dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - const dim3 gDim = hypre_GetDefaultCUDAGridDimension(hypre_CSRMatrixNumRows(A_diag), "warp", bDim); + const dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + const dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A_diag), "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ConcatDiagAndOffd, gDim, bDim, @@ -732,8 +732,8 @@ hypre_ConcatDiagOffdAndExtDevice(hypre_ParCSRMatrix *A, hypre_CSRMatrixI(B) + hypre_ParCSRMatrixNumRows(A) + 1, hypre_CSRMatrixI(B) ); - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(hypre_ParCSRMatrixNumRows(A), "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = 
hypre_GetDefaultDeviceGridDimension(hypre_ParCSRMatrixNumRows(A), "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ConcatDiagAndOffd, gDim, bDim, @@ -761,7 +761,7 @@ hypre_ConcatDiagOffdAndExtDevice(hypre_ParCSRMatrix *A, hypre_CSRMatrixI(B) + hypre_ParCSRMatrixNumRows(A) + 1, thrust::plus() ); - gDim = hypre_GetDefaultCUDAGridDimension(hypre_CSRMatrixNumRows(E), "warp", bDim); + gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(E), "warp", bDim); hypre_assert(hypre_CSRMatrixNumCols(E_diag) == hypre_CSRMatrixNumCols(A_diag)); @@ -1192,8 +1192,8 @@ hypre_ParCSRMatrixDropSmallEntriesDevice( hypre_ParCSRMatrix *A, elmt_tols_offd = hypre_TAlloc(HYPRE_Real, hypre_CSRMatrixNumNonzeros(A_offd), HYPRE_MEMORY_DEVICE); } - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(hypre_CSRMatrixNumRows(A_diag), "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A_diag), "warp", bDim); if (type == -1) { diff --git a/src/seq_mv/csr_matop_device.c b/src/seq_mv/csr_matop_device.c index e4a5c98e74..9b428d0553 100644 --- a/src/seq_mv/csr_matop_device.c +++ b/src/seq_mv/csr_matop_device.c @@ -653,8 +653,8 @@ hypre_CSRMatrixMoveDiagFirstDevice( hypre_CSRMatrix *A ) HYPRE_Int *A_j = hypre_CSRMatrixJ(A); dim3 bDim, gDim; - bDim = hypre_GetDefaultCUDABlockDimension(); - gDim = hypre_GetDefaultCUDAGridDimension(nrows, "warp", bDim); + bDim = hypre_GetDefaultDeviceBlockDimension(); + gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); HYPRE_CUDA_LAUNCH(hypreCUDAKernel_CSRMoveDiagFirst, gDim, bDim, nrows, A_i, A_j, A_data); @@ -689,8 +689,8 @@ hypre_CSRMatrixCheckDiagFirstDevice( hypre_CSRMatrix *A ) return 0; } - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(hypre_CSRMatrixNumRows(A), "thread", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = 
hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "thread", bDim); HYPRE_Int *result = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRCheckDiagFirst, gDim, bDim, @@ -778,8 +778,8 @@ hypre_CSRMatrixFixZeroDiagDevice( hypre_CSRMatrix *A, return ierr; } - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(hypre_CSRMatrixNumRows(A), "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "warp", bDim); #if HYPRE_DEBUG HYPRE_Int *result = hypre_CTAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); @@ -873,8 +873,8 @@ hypre_CSRMatrixReplaceDiagDevice( hypre_CSRMatrix *A, return ierr; } - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(hypre_CSRMatrixNumRows(A), "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "warp", bDim); #if HYPRE_DEBUG HYPRE_Int *result = hypre_CTAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); @@ -1072,8 +1072,8 @@ hypre_CSRMatrixComputeRowSumDevice( hypre_CSRMatrix *A, HYPRE_Int *A_j = hypre_CSRMatrixJ(A); dim3 bDim, gDim; - bDim = hypre_GetDefaultCUDABlockDimension(); - gDim = hypre_GetDefaultCUDAGridDimension(nrows, "warp", bDim); + bDim = hypre_GetDefaultDeviceBlockDimension(); + gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); if (type == 0) { @@ -1179,8 +1179,8 @@ hypre_CSRMatrixExtractDiagonalDevice( hypre_CSRMatrix *A, HYPRE_Int *A_j = hypre_CSRMatrixJ(A); dim3 bDim, gDim; - bDim = hypre_GetDefaultCUDABlockDimension(); - gDim = hypre_GetDefaultCUDAGridDimension(nrows, "warp", bDim); + bDim = hypre_GetDefaultDeviceBlockDimension(); + gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); HYPRE_CUDA_LAUNCH( 
hypreCUDAKernel_CSRExtractDiag, gDim, bDim, nrows, A_i, A_j, A_data, d, type ); @@ -1449,8 +1449,8 @@ hypre_CSRMatrixIntersectPattern(hypre_CSRMatrix *A, hypre_TMemcpy(markA, hypre_CSRMatrixJ(A), HYPRE_Int, nnzA, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(nnzA + nnzB, "thread", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(nnzA + nnzB, "thread", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRMatrixIntersectPattern, gDim, bDim, nnzA + nnzB, nnzA, Cii, Cjj, idx, markA, diag_opt ); diff --git a/src/seq_mv/csr_spgemm_device_util.c b/src/seq_mv/csr_spgemm_device_util.c index 63270e43b4..9514be1f1a 100644 --- a/src/seq_mv/csr_spgemm_device_util.c +++ b/src/seq_mv/csr_spgemm_device_util.c @@ -97,19 +97,19 @@ hypre_SpGemmCreateGlobalHashTable( HYPRE_Int num_rows, /* number of hypre_assert(type == 2 || num_ghash <= num_rows); HYPRE_Int *ghash_i, ghash_size; - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); if (type == 1) { ghash_i = hypre_TAlloc(HYPRE_Int, num_ghash + 1, HYPRE_MEMORY_DEVICE); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(num_ghash, "thread", bDim); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_ghash, "thread", bDim); HYPRE_CUDA_LAUNCH( hypre_SpGemmGhashSize1, gDim, bDim, num_rows, row_id, num_ghash, row_sizes, ghash_i, SHMEM_HASH_SIZE ); } else if (type == 2) { ghash_i = hypre_CTAlloc(HYPRE_Int, num_ghash + 1, HYPRE_MEMORY_DEVICE); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(num_rows, "thread", bDim); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_rows, "thread", bDim); HYPRE_CUDA_LAUNCH( hypre_SpGemmGhashSize2, gDim, bDim, num_rows, row_id, num_ghash, row_sizes, ghash_i, SHMEM_HASH_SIZE ); } diff --git a/src/struct_mv/_hypre_struct_mv.hpp b/src/struct_mv/_hypre_struct_mv.hpp index e48daf8bf2..39a897a9a9 100644 --- 
a/src/struct_mv/_hypre_struct_mv.hpp +++ b/src/struct_mv/_hypre_struct_mv.hpp @@ -794,8 +794,8 @@ BoxLoopforall( HYPRE_Int length, } else if (exec_policy == HYPRE_EXEC_DEVICE) { - const dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - const dim3 gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); + const dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + const dim3 gDim = hypre_GetDefaultDeviceGridDimension(length, "thread", bDim); HYPRE_CUDA_LAUNCH( forall_kernel, gDim, bDim, loop_body, length ); } @@ -845,8 +845,8 @@ ReductionBoxLoopforall( HYPRE_Int length, } else if (exec_policy == HYPRE_EXEC_DEVICE) { - const dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); + const dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(length, "thread", bDim); /* Note: we assume gDim cannot exceed 1024 * and bDim < WARP * WARP @@ -1210,8 +1210,8 @@ BoxLoopforall( HYPRE_Int length, else if (exec_policy == HYPRE_EXEC_DEVICE) { /* WM: question - is it better in sycl to launch parallel_for with blocks in this way as we do for cuda? */ - const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); - const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); + const sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension(); + const sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(length, "thread", bDim); hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) { @@ -1247,8 +1247,8 @@ ReductionBoxLoopforall( LOOP_BODY loop_body, { /* WM: question - is it better in sycl to launch parallel_for with blocks in this way as we do for cuda? 
*/ /* NOTE: in the cuda version, there is further manipulation of bDim and gDim that I don't include here */ - const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); - const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); + const sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension(); + const sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(length, "thread", bDim); hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) { diff --git a/src/struct_mv/boxloop_cuda.h b/src/struct_mv/boxloop_cuda.h index a5e54462c6..cd477fe2eb 100644 --- a/src/struct_mv/boxloop_cuda.h +++ b/src/struct_mv/boxloop_cuda.h @@ -70,8 +70,8 @@ BoxLoopforall( HYPRE_Int length, } else if (exec_policy == HYPRE_EXEC_DEVICE) { - const dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - const dim3 gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); + const dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + const dim3 gDim = hypre_GetDefaultDeviceGridDimension(length, "thread", bDim); HYPRE_CUDA_LAUNCH( forall_kernel, gDim, bDim, loop_body, length ); } @@ -121,8 +121,8 @@ ReductionBoxLoopforall( HYPRE_Int length, } else if (exec_policy == HYPRE_EXEC_DEVICE) { - const dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); + const dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(length, "thread", bDim); /* Note: we assume gDim cannot exceed 1024 * and bDim < WARP * WARP diff --git a/src/struct_mv/boxloop_sycl.h b/src/struct_mv/boxloop_sycl.h index 02c90e6331..1527f3f1b1 100644 --- a/src/struct_mv/boxloop_sycl.h +++ b/src/struct_mv/boxloop_sycl.h @@ -58,8 +58,8 @@ BoxLoopforall( HYPRE_Int length, else if (exec_policy == HYPRE_EXEC_DEVICE) { /* WM: question - is it better in sycl to launch parallel_for with blocks in this way as we do for cuda? 
*/ - const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); - const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); + const sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension(); + const sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(length, "thread", bDim); hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) { @@ -95,8 +95,8 @@ ReductionBoxLoopforall( LOOP_BODY loop_body, { /* WM: question - is it better in sycl to launch parallel_for with blocks in this way as we do for cuda? */ /* NOTE: in the cuda version, there is further manipulation of bDim and gDim that I don't include here */ - const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); - const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); + const sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension(); + const sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(length, "thread", bDim); hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) { diff --git a/src/utilities/_hypre_utilities.hpp b/src/utilities/_hypre_utilities.hpp index 4f062c7c06..d8276869d7 100644 --- a/src/utilities/_hypre_utilities.hpp +++ b/src/utilities/_hypre_utilities.hpp @@ -523,9 +523,9 @@ hypre_int hypre_cuda_get_grid_warp_id(sycl::nd_item& item) } /* device_utils.c */ -sycl::range<1> hypre_GetDefaultCUDABlockDimension(); +sycl::range<1> hypre_GetDefaultDeviceBlockDimension(); -sycl::range<1> hypre_GetDefaultCUDAGridDimension( HYPRE_Int n, const char *granularity, sycl::range<1> bDim ); +sycl::range<1> hypre_GetDefaultDeviceGridDimension( HYPRE_Int n, const char *granularity, sycl::range<1> bDim ); #endif // #if defined(HYPRE_USING_SYCL) @@ -1106,9 +1106,9 @@ struct print_functor }; /* device_utils.c */ -dim3 hypre_GetDefaultCUDABlockDimension(); +dim3 hypre_GetDefaultDeviceBlockDimension(); -dim3 hypre_GetDefaultCUDAGridDimension( HYPRE_Int n, const char *granularity, dim3 bDim 
); +dim3 hypre_GetDefaultDeviceGridDimension( HYPRE_Int n, const char *granularity, dim3 bDim ); template HYPRE_Int hypreDevice_StableSortByTupleKey(HYPRE_Int N, T1 *keys1, T2 *keys2, T3 *vals, HYPRE_Int opt); diff --git a/src/utilities/device_utils.c b/src/utilities/device_utils.c index 8dd1092508..72a30c73be 100644 --- a/src/utilities/device_utils.c +++ b/src/utilities/device_utils.c @@ -9,14 +9,13 @@ #include "_hypre_utilities.hpp" #if defined(HYPRE_USING_SYCL) -sycl::range<1> hypre_GetDefaultCUDABlockDimension() +sycl::range<1> hypre_GetDefaultDeviceBlockDimension() { sycl::range<1> wgDim(hypre_HandleDeviceMaxWorkGroupSize(hypre_handle())); return wgDim; } -// WM: TODO: verify -sycl::range<1> hypre_GetDefaultCUDAGridDimension(HYPRE_Int n, +sycl::range<1> hypre_GetDefaultDeviceGridDimension(HYPRE_Int n, const char *granularity, sycl::range<1> wgDim) { @@ -110,7 +109,7 @@ void hypre_CudaCompileFlagCheck() } dim3 -hypre_GetDefaultCUDABlockDimension() +hypre_GetDefaultDeviceBlockDimension() { dim3 bDim(512, 1, 1); @@ -118,7 +117,7 @@ hypre_GetDefaultCUDABlockDimension() } dim3 -hypre_GetDefaultCUDAGridDimension( HYPRE_Int n, +hypre_GetDefaultDeviceGridDimension( HYPRE_Int n, const char *granularity, dim3 bDim ) { @@ -182,8 +181,8 @@ HYPRE_Int hypreDevice_GetRowNnz(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int *d_diag_ia, HYPRE_Int *d_offd_ia, HYPRE_Int *d_rownnz) { - const dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - const dim3 gDim = hypre_GetDefaultCUDAGridDimension(nrows, "thread", bDim); + const dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + const dim3 gDim = hypre_GetDefaultDeviceGridDimension(nrows, "thread", bDim); /* trivial case */ if (nrows <= 0) @@ -321,8 +320,8 @@ hypreDevice_CopyParCSRRows(HYPRE_Int nrows, hypre_assert(!(nrows > 1 && d_ib == NULL)); - const dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - const dim3 gDim = hypre_GetDefaultCUDAGridDimension(nrows, "warp", bDim); + const dim3 bDim = 
hypre_GetDefaultDeviceBlockDimension(); + const dim3 gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); /* if (job == 2) @@ -570,8 +569,8 @@ hypreDevice_GenScatterAdd(HYPRE_Real *x, HYPRE_Int ny, HYPRE_Int *map, HYPRE_Rea hypre_assert(reduced_n == new_end.second - reduced_y); - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(reduced_n, "thread", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(reduced_n, "thread", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ScatterAdd, gDim, bDim, reduced_n, x, reduced_map, reduced_y ); @@ -613,8 +612,8 @@ hypreDevice_ScatterConstant(T *x, HYPRE_Int n, HYPRE_Int *map, T v) return hypre_error_flag; } - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(n, "thread", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ScatterConstant, gDim, bDim, x, n, map, v ); @@ -645,8 +644,8 @@ hypreDevice_IVAXPY(HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex *x, HYPRE_Comple return hypre_error_flag; } - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(n, "thread", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IVAXPY, gDim, bDim, n, a, x, y ); @@ -677,8 +676,8 @@ hypreDevice_IVAXPYMarked(HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex *x, HYPRE_ return hypre_error_flag; } - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(n, "thread", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IVAXPYMarked, gDim, bDim, n, a, x, y, marker, marker_val ); @@ 
-714,8 +713,8 @@ hypreDevice_DiagScaleVector(HYPRE_Int n, HYPRE_Int *A_i, HYPRE_Complex *A_data, return hypre_error_flag; } - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(n, "thread", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_DiagScaleVector, gDim, bDim, n, A_i, A_data, x, beta, y ); @@ -747,8 +746,8 @@ hypreDevice_DiagScaleVector2(HYPRE_Int n, HYPRE_Int *A_i, HYPRE_Complex *A_data, return hypre_error_flag; } - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(n, "thread", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_DiagScaleVector2, gDim, bDim, n, A_i, A_data, x, beta, y, z ); @@ -771,8 +770,8 @@ hypreCUDAKernel_BigToSmallCopy( HYPRE_Int* __restrict__ tgt, HYPRE_Int hypreDevice_BigToSmallCopy(HYPRE_Int *tgt, const HYPRE_BigInt *src, HYPRE_Int size) { - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(size, "thread", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(size, "thread", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_BigToSmallCopy, gDim, bDim, tgt, src, size); diff --git a/src/utilities/device_utils.h b/src/utilities/device_utils.h index 7123aefaaf..9127222d8b 100644 --- a/src/utilities/device_utils.h +++ b/src/utilities/device_utils.h @@ -466,9 +466,9 @@ hypre_int hypre_cuda_get_grid_warp_id(sycl::nd_item& item) } /* device_utils.c */ -sycl::range<1> hypre_GetDefaultCUDABlockDimension(); +sycl::range<1> hypre_GetDefaultDeviceBlockDimension(); -sycl::range<1> hypre_GetDefaultCUDAGridDimension( HYPRE_Int n, const char *granularity, sycl::range<1> bDim ); +sycl::range<1> hypre_GetDefaultDeviceGridDimension( 
HYPRE_Int n, const char *granularity, sycl::range<1> bDim ); #endif // #if defined(HYPRE_USING_SYCL) @@ -1049,9 +1049,9 @@ struct print_functor }; /* device_utils.c */ -dim3 hypre_GetDefaultCUDABlockDimension(); +dim3 hypre_GetDefaultDeviceBlockDimension(); -dim3 hypre_GetDefaultCUDAGridDimension( HYPRE_Int n, const char *granularity, dim3 bDim ); +dim3 hypre_GetDefaultDeviceGridDimension( HYPRE_Int n, const char *granularity, dim3 bDim ); template HYPRE_Int hypreDevice_StableSortByTupleKey(HYPRE_Int N, T1 *keys1, T2 *keys2, T3 *vals, HYPRE_Int opt); From f48eec0cdc6baefa7cb9ce60fc9fadaa54cf26cf Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Fri, 29 Oct 2021 21:25:56 +0000 Subject: [PATCH 26/44] Try different formulation of reduction Uses shared memory pointer instead of buffers and accessors. Seems to work on iris, same error as before on arcticus. --- src/struct_mv/_hypre_struct_mv.hpp | 19 +++++++++++-------- src/struct_mv/boxloop_sycl.h | 19 +++++++++++-------- 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/src/struct_mv/_hypre_struct_mv.hpp b/src/struct_mv/_hypre_struct_mv.hpp index 39a897a9a9..ee421fb85d 100644 --- a/src/struct_mv/_hypre_struct_mv.hpp +++ b/src/struct_mv/_hypre_struct_mv.hpp @@ -1224,7 +1224,7 @@ template void ReductionBoxLoopforall( LOOP_BODY loop_body, HYPRE_Int length, - sycl::buffer sum_buf ) + HYPRE_Real *hypre_sycl_sum ) { if (length <= 0) { @@ -1252,8 +1252,7 @@ ReductionBoxLoopforall( LOOP_BODY loop_body, hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) { - sycl::accessor sum_acc(sum_buf, cgh, sycl::read_write); - cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), sycl::ext::oneapi::reduction(sum_acc, std::plus<>()), loop_body); + cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), sycl::reduction(hypre_sycl_sum, std::plus<>()), loop_body); }).wait_and_throw(); } } @@ -1506,7 +1505,8 @@ else \ { \ hypre_newBoxLoopInit(ndim, loop_size); \ hypre_BoxLoopDataDeclareK(1, ndim, loop_size, 
dbox1, start1, stride1); \ - sycl::buffer sum_buf(&sum_var, 1); \ + HYPRE_Real *hypre_sycl_sum = hypre_CTAlloc(HYPRE_Real, 1, HYPRE_MEMORY_DEVICE); \ + hypre_TMemcpy(hypre_sycl_sum, &sum_var, HYPRE_Real, 1, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST); \ ReductionBoxLoopforall( [=] (sycl::nd_item<1> item, auto &sum) \ { \ HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ @@ -1517,7 +1517,8 @@ else \ #define hypre_newBoxLoop1ReductionEnd(i1, sum_var) \ } \ - }, hypre__tot, sum_buf); \ + }, hypre__tot, hypre_sycl_sum); \ + hypre_TMemcpy(&sum_var, hypre_sycl_sum, HYPRE_Real, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE); \ } /* Reduction BoxLoop2 */ @@ -1529,7 +1530,8 @@ else \ hypre_newBoxLoopInit(ndim, loop_size); \ hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ hypre_BoxLoopDataDeclareK(2, ndim, loop_size, dbox2, start2, stride2); \ - sycl::buffer sum_buf(&sum_var, 1); \ + HYPRE_Real *hypre_sycl_sum = hypre_CTAlloc(HYPRE_Real, 1, HYPRE_MEMORY_DEVICE); \ + hypre_TMemcpy(hypre_sycl_sum, &sum_var, HYPRE_Real, 1, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST); \ ReductionBoxLoopforall( [=] (sycl::nd_item<1> item, auto &sum) \ { \ HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ @@ -1541,7 +1543,8 @@ else \ #define hypre_newBoxLoop2ReductionEnd(i1, i2, sum_var) \ } \ - }, hypre__tot, sum_buf); \ + }, hypre__tot, hypre_sycl_sum); \ + hypre_TMemcpy(&sum_var, hypre_sycl_sum, HYPRE_Real, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE); \ } /* Plain parallel_for loop */ @@ -1556,7 +1559,7 @@ else \ #define hypre_LoopEnd() \ } \ - }); \ + }); \ } diff --git a/src/struct_mv/boxloop_sycl.h b/src/struct_mv/boxloop_sycl.h index 1527f3f1b1..af8f1d9f9d 100644 --- a/src/struct_mv/boxloop_sycl.h +++ b/src/struct_mv/boxloop_sycl.h @@ -72,7 +72,7 @@ template void ReductionBoxLoopforall( LOOP_BODY loop_body, HYPRE_Int length, - sycl::buffer sum_buf ) + HYPRE_Real *hypre_sycl_sum ) { if (length <= 0) { @@ -100,8 +100,7 @@ ReductionBoxLoopforall( LOOP_BODY 
loop_body, hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) { - sycl::accessor sum_acc(sum_buf, cgh, sycl::read_write); - cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), sycl::ext::oneapi::reduction(sum_acc, std::plus<>()), loop_body); + cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), sycl::reduction(hypre_sycl_sum, std::plus<>()), loop_body); }).wait_and_throw(); } } @@ -354,7 +353,8 @@ else \ { \ hypre_newBoxLoopInit(ndim, loop_size); \ hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ - sycl::buffer sum_buf(&sum_var, 1); \ + HYPRE_Real *hypre_sycl_sum = hypre_CTAlloc(HYPRE_Real, 1, HYPRE_MEMORY_DEVICE); \ + hypre_TMemcpy(hypre_sycl_sum, &sum_var, HYPRE_Real, 1, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST); \ ReductionBoxLoopforall( [=] (sycl::nd_item<1> item, auto &sum) \ { \ HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ @@ -365,7 +365,8 @@ else \ #define hypre_newBoxLoop1ReductionEnd(i1, sum_var) \ } \ - }, hypre__tot, sum_buf); \ + }, hypre__tot, hypre_sycl_sum); \ + hypre_TMemcpy(&sum_var, hypre_sycl_sum, HYPRE_Real, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE); \ } /* Reduction BoxLoop2 */ @@ -377,7 +378,8 @@ else \ hypre_newBoxLoopInit(ndim, loop_size); \ hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ hypre_BoxLoopDataDeclareK(2, ndim, loop_size, dbox2, start2, stride2); \ - sycl::buffer sum_buf(&sum_var, 1); \ + HYPRE_Real *hypre_sycl_sum = hypre_CTAlloc(HYPRE_Real, 1, HYPRE_MEMORY_DEVICE); \ + hypre_TMemcpy(hypre_sycl_sum, &sum_var, HYPRE_Real, 1, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST); \ ReductionBoxLoopforall( [=] (sycl::nd_item<1> item, auto &sum) \ { \ HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ @@ -389,7 +391,8 @@ else \ #define hypre_newBoxLoop2ReductionEnd(i1, i2, sum_var) \ } \ - }, hypre__tot, sum_buf); \ + }, hypre__tot, hypre_sycl_sum); \ + hypre_TMemcpy(&sum_var, hypre_sycl_sum, HYPRE_Real, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE); \ } 
/* Plain parallel_for loop */ @@ -404,7 +407,7 @@ else \ #define hypre_LoopEnd() \ } \ - }); \ + }); \ } From 980bee52687b189673e33224db214b066f117c21 Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Tue, 2 Nov 2021 18:16:49 +0000 Subject: [PATCH 27/44] Autoconf clean up --- src/config/HYPRE_config.h.in | 15 +++- src/config/configure.in | 74 +++++------------- src/configure | 143 +++++++++++++++-------------------- src/test/Makefile | 8 +- 4 files changed, 96 insertions(+), 144 deletions(-) diff --git a/src/config/HYPRE_config.h.in b/src/config/HYPRE_config.h.in index fd9398adfa..e9c034d3fd 100644 --- a/src/config/HYPRE_config.h.in +++ b/src/config/HYPRE_config.h.in @@ -196,9 +196,6 @@ /* HIP being used */ #undef HYPRE_USING_HIP -/* SYCL being used */ -#undef HYPRE_USING_SYCL - /* Define to 1 if using host memory only */ #undef HYPRE_USING_HOST_MEMORY @@ -220,6 +217,15 @@ /* NVTX being used */ #undef HYPRE_USING_NVTX +/* onemkl::BLAS being used */ +#undef HYPRE_USING_ONEMKLBLAS + +/* onemkl::rng being used */ +#undef HYPRE_USING_ONEMKLRAND + +/* onemkl::SPARSE being used */ +#undef HYPRE_USING_ONEMKLSPARSE + /* Enable OpenMP support */ #undef HYPRE_USING_OPENMP @@ -241,6 +247,9 @@ /* Define to 1 if using AMD rocTX profiling */ #undef HYPRE_USING_ROCTX +/* SYCL being used */ +#undef HYPRE_USING_SYCL + /* Define to 1 if using UMPIRE */ #undef HYPRE_USING_UMPIRE diff --git a/src/config/configure.in b/src/config/configure.in index 4f282ead8a..3c1f0b16fe 100644 --- a/src/config/configure.in +++ b/src/config/configure.in @@ -204,8 +204,6 @@ hypre_using_onemklsparse=no hypre_using_onemklblas=no hypre_using_onemklrand=no -hypre_found_sycl=no - dnl ********************************************************************* dnl * Initialize flag-check variables @@ -1568,6 +1566,11 @@ then then AC_CHECK_PROGS(CUCC, hipcc) fi + + if test "$hypre_using_sycl" = "yes" + then + AC_CHECK_PROGS(CUCC, dpcpp) + fi fi dnl 
********************************************************************* @@ -1976,9 +1979,9 @@ fi if [test "x$hypre_using_um" = "xyes"] then - if [test "x$hypre_using_cuda" != "xyes" && test "x$hypre_using_device_openmp" != "xyes" && test "x$hypre_using_hip" != "xyes"] + if [test "x$hypre_using_cuda" != "xyes" && test "x$hypre_using_device_openmp" != "xyes" && test "x$hypre_using_hip" != "xyes" && test "x$hypre_using_sycl" != "xyes"] then - AC_MSG_ERROR([Asked for unified memory, but not using CUDA, HIP, or device OpenMP!]) + AC_MSG_ERROR([Asked for unified memory, but not using CUDA, HIP, SYCL, or device OpenMP!]) fi fi dnl hypre_using_um @@ -2019,27 +2022,6 @@ AS_IF([ test x"$hypre_using_hip" == x"yes" ], [AC_MSG_ERROR([unable to find ${HYPRE_ROCM_PREFIX}/include/hip/hip_common.h ... Ensure ROCm is installed and set ROCM_PATH environment variable to ROCm installation path.])] )], []) -dnl ********************************************************************* -dnl * Check for SYCL header -dnl ********************************************************************* - -dnl If the user has requested to use SYCL, we first check the environment -dnl for ONEAPI_PATH to point at the oneAPI installation. If that is not found, -dnl then we default to `/opt/intel/oneapi`. -dnl -dnl TODO: Add an ARG_WITH for sycl so the user can control the oneAPI path -dnl through the configure line -AS_IF([ test x"$hypre_using_sycl" == x"yes" ], - [ AS_IF([ test -n "$ONEAPI_PATH"], - [ HYPRE_SYCL_PREFIX=$ONEAPI_PATH ], - [ HYPRE_SYCL_PREFIX=/opt/intel/oneapi ]) - - AC_SUBST(HYPRE_SYCL_PREFIX) - AC_CHECK_HEADERS( ["${HYPRE_SYCL_PREFIX}/compiler/latest/linux/include/sycl/CL/sycl.hpp"], - [hypre_found_sycl=yes], - [AC_MSG_ERROR([unable to find ${HYPRE_SYCL_PREFIX}/compiler/latest/linux/include/sycl/CL/sycl.hpp ... 
Ensure oneAPI SDK is installed and set ONEAPI_PATH environment variable to oneAPI installation path.])] )], - []) - dnl ********************************************************************* dnl * Set raja options dnl ********************************************************************* @@ -2317,43 +2299,22 @@ AS_IF([test x"$hypre_using_hip" == x"yes"], dnl ********************************************************************* dnl * Set SYCL options dnl ********************************************************************* -AS_IF([test x"$hypre_user_chose_sycl" == x"yes"], +AS_IF([test x"$hypre_using_sycl" == x"yes"], [ AC_DEFINE(HYPRE_USING_GPU, 1, [Define to 1 if executing on GPU device]) AC_DEFINE(HYPRE_USING_SYCL, 1, [SYCL being used]) - dnl The actual invocation of the clang compiler from oneAPI that - dnl supports SYCL and all the command line foo needed by the compiler. - AC_CHECK_PROGS(CXX, [dpcpp]) + LINK_CC=${CUCC} + LINK_CXX=${CUCC} - dnl (Ab)Using dpcpp when compiling SYCL - LINK_CC=${CXX} - LINK_CXX=${CXX} - - dnl The "-x sycl" is necessary to override the detection of .c files which clang - dnl interprets as C and therefore invokes the C compiler rather than the SYCL part - dnl of clang. Put SYCLCXXFLAGS at the end so the user can override from - dnl from the configure line. - SYCLCXXFLAGS="-fsycl -fsycl-unnamed-lambda -fsycl-device-code-split=per_kernel " + CUFLAGS="-fsycl -fsycl-unnamed-lambda -fsycl-device-code-split=per_kernel " dnl If not in debug mode, at least -O2, but the user can override with - dnl with SYCLCXXFLAGS on the configure line. If in debug mode, -O0 -Wall + dnl with SYCLFLAGS on the configure line. 
If in debug mode, -O0 -Wall dnl plus flags for debugging symbols AS_IF([test x"$hypre_using_debug" == x"yes"], - [SYCLCXXFLAGS="-O0 -Wall -g -gdb ${SYCLCXXFLAGS}"], - [SYCLCXXFLAGS="-O2 ${SYCLCXXFLAGS}"],) - - dnl (Ab)Use CXXFLAGS to capture SYCL compilation flags - dnl Put SYCLCXXFLAGS at the end so the user can override the optimization level. - CXXFLAGS="${SYCLCXXFLAGS}" - - dnl dpl, dpct so we need both for Thrust on Intel GPUs. - dnl These are header-only so no linking needed. - HYPRE_SYCL_INCL="-I${ONEAPI_PATH}/dpl/latest/linux/include" - HYPRE_SYCL_INCL="${HYPRE_SYCL_INCL} -I${ONEAPI_PATH}/dpcpp-ct/latest/include" - - dnl SYCL library - HYPRE_SYCL_LIBS="-L${HYPRE_SYCL_PREFIX}/lib -lamdsycl64" + [SYCLFLAGS="-O0 -Wall -g ${SYCLFLAGS}"], + [SYCLFLAGS="-O2 ${SYCLFLAGS}"],) AS_IF([test x"$hypre_using_onemklsparse" == x"yes"], [AC_DEFINE(HYPRE_USING_ONEMKLSPARSE, 1, [onemkl::SPARSE being used]) @@ -2367,14 +2328,13 @@ AS_IF([test x"$hypre_user_chose_sycl" == x"yes"], HYPRE_SYCL_INCL="${HYPRE_SYCL_INCL} -I${MKLROOT}/include/oneapi/mkl/blas.hpp" ]) - dnl onemklrand: random number generation on Intel GPUs AS_IF([test x"$hypre_using_onemklrand" == x"yes"], [AC_DEFINE(HYPRE_USING_ONEMKLRAND, 1, [onemkl::rng being used]) HYPRE_SYCL_LIBS="${HYPRE_SYCL_LIBS} -lmkl_sycl" HYPRE_SYCL_INCL="${HYPRE_SYCL_INCL} -I${MKLROOT}/include/oneapi/mkl/rng.hpp" ]) - ]) dnl AS_IF([test x"$hypre_user_chose_sycl" == x"yes"] + ]) dnl AS_IF([test x"$hypre_using_sycl" == x"yes"] dnl ********************************************************************* @@ -2399,7 +2359,7 @@ then AC_MSG_NOTICE([Use --enable-unified-memory to compile with unified memory.]) AC_MSG_NOTICE([***********************************************************************]) fi - if test "$hypre_user_chose_sycl" = "yes" + if test "$hypre_using_sycl" = "yes" then AC_MSG_NOTICE([***********************************************************]) AC_MSG_NOTICE([Configuring with --with-sycl=yes without unified memory.]) @@ -2494,7 
+2454,7 @@ if test "x$hypre_using_um" = "xyes" then AC_DEFINE([HYPRE_USING_UNIFIED_MEMORY],1,[Define to 1 if using unified memory]) else - if [test "x$hypre_using_cuda" = "xyes" || test "x$hypre_using_device_openmp" = "xyes" || test "x$hypre_using_hip" = "xyes"] + if [test "x$hypre_using_cuda" = "xyes" || test "x$hypre_using_device_openmp" = "xyes" || test "x$hypre_using_hip" = "xyes" || test "x$hypre_using_sycl" = "xyes"] then AC_DEFINE([HYPRE_USING_DEVICE_MEMORY],1,[Define to 1 if using device memory without UM]) else diff --git a/src/configure b/src/configure index c7f941f40e..eef3052752 100755 --- a/src/configure +++ b/src/configure @@ -2809,8 +2809,6 @@ hypre_using_onemklsparse=no hypre_using_onemklblas=no hypre_using_onemklrand=no -hypre_found_sycl=no - hypre_blas_lib_old_style=no hypre_blas_lib_dir_old_style=no @@ -4014,13 +4012,12 @@ fi - # Check whether --with-sycl was given. if test "${with_sycl+set}" = set; then : withval=$with_sycl; case "$withval" in -yes) hypre_using_sycl=yes ;; -no) hypre_using_sycl=no ;; -*) hypre_using_sycl=no ;; + yes) hypre_using_sycl=yes ;; + no) hypre_using_sycl=no ;; + *) hypre_using_sycl=no ;; esac else hypre_using_sycl=no @@ -4910,21 +4907,6 @@ then as_fn_error $? "--with-hip and --with-device-openmp are mutually exclusive" "$LINENO" 5 fi -if test "x$hypre_using_cuda" = "xyes" && test "x$hypre_using_sycl" = "xyes" -then - as_fn_error $? "--with-cuda and --with-sycl are mutually exclusive" "$LINENO" 5 -fi - -if test "x$hypre_using_hip" = "xyes" && test "x$hypre_using_sycl" = "xyes" -then - as_fn_error $? "--with-hip and --with-sycl are mutually exclusive" "$LINENO" 5 -fi - -if test "x$hypre_using_device_openmp" = "xyes" && test "x$hypre_using_sycl" = "xyes" -then - as_fn_error $? 
"--with-device-openmp and --with-sycl are mutually exclusive" "$LINENO" 5 -fi - if test "$hypre_user_chose_cudacompilers" = "no" then @@ -5097,6 +5079,52 @@ done done IFS=$as_save_IFS +fi +fi +CUCC=$ac_cv_prog_CUCC +if test -n "$CUCC"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CUCC" >&5 +$as_echo "$CUCC" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$CUCC" && break +done + + fi + + if test "$hypre_using_sycl" = "yes" + then + for ac_prog in dpcpp +do + # Extract the first word of "$ac_prog", so it can be a program name with args. +set dummy $ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_CUCC+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$CUCC"; then + ac_cv_prog_CUCC="$CUCC" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_CUCC="$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + fi fi CUCC=$ac_cv_prog_CUCC @@ -8626,7 +8654,7 @@ if test "x$hypre_using_um" = "xyes" then if test "x$hypre_using_cuda" != "xyes" && test "x$hypre_using_device_openmp" != "xyes" && test "x$hypre_using_hip" != "xyes" && test "x$hypre_using_sycl" != "xyes" then - as_fn_error $? "Asked for unified memory, but not using CUDA, HIP, or device OpenMP!" "$LINENO" 5 + as_fn_error $? "Asked for unified memory, but not using CUDA, HIP, SYCL, or device OpenMP!" 
"$LINENO" 5 fi fi if test "$hypre_using_cuda" = "yes" || test "$hypre_using_device_openmp" = "yes" @@ -8831,8 +8859,6 @@ done fi - - if test "x$hypre_using_raja" = "xyes" then @@ -9093,74 +9119,26 @@ fi fi - if test x"$hypre_using_sycl" == x"yes"; then : -$as_echo "#define HYPRE_USING_GPU 1" >>confdefs.h - -$as_echo "#define HYPRE_USING_SYCL 1" >>confdefs.h - - for ac_prog in dpcpp -do - # Extract the first word of "$ac_prog", so it can be a program name with args. -set dummy $ac_prog; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_prog_CUCC+:} false; then : - $as_echo_n "(cached) " >&6 -else - if test -n "$CUCC"; then - ac_cv_prog_CUCC="$CUCC" # Let the user override the test. -else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_exec_ext in '' $ac_executable_extensions; do - if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then - ac_cv_prog_CUCC="$ac_prog" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done - done -IFS=$as_save_IFS - -fi -fi -CUCC=$ac_cv_prog_CUCC -if test -n "$CUCC"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CUCC" >&5 -$as_echo "$CUCC" >&6; } -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } -fi +$as_echo "#define HYPRE_USING_GPU 1" >>confdefs.h - test -n "$CUCC" && break -done +$as_echo "#define HYPRE_USING_SYCL 1" >>confdefs.h - LINK_CC=${CUCC} + LINK_CC=${CUCC} LINK_CXX=${CUCC} - SYCLCXXFLAGS="-fsycl -fsycl-unnamed-lambda -fsycl-device-code-split=per_kernel " + CUFLAGS="-fsycl -fsycl-unnamed-lambda -fsycl-device-code-split=per_kernel " if test x"$hypre_using_debug" == x"yes"; then : - SYCLCXXFLAGS="-O0 -Wall -g ${SYCLCXXFLAGS}" -elif SYCLCXXFLAGS="-O2 ${SYCLCXXFLAGS}"; then : + SYCLFLAGS="-O0 -Wall -g ${SYCLFLAGS}" +elif SYCLFLAGS="-O2 ${SYCLFLAGS}"; then : fi - 
CUFLAGS="${SYCLCXXFLAGS}" - - HYPRE_SYCL_INCL="-I${ONEAPI_PATH}/dpl/latest/linux/include" - HYPRE_SYCL_INCL="${HYPRE_SYCL_INCL} -I${ONEAPI_PATH}/dpcpp-ct/latest/include" - - if test x"$hypre_using_onemklsparse" == x"yes"; then : $as_echo "#define HYPRE_USING_ONEMKLSPARSE 1" >>confdefs.h @@ -9174,12 +9152,12 @@ fi $as_echo "#define HYPRE_USING_ONEMKLBLAS 1" >>confdefs.h - HYPRE_SYCL_LIBS="${HYPRE_SYCL_LIBS} -lmkl_sycl" + HYPRE_SYCL_LIBS="${HYPRE_SYCL_LIBS} -lmkl_sycl" HYPRE_SYCL_INCL="${HYPRE_SYCL_INCL} -I${MKLROOT}/include/oneapi/mkl/blas.hpp" fi - if test x"$hypre_using_onemklrand" == x"yes"; then : + if test x"$hypre_using_onemklrand" == x"yes"; then : $as_echo "#define HYPRE_USING_ONEMKLRAND 1" >>confdefs.h @@ -9191,8 +9169,6 @@ fi fi - - if test "$hypre_using_um" != "yes" then if test "$hypre_using_cuda" = "yes" @@ -9562,6 +9538,9 @@ $as_echo "#define HYPRE_LINUX 1" >>confdefs.h + + + diff --git a/src/test/Makefile b/src/test/Makefile index 8f5cedba35..42fcddfd93 100644 --- a/src/test/Makefile +++ b/src/test/Makefile @@ -40,8 +40,12 @@ F77_COMPILE_FLAGS = \ MPILIBFLAGS = ${MPILIBDIRS} ${MPILIBS} ${MPIFLAGS} LAPACKLIBFLAGS = ${LAPACKLIBDIRS} ${LAPACKLIBS} BLASLIBFLAGS = ${BLASLIBDIRS} ${BLASLIBS} -# WM: had to add the absolute path to libHYPRE.a for successful compilation on frank -LIBFLAGS = ${LDFLAGS} ${LIBS} ${HYPRE_BUILD_DIR}/lib/libHYPRE.a +# WM: currently have to add the absolute path to libHYPRE.a when building sycl code +ifeq ($(notdir $(firstword ${LINK_CC})), dpcpp) + LIBFLAGS = ${LDFLAGS} ${LIBS} ${HYPRE_BUILD_DIR}/lib/libHYPRE.a +else + LIBFLAGS = ${LDFLAGS} ${LIBS} +endif ifeq ($(notdir $(firstword ${LINK_CC})), nvcc) XLINK = -Xlinker=-rpath,${HYPRE_BUILD_DIR}/lib From 2d5ee90db38b2b5efea2a809bb636f3ba760d4f4 Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Tue, 2 Nov 2021 20:10:30 +0000 Subject: [PATCH 28/44] Cleanup boxloops, renamings, make sure tests compile --- src/seq_mv/csr_matop_device.c | 2 + src/seq_mv/csr_sptrans_device.c | 24 +++++++ 
src/struct_ls/pfmg_setup.c | 56 ++++++++-------- src/struct_mv/_hypre_struct_mv.hpp | 102 ++++++++++++----------------- src/struct_mv/boxloop_sycl.h | 102 ++++++++++++----------------- src/struct_mv/struct_innerprod.c | 2 +- 6 files changed, 141 insertions(+), 147 deletions(-) diff --git a/src/seq_mv/csr_matop_device.c b/src/seq_mv/csr_matop_device.c index 9b428d0553..2734b7aa9b 100644 --- a/src/seq_mv/csr_matop_device.c +++ b/src/seq_mv/csr_matop_device.c @@ -1496,6 +1496,8 @@ hypre_CSRMatrixTransposeDevice(hypre_CSRMatrix *A, hypreDevice_CSRSpTransCusparse(nrows_A, ncols_A, nnz_A, A_i, A_j, A_data, &C_i, &C_j, &C_data, data); #elif defined(HYPRE_USING_ROCSPARSE) hypreDevice_CSRSpTransRocsparse(nrows_A, ncols_A, nnz_A, A_i, A_j, A_data, &C_i, &C_j, &C_data, data); +#elif defined(HYPRE_USING_ONEMKLSPARSE) + hypreDevice_CSRSpTransOnemklsparse(nrows_A, ncols_A, nnz_A, A_i, A_j, A_data, &C_i, &C_j, &C_data, data); #else hypreDevice_CSRSpTrans(nrows_A, ncols_A, nnz_A, A_i, A_j, A_data, &C_i, &C_j, &C_data, data); #endif diff --git a/src/seq_mv/csr_sptrans_device.c b/src/seq_mv/csr_sptrans_device.c index d41d38af04..440cce76ec 100644 --- a/src/seq_mv/csr_sptrans_device.c +++ b/src/seq_mv/csr_sptrans_device.c @@ -146,6 +146,18 @@ hypreDevice_CSRSpTransRocsparse(HYPRE_Int m, HYPRE_Int n, HYPR #endif // #if defined(HYPRE_USING_ROCSPARSE) +#if defined(HYPRE_USING_ONEMKLSPARSE) +HYPRE_Int +hypreDevice_CSRSpTransOnemklsparse(HYPRE_Int m, HYPRE_Int n, HYPRE_Int nnzA, + HYPRE_Int *d_ia, HYPRE_Int *d_ja, HYPRE_Complex *d_aa, + HYPRE_Int **d_ic_out, HYPRE_Int **d_jc_out, HYPRE_Complex **d_ac_out, + HYPRE_Int want_data) +{ +/* WM: TODO */ + return hypre_error_flag; +} +#endif // #if defined(HYPRE_USING_ONEMKLSPARSE) + #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) @@ -222,3 +234,15 @@ hypreDevice_CSRSpTrans(HYPRE_Int m, HYPRE_Int n, HYPRE_Int } #endif /* HYPRE_USING_CUDA || defined(HYPRE_USING_HIP) */ + +#if defined(HYPRE_USING_SYCL) +HYPRE_Int 
+hypreDevice_CSRSpTrans(HYPRE_Int m, HYPRE_Int n, HYPRE_Int nnzA, + HYPRE_Int *d_ia, HYPRE_Int *d_ja, HYPRE_Complex *d_aa, + HYPRE_Int **d_ic_out, HYPRE_Int **d_jc_out, HYPRE_Complex **d_ac_out, + HYPRE_Int want_data) +{ +/* WM: TODO */ + return hypre_error_flag; +} +#endif // #if defined(HYPRE_USING_SYCL) diff --git a/src/struct_ls/pfmg_setup.c b/src/struct_ls/pfmg_setup.c index 684824f26a..08129ac913 100644 --- a/src/struct_ls/pfmg_setup.c +++ b/src/struct_ls/pfmg_setup.c @@ -1061,7 +1061,7 @@ hypre_PFMGComputeDxyz_SS5( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcx = -diag * (a_cw[Ai] + a_ce[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcx; + hypre_sycl_sum += tcx; #else cxb += tcx; #endif @@ -1075,7 +1075,7 @@ hypre_PFMGComputeDxyz_SS5( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcy = -diag * (a_cn[Ai] + a_cs[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcy; + hypre_sycl_sum += tcy; #else cyb += tcy; #endif @@ -1089,7 +1089,7 @@ hypre_PFMGComputeDxyz_SS5( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcx = -diag * (a_cw[Ai] + a_ce[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcx * tcx; + hypre_sycl_sum += tcx * tcx; #else sqcxb += tcx * tcx; #endif @@ -1103,7 +1103,7 @@ hypre_PFMGComputeDxyz_SS5( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcy = -diag * (a_cn[Ai] + a_cs[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcy * tcy; + hypre_sycl_sum += tcy * tcy; #else sqcyb += tcy * tcy; #endif @@ -1266,7 +1266,7 @@ hypre_PFMGComputeDxyz_SS9( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcx = -diag * (a_cw[Ai] + a_ce[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcx; + hypre_sycl_sum += tcx; #else cxb += tcx; #endif @@ -1280,7 +1280,7 @@ hypre_PFMGComputeDxyz_SS9( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? 
-1.0 : 1.0; HYPRE_Real tcy = -diag * (a_cs[Ai] + a_cn[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcy; + hypre_sycl_sum += tcy; #else cyb += tcy; #endif @@ -1294,7 +1294,7 @@ hypre_PFMGComputeDxyz_SS9( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcx = -diag * (a_cw[Ai] + a_ce[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcx*tcx; + hypre_sycl_sum += tcx*tcx; #else sqcxb += tcx*tcx; #endif @@ -1308,7 +1308,7 @@ hypre_PFMGComputeDxyz_SS9( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcy = -diag * (a_cs[Ai] + a_cn[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcy*tcy; + hypre_sycl_sum += tcy*tcy; #else sqcyb += tcy*tcy; #endif @@ -1457,7 +1457,7 @@ hypre_PFMGComputeDxyz_SS7( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcx = -diag * (a_cw[Ai] + a_ce[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcx; + hypre_sycl_sum += tcx; #else cxb += tcx; #endif @@ -1471,7 +1471,7 @@ hypre_PFMGComputeDxyz_SS7( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcy = -diag * (a_cs[Ai] + a_cn[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcy; + hypre_sycl_sum += tcy; #else cyb += tcy; #endif @@ -1485,7 +1485,7 @@ hypre_PFMGComputeDxyz_SS7( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcz = -diag * (a_ac[Ai] + a_bc[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcz; + hypre_sycl_sum += tcz; #else czb += tcz; #endif @@ -1499,7 +1499,7 @@ hypre_PFMGComputeDxyz_SS7( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcx = -diag * (a_cw[Ai] + a_ce[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcx*tcx; + hypre_sycl_sum += tcx*tcx; #else sqcxb += tcx*tcx; #endif @@ -1513,7 +1513,7 @@ hypre_PFMGComputeDxyz_SS7( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? 
-1.0 : 1.0; HYPRE_Real tcy = -diag * (a_cs[Ai] + a_cn[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcy*tcy; + hypre_sycl_sum += tcy*tcy; #else sqcyb += tcy*tcy; #endif @@ -1527,7 +1527,7 @@ hypre_PFMGComputeDxyz_SS7( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcz = -diag * (a_ac[Ai] + a_bc[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcz*tcz; + hypre_sycl_sum += tcz*tcz; #else sqczb += tcz*tcz; #endif @@ -1736,7 +1736,7 @@ hypre_PFMGComputeDxyz_SS19( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcx = -diag * (a_cw[Ai] + a_ce[Ai] + a_aw[Ai] + a_ae[Ai] + a_bw[Ai] + a_be[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcx; + hypre_sycl_sum += tcx; #else cxb += tcx; #endif @@ -1750,7 +1750,7 @@ hypre_PFMGComputeDxyz_SS19( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcy = -diag * (a_cs[Ai] + a_cn[Ai] + a_an[Ai] + a_as[Ai] + a_bn[Ai] + a_bs[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcy; + hypre_sycl_sum += tcy; #else cyb += tcy; #endif @@ -1764,7 +1764,7 @@ hypre_PFMGComputeDxyz_SS19( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcz = -diag * (a_ac[Ai] + a_bc[Ai] + a_aw[Ai] + a_ae[Ai] + a_an[Ai] + a_as[Ai] + a_bw[Ai] + a_be[Ai] + a_bn[Ai] + a_bs[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcz; + hypre_sycl_sum += tcz; #else czb += tcz; #endif @@ -1778,7 +1778,7 @@ hypre_PFMGComputeDxyz_SS19( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcx = -diag * (a_cw[Ai] + a_ce[Ai] + a_aw[Ai] + a_ae[Ai] + a_bw[Ai] + a_be[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcx*tcx; + hypre_sycl_sum += tcx*tcx; #else sqcxb += tcx*tcx; #endif @@ -1792,7 +1792,7 @@ hypre_PFMGComputeDxyz_SS19( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? 
-1.0 : 1.0; HYPRE_Real tcy = -diag * (a_cs[Ai] + a_cn[Ai] + a_an[Ai] + a_as[Ai] + a_bn[Ai] + a_bs[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcy*tcy; + hypre_sycl_sum += tcy*tcy; #else sqcyb += tcy*tcy; #endif @@ -1806,7 +1806,7 @@ hypre_PFMGComputeDxyz_SS19( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcz = -diag * (a_ac[Ai] + a_bc[Ai] + a_aw[Ai] + a_ae[Ai] + a_an[Ai] + a_as[Ai] + a_bw[Ai] + a_be[Ai] + a_bn[Ai] + a_bs[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcz*tcz; + hypre_sycl_sum += tcz*tcz; #else sqczb += tcz*tcz; #endif @@ -2058,7 +2058,7 @@ hypre_PFMGComputeDxyz_SS27( HYPRE_Int bi, tcx -= diag * (a_cw[Ai] + a_ce[Ai] + a_aw[Ai] + a_ae[Ai] + a_bw[Ai] + a_be[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); tcx -= diag * (a_asw[Ai] + a_ase[Ai] + a_anw[Ai] + a_ane[Ai] + a_bsw[Ai] + a_bse[Ai] + a_bnw[Ai] + a_bne[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcx; + hypre_sycl_sum += tcx; #else cxb += tcx; #endif @@ -2074,7 +2074,7 @@ hypre_PFMGComputeDxyz_SS27( HYPRE_Int bi, tcy -= diag * (a_cs[Ai] + a_cn[Ai] + a_an[Ai] + a_as[Ai] + a_bn[Ai] + a_bs[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); tcy -= diag * (a_asw[Ai] + a_ase[Ai] + a_anw[Ai] + a_ane[Ai] + a_bsw[Ai] + a_bse[Ai] + a_bnw[Ai] + a_bne[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcy; + hypre_sycl_sum += tcy; #else cyb += tcy; #endif @@ -2090,7 +2090,7 @@ hypre_PFMGComputeDxyz_SS27( HYPRE_Int bi, tcz -= diag * (a_ac[Ai] + a_bc[Ai] + a_aw[Ai] + a_ae[Ai] + a_an[Ai] + a_as[Ai] + a_bw[Ai] + a_be[Ai] + a_bn[Ai] + a_bs[Ai]); tcz -= diag * (a_asw[Ai] + a_ase[Ai] + a_anw[Ai] + a_ane[Ai] + a_bsw[Ai] + a_bse[Ai] + a_bnw[Ai] + a_bne[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcz; + hypre_sycl_sum += tcz; #else czb += tcz; #endif @@ -2106,7 +2106,7 @@ hypre_PFMGComputeDxyz_SS27( HYPRE_Int bi, tcx -= diag * (a_cw[Ai] + a_ce[Ai] + a_aw[Ai] + a_ae[Ai] + a_bw[Ai] + a_be[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); 
tcx -= diag * (a_asw[Ai] + a_ase[Ai] + a_anw[Ai] + a_ane[Ai] + a_bsw[Ai] + a_bse[Ai] + a_bnw[Ai] + a_bne[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcx*tcx; + hypre_sycl_sum += tcx*tcx; #else sqcxb += tcx*tcx; #endif @@ -2122,7 +2122,7 @@ hypre_PFMGComputeDxyz_SS27( HYPRE_Int bi, tcy -= diag * (a_cs[Ai] + a_cn[Ai] + a_an[Ai] + a_as[Ai] + a_bn[Ai] + a_bs[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); tcy -= diag * (a_asw[Ai] + a_ase[Ai] + a_anw[Ai] + a_ane[Ai] + a_bsw[Ai] + a_bse[Ai] + a_bnw[Ai] + a_bne[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcy*tcy; + hypre_sycl_sum += tcy*tcy; #else sqcyb += tcy*tcy; #endif @@ -2138,7 +2138,7 @@ hypre_PFMGComputeDxyz_SS27( HYPRE_Int bi, tcz -= diag * (a_ac[Ai] + a_bc[Ai] + a_aw[Ai] + a_ae[Ai] + a_an[Ai] + a_as[Ai] + a_bw[Ai] + a_be[Ai] + a_bn[Ai] + a_bs[Ai]); tcz -= diag * (a_asw[Ai] + a_ase[Ai] + a_anw[Ai] + a_ane[Ai] + a_bsw[Ai] + a_bse[Ai] + a_bnw[Ai] + a_bne[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcz*tcz; + hypre_sycl_sum += tcz*tcz; #else sqczb += tcz*tcz; #endif @@ -2310,7 +2310,7 @@ hypre_ZeroDiagonal( hypre_StructMatrix *A ) if (Ap[Ai] == 0.0) { #if defined(HYPRE_USING_SYCL) - sum += one; + hypre_sycl_sum += one; #else diag_product_local += one; #endif @@ -2318,7 +2318,7 @@ hypre_ZeroDiagonal( hypre_StructMatrix *A ) else { #if defined(HYPRE_USING_SYCL) - sum += zero; + hypre_sycl_sum += zero; #else diag_product_local += zero; #endif diff --git a/src/struct_mv/_hypre_struct_mv.hpp b/src/struct_mv/_hypre_struct_mv.hpp index ee421fb85d..d76cab3557 100644 --- a/src/struct_mv/_hypre_struct_mv.hpp +++ b/src/struct_mv/_hypre_struct_mv.hpp @@ -1192,69 +1192,36 @@ void BoxLoopforall( HYPRE_Int length, LOOP_BODY loop_body) { - /* HYPRE_ExecutionPolicy exec_policy = hypre_HandleStructExecPolicy(hypre_handle()); */ - /* WM: TODO: uncomment above and remove below */ - HYPRE_ExecutionPolicy exec_policy = HYPRE_EXEC_DEVICE; - - if (exec_policy == HYPRE_EXEC_HOST) - { -/* WM: todo - is this really necessary, even? 
*/ -/* #ifdef HYPRE_USING_OPENMP */ -/* #pragma omp parallel for HYPRE_SMP_SCHEDULE */ -/* #endif */ -/* for (HYPRE_Int idx = 0; idx < length; idx++) */ -/* { */ -/* loop_body(idx); */ -/* } */ - } - else if (exec_policy == HYPRE_EXEC_DEVICE) + if (length <= 0) { - /* WM: question - is it better in sycl to launch parallel_for with blocks in this way as we do for cuda? */ - const sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension(); - const sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(length, "thread", bDim); - - hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) - { - cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), loop_body); - }).wait_and_throw(); + return; } + const sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension(); + const sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(length, "thread", bDim); + + hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) + { + cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), loop_body); + }).wait_and_throw(); } template void ReductionBoxLoopforall( LOOP_BODY loop_body, HYPRE_Int length, - HYPRE_Real *hypre_sycl_sum ) + HYPRE_Real *shared_sum_var ) { if (length <= 0) { return; } + const sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension(); + const sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(length, "thread", bDim); - /* HYPRE_ExecutionPolicy exec_policy = hypre_HandleStructExecPolicy(hypre_handle()); */ - /* WM: TODO: uncomment above and remove below */ - HYPRE_ExecutionPolicy exec_policy = HYPRE_EXEC_DEVICE; - - if (exec_policy == HYPRE_EXEC_HOST) - { - /* WM: todo - is this really necessary, even? */ - /* for (HYPRE_Int idx = 0; idx < length; idx++) */ - /* { */ - /* loop_body(idx, reducer); */ - /* } */ - } - else if (exec_policy == HYPRE_EXEC_DEVICE) - { - /* WM: question - is it better in sycl to launch parallel_for with blocks in this way as we do for cuda? 
*/ - /* NOTE: in the cuda version, there is further manipulation of bDim and gDim that I don't include here */ - const sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension(); - const sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(length, "thread", bDim); - - hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) - { - cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), sycl::reduction(hypre_sycl_sum, std::plus<>()), loop_body); - }).wait_and_throw(); - } + hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) + { + cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), sycl::reduction(shared_sum_var, std::plus<>()), loop_body); + }).wait_and_throw(); } #ifdef __cplusplus @@ -1377,6 +1344,21 @@ else \ * Boxloops *********************************************************************/ +/* BoxLoop 0 */ +#define hypre_newBoxLoop0Begin(ndim, loop_size) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + BoxLoopforall(hypre__tot, [=] (sycl::nd_item<1> item) \ + { \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + if (idx < hypre__tot) \ + { \ + +#define hypre_newBoxLoop0End() \ + } \ + }); \ +} + /* BoxLoop 1 */ #define hypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) \ { \ @@ -1505,9 +1487,9 @@ else \ { \ hypre_newBoxLoopInit(ndim, loop_size); \ hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ - HYPRE_Real *hypre_sycl_sum = hypre_CTAlloc(HYPRE_Real, 1, HYPRE_MEMORY_DEVICE); \ - hypre_TMemcpy(hypre_sycl_sum, &sum_var, HYPRE_Real, 1, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST); \ - ReductionBoxLoopforall( [=] (sycl::nd_item<1> item, auto &sum) \ + HYPRE_Real *shared_sum_var = hypre_CTAlloc(HYPRE_Real, 1, HYPRE_MEMORY_DEVICE); \ + hypre_TMemcpy(shared_sum_var, &sum_var, HYPRE_Real, 1, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST); \ + ReductionBoxLoopforall( [=] (sycl::nd_item<1> item, auto &hypre_sycl_sum) \ { \ HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ 
if (idx < hypre__tot) \ @@ -1517,8 +1499,8 @@ else \ #define hypre_newBoxLoop1ReductionEnd(i1, sum_var) \ } \ - }, hypre__tot, hypre_sycl_sum); \ - hypre_TMemcpy(&sum_var, hypre_sycl_sum, HYPRE_Real, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE); \ + }, hypre__tot, shared_sum_var); \ + hypre_TMemcpy(&sum_var, shared_sum_var, HYPRE_Real, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE); \ } /* Reduction BoxLoop2 */ @@ -1530,9 +1512,9 @@ else \ hypre_newBoxLoopInit(ndim, loop_size); \ hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ hypre_BoxLoopDataDeclareK(2, ndim, loop_size, dbox2, start2, stride2); \ - HYPRE_Real *hypre_sycl_sum = hypre_CTAlloc(HYPRE_Real, 1, HYPRE_MEMORY_DEVICE); \ - hypre_TMemcpy(hypre_sycl_sum, &sum_var, HYPRE_Real, 1, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST); \ - ReductionBoxLoopforall( [=] (sycl::nd_item<1> item, auto &sum) \ + HYPRE_Real *shared_sum_var = hypre_CTAlloc(HYPRE_Real, 1, HYPRE_MEMORY_DEVICE); \ + hypre_TMemcpy(shared_sum_var, &sum_var, HYPRE_Real, 1, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST); \ + ReductionBoxLoopforall( [=] (sycl::nd_item<1> item, auto &hypre_sycl_sum) \ { \ HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ @@ -1543,8 +1525,8 @@ else \ #define hypre_newBoxLoop2ReductionEnd(i1, i2, sum_var) \ } \ - }, hypre__tot, hypre_sycl_sum); \ - hypre_TMemcpy(&sum_var, hypre_sycl_sum, HYPRE_Real, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE); \ + }, hypre__tot, shared_sum_var); \ + hypre_TMemcpy(&sum_var, shared_sum_var, HYPRE_Real, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE); \ } /* Plain parallel_for loop */ @@ -1569,6 +1551,8 @@ else \ #define hypre_BoxLoopBlock() 0 +#define hypre_BoxLoop0Begin hypre_newBoxLoop0Begin +#define hypre_BoxLoop0End hypre_newBoxLoop0End #define hypre_BoxLoop1Begin hypre_newBoxLoop1Begin #define hypre_BoxLoop1End hypre_newBoxLoop1End #define hypre_BoxLoop2Begin hypre_newBoxLoop2Begin diff --git a/src/struct_mv/boxloop_sycl.h 
b/src/struct_mv/boxloop_sycl.h index af8f1d9f9d..b8a61a07ea 100644 --- a/src/struct_mv/boxloop_sycl.h +++ b/src/struct_mv/boxloop_sycl.h @@ -40,69 +40,36 @@ void BoxLoopforall( HYPRE_Int length, LOOP_BODY loop_body) { - /* HYPRE_ExecutionPolicy exec_policy = hypre_HandleStructExecPolicy(hypre_handle()); */ - /* WM: TODO: uncomment above and remove below */ - HYPRE_ExecutionPolicy exec_policy = HYPRE_EXEC_DEVICE; - - if (exec_policy == HYPRE_EXEC_HOST) - { -/* WM: todo - is this really necessary, even? */ -/* #ifdef HYPRE_USING_OPENMP */ -/* #pragma omp parallel for HYPRE_SMP_SCHEDULE */ -/* #endif */ -/* for (HYPRE_Int idx = 0; idx < length; idx++) */ -/* { */ -/* loop_body(idx); */ -/* } */ - } - else if (exec_policy == HYPRE_EXEC_DEVICE) + if (length <= 0) { - /* WM: question - is it better in sycl to launch parallel_for with blocks in this way as we do for cuda? */ - const sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension(); - const sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(length, "thread", bDim); - - hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) - { - cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), loop_body); - }).wait_and_throw(); + return; } + const sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension(); + const sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(length, "thread", bDim); + + hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) + { + cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), loop_body); + }).wait_and_throw(); } template void ReductionBoxLoopforall( LOOP_BODY loop_body, HYPRE_Int length, - HYPRE_Real *hypre_sycl_sum ) + HYPRE_Real *shared_sum_var ) { if (length <= 0) { return; } + const sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension(); + const sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(length, "thread", bDim); - /* HYPRE_ExecutionPolicy exec_policy = hypre_HandleStructExecPolicy(hypre_handle()); */ - /* WM: TODO: 
uncomment above and remove below */ - HYPRE_ExecutionPolicy exec_policy = HYPRE_EXEC_DEVICE; - - if (exec_policy == HYPRE_EXEC_HOST) - { - /* WM: todo - is this really necessary, even? */ - /* for (HYPRE_Int idx = 0; idx < length; idx++) */ - /* { */ - /* loop_body(idx, reducer); */ - /* } */ - } - else if (exec_policy == HYPRE_EXEC_DEVICE) - { - /* WM: question - is it better in sycl to launch parallel_for with blocks in this way as we do for cuda? */ - /* NOTE: in the cuda version, there is further manipulation of bDim and gDim that I don't include here */ - const sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension(); - const sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(length, "thread", bDim); - - hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) - { - cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), sycl::reduction(hypre_sycl_sum, std::plus<>()), loop_body); - }).wait_and_throw(); - } + hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) + { + cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), sycl::reduction(shared_sum_var, std::plus<>()), loop_body); + }).wait_and_throw(); } #ifdef __cplusplus @@ -225,6 +192,21 @@ else \ * Boxloops *********************************************************************/ +/* BoxLoop 0 */ +#define hypre_newBoxLoop0Begin(ndim, loop_size) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + BoxLoopforall(hypre__tot, [=] (sycl::nd_item<1> item) \ + { \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + if (idx < hypre__tot) \ + { \ + +#define hypre_newBoxLoop0End() \ + } \ + }); \ +} + /* BoxLoop 1 */ #define hypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) \ { \ @@ -353,9 +335,9 @@ else \ { \ hypre_newBoxLoopInit(ndim, loop_size); \ hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ - HYPRE_Real *hypre_sycl_sum = hypre_CTAlloc(HYPRE_Real, 1, HYPRE_MEMORY_DEVICE); \ - hypre_TMemcpy(hypre_sycl_sum, &sum_var, 
HYPRE_Real, 1, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST); \ - ReductionBoxLoopforall( [=] (sycl::nd_item<1> item, auto &sum) \ + HYPRE_Real *shared_sum_var = hypre_CTAlloc(HYPRE_Real, 1, HYPRE_MEMORY_DEVICE); \ + hypre_TMemcpy(shared_sum_var, &sum_var, HYPRE_Real, 1, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST); \ + ReductionBoxLoopforall( [=] (sycl::nd_item<1> item, auto &hypre_sycl_sum) \ { \ HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ @@ -365,8 +347,8 @@ else \ #define hypre_newBoxLoop1ReductionEnd(i1, sum_var) \ } \ - }, hypre__tot, hypre_sycl_sum); \ - hypre_TMemcpy(&sum_var, hypre_sycl_sum, HYPRE_Real, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE); \ + }, hypre__tot, shared_sum_var); \ + hypre_TMemcpy(&sum_var, shared_sum_var, HYPRE_Real, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE); \ } /* Reduction BoxLoop2 */ @@ -378,9 +360,9 @@ else \ hypre_newBoxLoopInit(ndim, loop_size); \ hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ hypre_BoxLoopDataDeclareK(2, ndim, loop_size, dbox2, start2, stride2); \ - HYPRE_Real *hypre_sycl_sum = hypre_CTAlloc(HYPRE_Real, 1, HYPRE_MEMORY_DEVICE); \ - hypre_TMemcpy(hypre_sycl_sum, &sum_var, HYPRE_Real, 1, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST); \ - ReductionBoxLoopforall( [=] (sycl::nd_item<1> item, auto &sum) \ + HYPRE_Real *shared_sum_var = hypre_CTAlloc(HYPRE_Real, 1, HYPRE_MEMORY_DEVICE); \ + hypre_TMemcpy(shared_sum_var, &sum_var, HYPRE_Real, 1, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST); \ + ReductionBoxLoopforall( [=] (sycl::nd_item<1> item, auto &hypre_sycl_sum) \ { \ HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ @@ -391,8 +373,8 @@ else \ #define hypre_newBoxLoop2ReductionEnd(i1, i2, sum_var) \ } \ - }, hypre__tot, hypre_sycl_sum); \ - hypre_TMemcpy(&sum_var, hypre_sycl_sum, HYPRE_Real, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE); \ + }, hypre__tot, shared_sum_var); \ + hypre_TMemcpy(&sum_var, shared_sum_var, HYPRE_Real, 1, 
HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE); \ } /* Plain parallel_for loop */ @@ -417,6 +399,8 @@ else \ #define hypre_BoxLoopBlock() 0 +#define hypre_BoxLoop0Begin hypre_newBoxLoop0Begin +#define hypre_BoxLoop0End hypre_newBoxLoop0End #define hypre_BoxLoop1Begin hypre_newBoxLoop1Begin #define hypre_BoxLoop1End hypre_newBoxLoop1End #define hypre_BoxLoop2Begin hypre_newBoxLoop2Begin diff --git a/src/struct_mv/struct_innerprod.c b/src/struct_mv/struct_innerprod.c index cfef661cb0..7d1c7e15ba 100644 --- a/src/struct_mv/struct_innerprod.c +++ b/src/struct_mv/struct_innerprod.c @@ -90,7 +90,7 @@ hypre_StructInnerProd( hypre_StructVector *x, { HYPRE_Real tmp = xp[xi] * hypre_conj(yp[yi]); #if defined(HYPRE_USING_SYCL) - sum += tmp; + hypre_sycl_sum += tmp; #else box_sum += tmp; #endif From 4d303d3d5f3f6bd4ce3a9d778158a1cb525fdd8e Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Wed, 3 Nov 2021 22:14:40 +0000 Subject: [PATCH 29/44] Some placeholders and changes to allow ij interface to run on the host --- src/parcsr_ls/par_lr_interp.c | 30 ++++++++++++++---------------- src/parcsr_ls/par_mod_lr_interp.c | 17 ++++++++--------- src/seq_mv/csr_matvec.c | 4 ++++ src/seq_mv/csr_matvec_device.c | 17 +++++++++++++++++ src/seq_mv/protos.h | 3 +++ src/seq_mv/seq_mv.h | 3 +++ 6 files changed, 49 insertions(+), 25 deletions(-) diff --git a/src/parcsr_ls/par_lr_interp.c b/src/parcsr_ls/par_lr_interp.c index 9dce84705c..da45ec1a4b 100644 --- a/src/parcsr_ls/par_lr_interp.c +++ b/src/parcsr_ls/par_lr_interp.c @@ -5283,22 +5283,21 @@ hypre_BoomerAMGBuildExtInterp(hypre_ParCSRMatrix *A, HYPRE_Int *CF_marker, hypre_GpuProfilingPushRange("ExtInterp"); #endif - HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) ); - HYPRE_Int ierr = 0; - if (exec == HYPRE_EXEC_HOST) - { - ierr = hypre_BoomerAMGBuildExtInterpHost(A,CF_marker,S,num_cpts_global,num_functions,dof_func, - debug_flag,trunc_factor,max_elmts,P_ptr); - } #if defined(HYPRE_USING_CUDA) || 
defined(HYPRE_USING_HIP) - else + HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) ); + if (exec == HYPRE_EXEC_DEVICE) { ierr = hypre_BoomerAMGBuildExtInterpDevice(A,CF_marker,S,num_cpts_global,num_functions,dof_func, debug_flag,trunc_factor,max_elmts,P_ptr); } + else #endif + { + ierr = hypre_BoomerAMGBuildExtInterpHost(A,CF_marker,S,num_cpts_global,num_functions,dof_func, + debug_flag,trunc_factor,max_elmts,P_ptr); + } #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) hypre_GpuProfilingPopRange(); @@ -5325,22 +5324,21 @@ hypre_BoomerAMGBuildExtPIInterp(hypre_ParCSRMatrix *A, hypre_GpuProfilingPushRange("ExtPIInterp"); #endif - HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) ); - HYPRE_Int ierr = 0; - if (exec == HYPRE_EXEC_HOST) - { - ierr = hypre_BoomerAMGBuildExtPIInterpHost(A, CF_marker, S, num_cpts_global, num_functions, dof_func, - debug_flag, trunc_factor, max_elmts, P_ptr); - } #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) - else + HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) ); + if (exec == HYPRE_EXEC_DEVICE) { ierr = hypre_BoomerAMGBuildExtPIInterpDevice(A, CF_marker, S, num_cpts_global, num_functions, dof_func, debug_flag, trunc_factor, max_elmts, P_ptr); } + else #endif + { + ierr = hypre_BoomerAMGBuildExtPIInterpHost(A, CF_marker, S, num_cpts_global, num_functions, dof_func, + debug_flag, trunc_factor, max_elmts, P_ptr); + } #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) hypre_GpuProfilingPopRange(); diff --git a/src/parcsr_ls/par_mod_lr_interp.c b/src/parcsr_ls/par_mod_lr_interp.c index bc0ecbe031..7cd946acd8 100644 --- a/src/parcsr_ls/par_mod_lr_interp.c +++ b/src/parcsr_ls/par_mod_lr_interp.c @@ -1170,23 +1170,22 @@ hypre_BoomerAMGBuildModExtPIInterp(hypre_ParCSRMatrix *A, hypre_GpuProfilingPushRange("ModExtPIInterp"); #endif - HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( 
hypre_ParCSRMatrixMemoryLocation(A) ); - HYPRE_Int ierr = 0; - if (exec == HYPRE_EXEC_HOST) - { - ierr = hypre_BoomerAMGBuildModExtPIInterpHost(A, CF_marker, S, num_cpts_global, - debug_flag, num_functions, dof_func, - trunc_factor, max_elmts, P_ptr); - } #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) - else + HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) ); + if (exec == HYPRE_EXEC_DEVICE) { ierr = hypre_BoomerAMGBuildExtPIInterpDevice(A, CF_marker, S, num_cpts_global, 1, NULL, debug_flag, trunc_factor, max_elmts, P_ptr); } + else #endif + { + ierr = hypre_BoomerAMGBuildModExtPIInterpHost(A, CF_marker, S, num_cpts_global, + debug_flag, num_functions, dof_func, + trunc_factor, max_elmts, P_ptr); + } #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) hypre_GpuProfilingPopRange(); diff --git a/src/seq_mv/csr_matvec.c b/src/seq_mv/csr_matvec.c index 90f57d44da..b86d1431bd 100644 --- a/src/seq_mv/csr_matvec.c +++ b/src/seq_mv/csr_matvec.c @@ -712,6 +712,10 @@ hypre_CSRMatrixMatvecOutOfPlace( HYPRE_Complex alpha, #if defined(HYPRE_USING_GPU) HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_CSRMatrixMemoryLocation(A) ); +/* WM: TODO - remove after sycl implementation in place */ +#if defined(HYPRE_USING_SYCL) + exec = HYPRE_EXEC_HOST; +#endif if (exec == HYPRE_EXEC_DEVICE) { ierr = hypre_CSRMatrixMatvecDevice(0, alpha, A, x, beta, b, y, offset); diff --git a/src/seq_mv/csr_matvec_device.c b/src/seq_mv/csr_matvec_device.c index 5ead8cb9b8..a6a09363e6 100644 --- a/src/seq_mv/csr_matvec_device.c +++ b/src/seq_mv/csr_matvec_device.c @@ -50,6 +50,8 @@ hypre_CSRMatrixMatvecDevice2( HYPRE_Int trans, hypre_CSRMatrixMatvecOMPOffload(trans, alpha, A, x, beta, y, offset); #elif defined(HYPRE_USING_ROCSPARSE) hypre_CSRMatrixMatvecRocsparse(trans, alpha, A, x, beta, y, offset); +#elif defined(HYPRE_USING_ONEMKLSPARSE) + hypre_CSRMatrixMatvecOnemklsparse(trans, alpha, A, x, beta, y, offset); #else // #ifdef 
HYPRE_USING_CUSPARSE // WM: TODO: commenting this out for now, but put it back after sycl implementation is done /* #error HYPRE SPMV TODO */ @@ -314,5 +316,20 @@ hypre_CSRMatrixMatvecRocsparse( HYPRE_Int trans, } #endif // #if defined(HYPRE_USING_ROCSPARSE) +#if defined(HYPRE_USING_ONEMKLSPARSE) +HYPRE_Int +hypre_CSRMatrixMatvecOnemklsparse( HYPRE_Int trans, + HYPRE_Complex alpha, + hypre_CSRMatrix *A, + hypre_Vector *x, + HYPRE_Complex beta, + hypre_Vector *y, + HYPRE_Int offset ) +{ +/* WM: TODO */ + return hypre_error_flag; +} +#endif // #if defined(HYPRE_USING_ONEMKLSPARSE) + + #endif // #if defined(HYPRE_USING_GPU) diff --git a/src/seq_mv/protos.h b/src/seq_mv/protos.h index f52e3836ed..898efc6154 100644 --- a/src/seq_mv/protos.h +++ b/src/seq_mv/protos.h @@ -89,6 +89,7 @@ HYPRE_Int hypre_CSRMatrixMatvecCusparseNewAPI( HYPRE_Int trans, HYPRE_Complex al HYPRE_Int hypre_CSRMatrixMatvecCusparseOldAPI( HYPRE_Int trans, HYPRE_Complex alpha, hypre_CSRMatrix *A, hypre_Vector *x, HYPRE_Complex beta, hypre_Vector *y, HYPRE_Int offset ); HYPRE_Int hypre_CSRMatrixMatvecOMPOffload (HYPRE_Int trans, HYPRE_Complex alpha , hypre_CSRMatrix *A , hypre_Vector *x , HYPRE_Complex beta , hypre_Vector *y, HYPRE_Int offset ); HYPRE_Int hypre_CSRMatrixMatvecRocsparse (HYPRE_Int trans, HYPRE_Complex alpha , hypre_CSRMatrix *A , hypre_Vector *x , HYPRE_Complex beta , hypre_Vector *y, HYPRE_Int offset ); +HYPRE_Int hypre_CSRMatrixMatvecOnemklsparse (HYPRE_Int trans, HYPRE_Complex alpha , hypre_CSRMatrix *A , hypre_Vector *x , HYPRE_Complex beta , hypre_Vector *y, HYPRE_Int offset ); /* genpart.c */ HYPRE_Int hypre_GeneratePartitioning ( HYPRE_BigInt length , HYPRE_Int num_procs , HYPRE_BigInt **part_ptr ); @@ -196,6 +197,8 @@ HYPRE_Int hypreDevice_CSRSpTransCusparse(HYPRE_Int m, HYPRE_Int n, HYPRE_Int nnz HYPRE_Int hypreDevice_CSRSpTransRocsparse(HYPRE_Int m, HYPRE_Int n, HYPRE_Int nnzA, HYPRE_Int *d_ia, HYPRE_Int *d_ja, HYPRE_Complex *d_aa, HYPRE_Int **d_ic_out, HYPRE_Int **d_jc_out,
HYPRE_Complex **d_ac_out, HYPRE_Int want_data); +HYPRE_Int hypreDevice_CSRSpTransOnemklsparse(HYPRE_Int m, HYPRE_Int n, HYPRE_Int nnzA, HYPRE_Int *d_ia, HYPRE_Int *d_ja, HYPRE_Complex *d_aa, HYPRE_Int **d_ic_out, HYPRE_Int **d_jc_out, HYPRE_Complex **d_ac_out, HYPRE_Int want_data); + HYPRE_Int hypreDevice_CSRSpGemm(hypre_CSRMatrix *A, hypre_CSRMatrix *B, hypre_CSRMatrix **C_ptr); HYPRE_Int hypreDevice_CSRSpGemmRownnzEstimate(HYPRE_Int m, HYPRE_Int k, HYPRE_Int n, HYPRE_Int *d_ia, HYPRE_Int *d_ja, HYPRE_Int *d_ib, HYPRE_Int *d_jb, HYPRE_Int *d_rc); diff --git a/src/seq_mv/seq_mv.h b/src/seq_mv/seq_mv.h index 2964c08a03..257b9a7b38 100644 --- a/src/seq_mv/seq_mv.h +++ b/src/seq_mv/seq_mv.h @@ -361,6 +361,7 @@ HYPRE_Int hypre_CSRMatrixMatvecCusparseNewAPI( HYPRE_Int trans, HYPRE_Complex al HYPRE_Int hypre_CSRMatrixMatvecCusparseOldAPI( HYPRE_Int trans, HYPRE_Complex alpha, hypre_CSRMatrix *A, hypre_Vector *x, HYPRE_Complex beta, hypre_Vector *y, HYPRE_Int offset ); HYPRE_Int hypre_CSRMatrixMatvecOMPOffload (HYPRE_Int trans, HYPRE_Complex alpha , hypre_CSRMatrix *A , hypre_Vector *x , HYPRE_Complex beta , hypre_Vector *y, HYPRE_Int offset ); HYPRE_Int hypre_CSRMatrixMatvecRocsparse (HYPRE_Int trans, HYPRE_Complex alpha , hypre_CSRMatrix *A , hypre_Vector *x , HYPRE_Complex beta , hypre_Vector *y, HYPRE_Int offset ); +HYPRE_Int hypre_CSRMatrixMatvecOnemklsparse (HYPRE_Int trans, HYPRE_Complex alpha , hypre_CSRMatrix *A , hypre_Vector *x , HYPRE_Complex beta , hypre_Vector *y, HYPRE_Int offset ); /* genpart.c */ HYPRE_Int hypre_GeneratePartitioning ( HYPRE_BigInt length , HYPRE_Int num_procs , HYPRE_BigInt **part_ptr ); @@ -468,6 +469,8 @@ HYPRE_Int hypreDevice_CSRSpTransCusparse(HYPRE_Int m, HYPRE_Int n, HYPRE_Int nnz HYPRE_Int hypreDevice_CSRSpTransRocsparse(HYPRE_Int m, HYPRE_Int n, HYPRE_Int nnzA, HYPRE_Int *d_ia, HYPRE_Int *d_ja, HYPRE_Complex *d_aa, HYPRE_Int **d_ic_out, HYPRE_Int **d_jc_out, HYPRE_Complex **d_ac_out, HYPRE_Int want_data); +HYPRE_Int 
hypreDevice_CSRSpTransOnemklsparse(HYPRE_Int m, HYPRE_Int n, HYPRE_Int nnzA, HYPRE_Int *d_ia, HYPRE_Int *d_ja, HYPRE_Complex *d_aa, HYPRE_Int **d_ic_out, HYPRE_Int **d_jc_out, HYPRE_Complex **d_ac_out, HYPRE_Int want_data); + HYPRE_Int hypreDevice_CSRSpGemm(hypre_CSRMatrix *A, hypre_CSRMatrix *B, hypre_CSRMatrix **C_ptr); HYPRE_Int hypreDevice_CSRSpGemmRownnzEstimate(HYPRE_Int m, HYPRE_Int k, HYPRE_Int n, HYPRE_Int *d_ia, HYPRE_Int *d_ja, HYPRE_Int *d_ib, HYPRE_Int *d_jb, HYPRE_Int *d_rc); From 99c5d9d72c1290375b21963e34699e98ba26b573 Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Thu, 4 Nov 2021 21:20:58 +0000 Subject: [PATCH 30/44] Add cmake compilation --- src/CMakeLists.txt | 58 ++++++++++++++++++++- src/IJ_mv/CMakeLists.txt | 8 +-- src/config/HYPREConfig.cmake.in | 1 + src/config/HYPRE_config.h.cmake.in | 3 ++ src/config/cmake/HYPRE_CMakeUtilities.cmake | 6 +++ src/parcsr_ls/CMakeLists.txt | 8 +-- src/parcsr_mv/CMakeLists.txt | 8 +-- src/seq_mv/CMakeLists.txt | 8 +-- src/sstruct_ls/CMakeLists.txt | 8 +-- src/sstruct_mv/CMakeLists.txt | 8 +-- src/struct_ls/CMakeLists.txt | 8 +-- src/struct_mv/CMakeLists.txt | 8 +-- src/utilities/CMakeLists.txt | 8 +-- 13 files changed, 102 insertions(+), 38 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7887360b9e..94a92e72b1 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -101,9 +101,11 @@ option(HYPRE_BUILD_TESTS "Build tests" OFF) option(HYPRE_USING_HOST_MEMORY "Use host memory" ON) set(HYPRE_WITH_EXTRA_CFLAGS "" CACHE STRING "Define extra C compile flags") set(HYPRE_WITH_EXTRA_CXXFLAGS "" CACHE STRING "Define extra CXX compile flags") -# CUDA options +# GPU options option(HYPRE_WITH_CUDA "Use CUDA. 
Require cuda-8.0 or higher" OFF) +option(HYPRE_WITH_SYCL "Use SYCL" OFF) option(HYPRE_ENABLE_UNIFIED_MEMORY "Use unified memory for allocating the memory" OFF) +# CUDA options option(HYPRE_ENABLE_CUDA_STREAMS "Use CUDA streams" ON) option(HYPRE_ENABLE_CUSPARSE "Use cuSPARSE" ON) option(HYPRE_ENABLE_DEVICE_POOL "Use device memory pool" OFF) @@ -280,6 +282,54 @@ if (HYPRE_WITH_CUDA) endif (CMAKE_CUDA_COMPILER) endif (HYPRE_WITH_CUDA) +# SYCL +if (HYPRE_WITH_SYCL) + enable_language(CXX) + message(STATUS "Enabled support for CXX.") + + # Enforce C++17 + if (NOT CMAKE_CXX_STANDARD OR CMAKE_CXX_STANDARD LESS 17) + set(CMAKE_CXX_STANDARD 17) + endif () + set(CMAKE_CXX_STANDARD_REQUIRED ON) + + message(STATUS "Using CXX standard: c++${CMAKE_CXX_STANDARD}") + + # Set CXX compiler to dpcpp + set(CMAKE_CXX_COMPILER "dpcpp") + # WM: try with/without the line below + # set(CMAKE_LINKER "dpcpp") + + # Add any extra CXX compiler flags HYPRE_WITH_EXTRA_CXXFLAGS + if (NOT HYPRE_WITH_EXTRA_CXXFLAGS STREQUAL "") + string(REPLACE " " ";" HYPRE_WITH_EXTRA_CXXFLAGS "${HYPRE_WITH_EXTRA_CXXFLAGS}") + add_compile_options("$<$:${HYPRE_WITH_EXTRA_CXXFLAGS}>") + endif () + + set(HYPRE_USING_SYCL ON CACHE BOOL "" FORCE) + set(HYPRE_USING_GPU ON CACHE BOOL "" FORCE) + + if (HYPRE_ENABLE_UNIFIED_MEMORY) + set(HYPRE_USING_UNIFIED_MEMORY ON CACHE BOOL "" FORCE) + else () + set(HYPRE_USING_DEVICE_MEMORY ON CACHE BOOL "" FORCE) + endif () + + # Check if examples are enabled, but not unified memory + if (HYPRE_BUILD_EXAMPLES AND NOT HYPRE_ENABLE_UNIFIED_MEMORY) + message(WARNING "Running the examples on GPUs requires Unified Memory! 
+ Examples will not be built!") + set(HYPRE_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE) + endif () + + add_compile_options("$<$:-fsycl>") + add_compile_options("$<$:-fsycl-unnamed-lambda>") + add_compile_options("$<$:-fsycl-device-code-split=per_kernel>") + + set(HYPRE_USING_HOST_MEMORY OFF CACHE BOOL "" FORCE) + +endif (HYPRE_WITH_SYCL) + # Add any extra C compiler flags HYPRE_WITH_EXTRA_CFLAGS if (NOT HYPRE_WITH_EXTRA_CFLAGS STREQUAL "") string(REPLACE " " ";" HYPRE_WITH_EXTRA_CFLAGS "${HYPRE_WITH_EXTRA_CFLAGS}") @@ -397,7 +447,11 @@ target_include_directories(${PROJECT_NAME} PUBLIC ) if (HYPRE_USING_CUDA) - set_source_files_properties(${HYPRE_CUDA_SOURCES} PROPERTIES LANGUAGE CUDA) + set_source_files_properties(${HYPRE_GPU_SOURCES} PROPERTIES LANGUAGE CUDA) +endif () + +if (HYPRE_USING_SYCL) + set_source_files_properties(${HYPRE_GPU_SOURCES} PROPERTIES LANGUAGE CXX) endif () # Set MPI compile flags diff --git a/src/IJ_mv/CMakeLists.txt b/src/IJ_mv/CMakeLists.txt index 5a7c4d5ec1..ba491c70cf 100644 --- a/src/IJ_mv/CMakeLists.txt +++ b/src/IJ_mv/CMakeLists.txt @@ -34,13 +34,13 @@ target_sources(${PROJECT_NAME} ${HDRS} ) -if (HYPRE_USING_CUDA) - set(CUDA_SRCS +if (HYPRE_USING_CUDA OR HYPRE_USING_SYCL) + set(GPU_SRCS IJMatrix_parcsr_device.c IJVector_parcsr_device.c ) - convert_filenames_to_full_paths(CUDA_SRCS) - set(HYPRE_CUDA_SOURCES ${HYPRE_CUDA_SOURCES} ${CUDA_SRCS} PARENT_SCOPE) + convert_filenames_to_full_paths(GPU_SRCS) + set(HYPRE_GPU_SOURCES ${HYPRE_GPU_SOURCES} ${GPU_SRCS} PARENT_SCOPE) endif () convert_filenames_to_full_paths(HDRS) diff --git a/src/config/HYPREConfig.cmake.in b/src/config/HYPREConfig.cmake.in index d9d19cc267..1445fec262 100644 --- a/src/config/HYPREConfig.cmake.in +++ b/src/config/HYPREConfig.cmake.in @@ -28,6 +28,7 @@ set(HYPRE_BUILD_EXAMPLES @HYPRE_BUILD_EXAMPLES@) set(HYPRE_BUILD_TESTS @HYPRE_BUILD_TESTS@) set(HYPRE_USING_HOST_MEMORY @HYPRE_USING_HOST_MEMORY@) set(HYPRE_WITH_CUDA @HYPRE_WITH_CUDA@) +set(HYPRE_WITH_SYCL @HYPRE_WITH_SYCL@) 
set(HYPRE_ENABLE_UNIFIED_MEMORY @HYPRE_ENABLE_UNIFIED_MEMORY@) set(HYPRE_ENABLE_CUDA_STREAMS @HYPRE_ENABLE_CUDA_STREAMS@) set(HYPRE_ENABLE_CUSPARSE @HYPRE_ENABLE_CUSPARSE@) diff --git a/src/config/HYPRE_config.h.cmake.in b/src/config/HYPRE_config.h.cmake.in index 86006a16bc..eb22ae7336 100644 --- a/src/config/HYPRE_config.h.cmake.in +++ b/src/config/HYPRE_config.h.cmake.in @@ -67,6 +67,9 @@ /* Use if executing on device with CUDA */ #cmakedefine HYPRE_USING_CUDA 1 +/* Use if executing on device with SYCL */ +#cmakedefine HYPRE_USING_SYCL 1 + /* Use cuBLAS */ #cmakedefine HYPRE_USING_CUBLAS 1 diff --git a/src/config/cmake/HYPRE_CMakeUtilities.cmake b/src/config/cmake/HYPRE_CMakeUtilities.cmake index 0a1e8c8be2..97e11c3c1a 100644 --- a/src/config/cmake/HYPRE_CMakeUtilities.cmake +++ b/src/config/cmake/HYPRE_CMakeUtilities.cmake @@ -25,6 +25,12 @@ function(add_hypre_executables EXE_SRCS) set_source_files_properties(${SRC_FILENAME} PROPERTIES LANGUAGE CUDA) endif (HYPRE_USING_CUDA) + if (HYPRE_USING_SYCL) + # If SYCL is enabled, tag source files to be compiled with dpcpp. 
+ set_source_files_properties(${SRC_FILENAME} PROPERTIES LANGUAGE CXX) + endif (HYPRE_USING_SYCL) + + string(REPLACE ".c" "" EXE_NAME ${SRC_FILENAME}) # Actually add the exe add_executable(${EXE_NAME} ${SRC_FILE}) diff --git a/src/parcsr_ls/CMakeLists.txt b/src/parcsr_ls/CMakeLists.txt index 045dea2545..8ce5945fad 100644 --- a/src/parcsr_ls/CMakeLists.txt +++ b/src/parcsr_ls/CMakeLists.txt @@ -143,8 +143,8 @@ target_sources(${PROJECT_NAME} ${HDRS} ) -if (HYPRE_USING_CUDA) - set(CUDA_SRCS +if (HYPRE_USING_CUDA OR HYPRE_USING_SYCL) + set(GPU_SRCS ams.c ads.c ame.c @@ -167,8 +167,8 @@ if (HYPRE_USING_CUDA) par_2s_interp_device.c par_relax_device.c ) - convert_filenames_to_full_paths(CUDA_SRCS) - set(HYPRE_CUDA_SOURCES ${HYPRE_CUDA_SOURCES} ${CUDA_SRCS} PARENT_SCOPE) + convert_filenames_to_full_paths(GPU_SRCS) + set(HYPRE_GPU_SOURCES ${HYPRE_GPU_SOURCES} ${GPU_SRCS} PARENT_SCOPE) endif () convert_filenames_to_full_paths(HDRS) diff --git a/src/parcsr_mv/CMakeLists.txt b/src/parcsr_mv/CMakeLists.txt index 6c40d366d1..ad1eca2fc4 100644 --- a/src/parcsr_mv/CMakeLists.txt +++ b/src/parcsr_mv/CMakeLists.txt @@ -43,16 +43,16 @@ target_sources(${PROJECT_NAME} ${HDRS} ) -if (HYPRE_USING_CUDA) - set(CUDA_SRCS +if (HYPRE_USING_CUDA OR HYPRE_USING_SYCL) + set(GPU_SRCS par_csr_matvec.c par_csr_fffc_device.c par_csr_matop_device.c par_csr_triplemat_device.c par_vector_device.c ) - convert_filenames_to_full_paths(CUDA_SRCS) - set(HYPRE_CUDA_SOURCES ${HYPRE_CUDA_SOURCES} ${CUDA_SRCS} PARENT_SCOPE) + convert_filenames_to_full_paths(GPU_SRCS) + set(HYPRE_GPU_SOURCES ${HYPRE_GPU_SOURCES} ${GPU_SRCS} PARENT_SCOPE) endif () convert_filenames_to_full_paths(HDRS) diff --git a/src/seq_mv/CMakeLists.txt b/src/seq_mv/CMakeLists.txt index 80942d36bf..af06738e3f 100644 --- a/src/seq_mv/CMakeLists.txt +++ b/src/seq_mv/CMakeLists.txt @@ -43,8 +43,8 @@ target_sources(${PROJECT_NAME} ${HDRS} ) -if (HYPRE_USING_CUDA) - set(CUDA_SRCS +if (HYPRE_USING_CUDA OR HYPRE_USING_SYCL) + set(GPU_SRCS 
csr_matop_device.c csr_matrix_cuda_utils.c csr_matvec_device.c @@ -62,8 +62,8 @@ if (HYPRE_USING_CUDA) csr_sptrans_device.c vector.c ) - convert_filenames_to_full_paths(CUDA_SRCS) - set(HYPRE_CUDA_SOURCES ${HYPRE_CUDA_SOURCES} ${CUDA_SRCS} PARENT_SCOPE) + convert_filenames_to_full_paths(GPU_SRCS) + set(HYPRE_GPU_SOURCES ${HYPRE_GPU_SOURCES} ${GPU_SRCS} PARENT_SCOPE) endif () convert_filenames_to_full_paths(HDRS) diff --git a/src/sstruct_ls/CMakeLists.txt b/src/sstruct_ls/CMakeLists.txt index 344360ce2b..d11a0908dd 100644 --- a/src/sstruct_ls/CMakeLists.txt +++ b/src/sstruct_ls/CMakeLists.txt @@ -79,8 +79,8 @@ target_sources(${PROJECT_NAME} ${HDRS} ) -if (HYPRE_USING_CUDA) - set(CUDA_SRCS +if (HYPRE_USING_CUDA OR HYPRE_USING_SYCL) + set(GPU_SRCS fac_amr_fcoarsen.c fac_amr_rap.c fac_restrict2.c @@ -88,8 +88,8 @@ if (HYPRE_USING_CUDA) fac_zero_stencilcoef.c node_relax.c ) - convert_filenames_to_full_paths(CUDA_SRCS) - set(HYPRE_CUDA_SOURCES ${HYPRE_CUDA_SOURCES} ${CUDA_SRCS} PARENT_SCOPE) + convert_filenames_to_full_paths(GPU_SRCS) + set(HYPRE_GPU_SOURCES ${HYPRE_GPU_SOURCES} ${GPU_SRCS} PARENT_SCOPE) endif () convert_filenames_to_full_paths(HDRS) diff --git a/src/sstruct_mv/CMakeLists.txt b/src/sstruct_mv/CMakeLists.txt index 8aeda925f9..013ffb6262 100644 --- a/src/sstruct_mv/CMakeLists.txt +++ b/src/sstruct_mv/CMakeLists.txt @@ -36,13 +36,13 @@ target_sources(${PROJECT_NAME} ${HDRS} ) -if (HYPRE_USING_CUDA) - set(CUDA_SRCS +if (HYPRE_USING_CUDA OR HYPRE_USING_SYCL) + set(GPU_SRCS sstruct_matrix.c sstruct_vector.c ) - convert_filenames_to_full_paths(CUDA_SRCS) - set(HYPRE_CUDA_SOURCES ${HYPRE_CUDA_SOURCES} ${CUDA_SRCS} PARENT_SCOPE) + convert_filenames_to_full_paths(GPU_SRCS) + set(HYPRE_GPU_SOURCES ${HYPRE_GPU_SOURCES} ${GPU_SRCS} PARENT_SCOPE) endif () convert_filenames_to_full_paths(HDRS) diff --git a/src/struct_ls/CMakeLists.txt b/src/struct_ls/CMakeLists.txt index 11d51c9eed..4c08db3a63 100644 --- a/src/struct_ls/CMakeLists.txt +++ b/src/struct_ls/CMakeLists.txt 
@@ -79,8 +79,8 @@ target_sources(${PROJECT_NAME} ${HDRS} ) -if (HYPRE_USING_CUDA) - set(CUDA_SRCS +if (HYPRE_USING_CUDA OR HYPRE_USING_SYCL) + set(GPU_SRCS cyclic_reduction.c HYPRE_struct_int.c HYPRE_struct_pcg.c @@ -108,8 +108,8 @@ if (HYPRE_USING_CUDA) sparse_msg_interp.c sparse_msg_restrict.c ) - convert_filenames_to_full_paths(CUDA_SRCS) - set(HYPRE_CUDA_SOURCES ${HYPRE_CUDA_SOURCES} ${CUDA_SRCS} PARENT_SCOPE) + convert_filenames_to_full_paths(GPU_SRCS) + set(HYPRE_GPU_SOURCES ${HYPRE_GPU_SOURCES} ${GPU_SRCS} PARENT_SCOPE) endif () convert_filenames_to_full_paths(HDRS) diff --git a/src/struct_mv/CMakeLists.txt b/src/struct_mv/CMakeLists.txt index b77c886313..8cc286f522 100644 --- a/src/struct_mv/CMakeLists.txt +++ b/src/struct_mv/CMakeLists.txt @@ -44,8 +44,8 @@ target_sources(${PROJECT_NAME} ${HDRS} ) -if (HYPRE_USING_CUDA) - set(CUDA_SRCS +if (HYPRE_USING_CUDA OR HYPRE_USING_SYCL) + set(GPU_SRCS struct_axpy.c struct_communication.c struct_copy.c @@ -55,8 +55,8 @@ if (HYPRE_USING_CUDA) struct_scale.c struct_vector.c ) - convert_filenames_to_full_paths(CUDA_SRCS) - set(HYPRE_CUDA_SOURCES ${HYPRE_CUDA_SOURCES} ${CUDA_SRCS} PARENT_SCOPE) + convert_filenames_to_full_paths(GPU_SRCS) + set(HYPRE_GPU_SOURCES ${HYPRE_GPU_SOURCES} ${GPU_SRCS} PARENT_SCOPE) endif () convert_filenames_to_full_paths(HDRS) diff --git a/src/utilities/CMakeLists.txt b/src/utilities/CMakeLists.txt index ef0f2923d4..3fbd1eefb6 100644 --- a/src/utilities/CMakeLists.txt +++ b/src/utilities/CMakeLists.txt @@ -51,8 +51,8 @@ target_sources(${PROJECT_NAME} ${HDRS} ) -if (HYPRE_USING_CUDA) - set(CUDA_SRCS +if (HYPRE_USING_CUDA OR HYPRE_USING_SYCL) + set(GPU_SRCS HYPRE_handle.c device_utils.c handle.c @@ -62,8 +62,8 @@ if (HYPRE_USING_CUDA) omp_device.c nvtx.c ) - convert_filenames_to_full_paths(CUDA_SRCS) - set(HYPRE_CUDA_SOURCES ${HYPRE_CUDA_SOURCES} ${CUDA_SRCS} PARENT_SCOPE) + convert_filenames_to_full_paths(GPU_SRCS) + set(HYPRE_GPU_SOURCES ${HYPRE_GPU_SOURCES} ${GPU_SRCS} PARENT_SCOPE) endif () 
convert_filenames_to_full_paths(HDRS) From ec8c5de3f50432fc9f8253059e14eae9a3519021 Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Thu, 4 Nov 2021 22:54:56 +0000 Subject: [PATCH 31/44] Some code cleanup --- src/CMakeLists.txt | 2 - src/seq_mv/csr_matvec_device.c | 5 +- src/struct_mv/_hypre_struct_mv.hpp | 4 - src/struct_mv/boxloop_sycl.h | 4 - src/utilities/_hypre_utilities.hpp | 132 +---------------------------- src/utilities/device_utils.c | 1 - src/utilities/device_utils.h | 132 +---------------------------- 7 files changed, 5 insertions(+), 275 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 94a92e72b1..aa545dfd1a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -297,8 +297,6 @@ if (HYPRE_WITH_SYCL) # Set CXX compiler to dpcpp set(CMAKE_CXX_COMPILER "dpcpp") - # WM: try with/without the line below - # set(CMAKE_LINKER "dpcpp") # Add any extra CXX compiler flags HYPRE_WITH_EXTRA_CXXFLAGS if (NOT HYPRE_WITH_EXTRA_CXXFLAGS STREQUAL "") diff --git a/src/seq_mv/csr_matvec_device.c b/src/seq_mv/csr_matvec_device.c index a6a09363e6..d36981c768 100644 --- a/src/seq_mv/csr_matvec_device.c +++ b/src/seq_mv/csr_matvec_device.c @@ -52,9 +52,10 @@ hypre_CSRMatrixMatvecDevice2( HYPRE_Int trans, hypre_CSRMatrixMatvecRocsparse(trans, alpha, A, x, beta, y, offset); #elif defined(HYPRE_USING_ONEMKLSPARSE) hypre_CSRMatrixMatvecOnemklsparse(trans, alpha, A, x, beta, y, offset); +// WM: TODO: remove trivial HYPRE_USING_SYCL branch after onemlksparse implementation is in +#elif defined(HYPRE_USING_SYCL) #else // #ifdef HYPRE_USING_CUSPARSE -// WM: TODO: commenting this out for now, but put it back after sycl impelentation is done -/* #error HYPRE SPMV TODO */ +#error HYPRE SPMV TODO #endif return hypre_error_flag; diff --git a/src/struct_mv/_hypre_struct_mv.hpp b/src/struct_mv/_hypre_struct_mv.hpp index d76cab3557..5224d03cab 100644 --- a/src/struct_mv/_hypre_struct_mv.hpp +++ b/src/struct_mv/_hypre_struct_mv.hpp @@ -1481,8 +1481,6 @@ else \ 
/* Reduction BoxLoop1 */ -/* WM: todo - is there a better way to handle the passing of sum_var (the variable where we want the reduction to end up)? */ -/* Right now, it is hardcoded as a HYPRE_Real */ #define hypre_newBoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, sum_var) \ { \ hypre_newBoxLoopInit(ndim, loop_size); \ @@ -1504,8 +1502,6 @@ else \ } /* Reduction BoxLoop2 */ -/* WM: todo - is there a better way to handle the passing of sum_var (the variable where we want the reduction to end up)? */ -/* Right now, it is hardcoded as a HYPRE_Real */ #define hypre_newBoxLoop2ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, \ dbox2, start2, stride2, i2, sum_var) \ { \ diff --git a/src/struct_mv/boxloop_sycl.h b/src/struct_mv/boxloop_sycl.h index b8a61a07ea..db076f049b 100644 --- a/src/struct_mv/boxloop_sycl.h +++ b/src/struct_mv/boxloop_sycl.h @@ -329,8 +329,6 @@ else \ /* Reduction BoxLoop1 */ -/* WM: todo - is there a better way to handle the passing of sum_var (the variable where we want the reduction to end up)? */ -/* Right now, it is hardcoded as a HYPRE_Real */ #define hypre_newBoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, sum_var) \ { \ hypre_newBoxLoopInit(ndim, loop_size); \ @@ -352,8 +350,6 @@ else \ } /* Reduction BoxLoop2 */ -/* WM: todo - is there a better way to handle the passing of sum_var (the variable where we want the reduction to end up)? 
*/ -/* Right now, it is hardcoded as a HYPRE_Real */ #define hypre_newBoxLoop2ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, \ dbox2, start2, stride2, i2, sum_var) \ { \ diff --git a/src/utilities/_hypre_utilities.hpp b/src/utilities/_hypre_utilities.hpp index 7914100efe..1ba80c8732 100644 --- a/src/utilities/_hypre_utilities.hpp +++ b/src/utilities/_hypre_utilities.hpp @@ -110,23 +110,8 @@ struct hypre_device_allocator #elif defined(HYPRE_USING_SYCL) -/* WM: todo - if the include for CL/sycl.hpp is inside extern "C++" {}, I get problems with sycl reductions... totally strange, but true */ +/* WM: problems with this being inside extern C++ {} */ /* #include */ -/* WM: todo - include below as necessary */ -/* #include */ -/* #include */ -/* #include */ -/* #include */ - -/* #include // dpct::remove_if, remove_copy_if, copy_if */ - -/* #include */ -/* #include */ -/* #include */ -/* #include */ - -/* #include */ -/* #include */ #endif // defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) @@ -285,7 +270,6 @@ struct hypre_DeviceData #endif #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) - /* WM: question - what is the device_allocator? */ hypre_device_allocator device_allocator; #endif #if defined(HYPRE_USING_SYCL) @@ -410,119 +394,7 @@ struct hypre_GpuMatData #endif //#if defined(HYPRE_USING_GPU) -/* WM: todo - is this how I want to integrate the functionality below? Do I really need all this? 
*/ -/* NOTE: It doesn't line up that nicely with the cuda/hip implementation since you need to pass item agrs */ #if defined(HYPRE_USING_SYCL) -/* return the number of work-items in current work-group */ -template -static __inline__ __attribute__((always_inline)) -hypre_int hypre_cuda_get_num_threads(sycl::nd_item& item) -{ - return item.get_group().get_local_linear_range(); -} - -/* return the flattened or linearlized work-item id in current work-group (not global)*/ -template -static __inline__ __attribute__((always_inline)) -hypre_int hypre_cuda_get_thread_id(sycl::nd_item& item) -{ - return item.get_local_linear_id(); -} - -/* return the number of sub-groups in current work-group */ -template -static __inline__ __attribute__((always_inline)) -hypre_int hypre_cuda_get_num_warps(sycl::nd_item& item) -{ - return item.get_sub_group().get_group_range().get(0); -} - -/* return the sub_group id in work-group */ -template -static __inline__ __attribute__((always_inline)) -hypre_int hypre_cuda_get_warp_id(sycl::nd_item& item) -{ - return item.get_sub_group().get_group_linear_id(); -} - -/* return the work-item lane id in a sub_group */ -template -static __inline__ __attribute__((always_inline)) -hypre_int hypre_cuda_get_lane_id(sycl::nd_item& item) -{ - return hypre_cuda_get_thread_id(item) & (item.get_sub_group().get_local_range().get(0)-1); -} - -/* return the num of work_groups in nd_range */ -template -static __inline__ __attribute__((always_inline)) -hypre_int hypre_cuda_get_num_blocks(sycl::nd_item& item) -{ - // return item.get_group().get_group_linear_range(); // API available in SYCL 2020 - - switch (dim) - { - case 1: - return (item.get_group_range(0)); - case 2: - return (item.get_group_range(0) * item.get_group_range(1)); - case 3: - return (item.get_group_range(0) * item.get_group_range(1) * item.get_group_range(2)); - } - - return -1; -} - -/* return the flattened or linearlized work-group id in nd_range */ -template -static __inline__ 
__attribute__((always_inline)) -hypre_int hypre_cuda_get_block_id(sycl::nd_item& item) -{ - return item.get_group_linear_id(); -} - -/* return the number of work-items in global iteration space*/ -template -static __inline__ __attribute__((always_inline)) -hypre_int hypre_cuda_get_grid_num_threads(sycl::nd_item& item) -{ - switch (dim) - { - case 1: - return (item.get_global_range(0)); - case 2: - return (item.get_global_range(0) * item.get_global_range(1)); - case 3: - return (item.get_global_range(0) * item.get_global_range(1) * item.get_global_range(2)); - } - - return -1; -} - -/* return the flattened work-item id in global iteration space */ -template -static __inline__ __attribute__((always_inline)) -hypre_int hypre_cuda_get_grid_thread_id(sycl::nd_item& item) -{ - return item.get_global_linear_id(); -} - -/* return the number of sub-groups in global iteration space */ -template -static __inline__ __attribute__((always_inline)) -hypre_int hypre_cuda_get_grid_num_warps(sycl::nd_item& item) -{ - return hypre_cuda_get_num_blocks(item) * hypre_cuda_get_num_warps(item); -} - -/* return the flattened sub-group id in global iteration space */ -template -static __inline__ __attribute__((always_inline)) -hypre_int hypre_cuda_get_grid_warp_id(sycl::nd_item& item) -{ - return hypre_cuda_get_block_id(item) * hypre_cuda_get_num_warps(item) + - hypre_cuda_get_warp_id(item); -} /* device_utils.c */ sycl::range<1> hypre_GetDefaultDeviceBlockDimension(); @@ -572,8 +444,6 @@ using namespace thrust::placeholders; #define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); } #elif defined(HYPRE_USING_HIP) #define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_HIP_CALL( hipGetLastError() ); } -#elif defined(HYPRE_USING_SYCL) -/* WM: todo? 
used below in HYPRE_CUDA_LAUNCH2 */ #endif #else // #if defined(HYPRE_DEBUG) #define GPU_LAUNCH_SYNC diff --git a/src/utilities/device_utils.c b/src/utilities/device_utils.c index 72a30c73be..5747fca82e 100644 --- a/src/utilities/device_utils.c +++ b/src/utilities/device_utils.c @@ -1228,7 +1228,6 @@ hypre_DeviceDataCreate() hypre_DeviceData *data = hypre_CTAlloc(hypre_DeviceData, 1, HYPRE_MEMORY_HOST); #if defined(HYPRE_USING_SYCL) - /* WM: does the default selector get a GPU if available? Having trouble with getting the device on frank, so temporarily just passing the default selector */ hypre_DeviceDataDevice(data) = sycl::device(sycl::default_selector{}); hypre_DeviceDataDeviceMaxWorkGroupSize(data) = hypre_DeviceDataDevice(data).get_info(); #else diff --git a/src/utilities/device_utils.h b/src/utilities/device_utils.h index 1dc3e0f0ff..2350c8c0e6 100644 --- a/src/utilities/device_utils.h +++ b/src/utilities/device_utils.h @@ -53,23 +53,8 @@ #elif defined(HYPRE_USING_SYCL) -/* WM: todo - if the include for CL/sycl.hpp is inside extern "C++" {}, I get problems with sycl reductions... totally strange, but true */ +/* WM: problems with this being inside extern C++ {} */ /* #include */ -/* WM: todo - include below as necessary */ -/* #include */ -/* #include */ -/* #include */ -/* #include */ - -/* #include // dpct::remove_if, remove_copy_if, copy_if */ - -/* #include */ -/* #include */ -/* #include */ -/* #include */ - -/* #include */ -/* #include */ #endif // defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) @@ -228,7 +213,6 @@ struct hypre_DeviceData #endif #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) - /* WM: question - what is the device_allocator? */ hypre_device_allocator device_allocator; #endif #if defined(HYPRE_USING_SYCL) @@ -353,119 +337,7 @@ struct hypre_GpuMatData #endif //#if defined(HYPRE_USING_GPU) -/* WM: todo - is this how I want to integrate the functionality below? Do I really need all this? 
*/ -/* NOTE: It doesn't line up that nicely with the cuda/hip implementation since you need to pass item agrs */ #if defined(HYPRE_USING_SYCL) -/* return the number of work-items in current work-group */ -template -static __inline__ __attribute__((always_inline)) -hypre_int hypre_cuda_get_num_threads(sycl::nd_item& item) -{ - return item.get_group().get_local_linear_range(); -} - -/* return the flattened or linearlized work-item id in current work-group (not global)*/ -template -static __inline__ __attribute__((always_inline)) -hypre_int hypre_cuda_get_thread_id(sycl::nd_item& item) -{ - return item.get_local_linear_id(); -} - -/* return the number of sub-groups in current work-group */ -template -static __inline__ __attribute__((always_inline)) -hypre_int hypre_cuda_get_num_warps(sycl::nd_item& item) -{ - return item.get_sub_group().get_group_range().get(0); -} - -/* return the sub_group id in work-group */ -template -static __inline__ __attribute__((always_inline)) -hypre_int hypre_cuda_get_warp_id(sycl::nd_item& item) -{ - return item.get_sub_group().get_group_linear_id(); -} - -/* return the work-item lane id in a sub_group */ -template -static __inline__ __attribute__((always_inline)) -hypre_int hypre_cuda_get_lane_id(sycl::nd_item& item) -{ - return hypre_cuda_get_thread_id(item) & (item.get_sub_group().get_local_range().get(0)-1); -} - -/* return the num of work_groups in nd_range */ -template -static __inline__ __attribute__((always_inline)) -hypre_int hypre_cuda_get_num_blocks(sycl::nd_item& item) -{ - // return item.get_group().get_group_linear_range(); // API available in SYCL 2020 - - switch (dim) - { - case 1: - return (item.get_group_range(0)); - case 2: - return (item.get_group_range(0) * item.get_group_range(1)); - case 3: - return (item.get_group_range(0) * item.get_group_range(1) * item.get_group_range(2)); - } - - return -1; -} - -/* return the flattened or linearlized work-group id in nd_range */ -template -static __inline__ 
__attribute__((always_inline)) -hypre_int hypre_cuda_get_block_id(sycl::nd_item& item) -{ - return item.get_group_linear_id(); -} - -/* return the number of work-items in global iteration space*/ -template -static __inline__ __attribute__((always_inline)) -hypre_int hypre_cuda_get_grid_num_threads(sycl::nd_item& item) -{ - switch (dim) - { - case 1: - return (item.get_global_range(0)); - case 2: - return (item.get_global_range(0) * item.get_global_range(1)); - case 3: - return (item.get_global_range(0) * item.get_global_range(1) * item.get_global_range(2)); - } - - return -1; -} - -/* return the flattened work-item id in global iteration space */ -template -static __inline__ __attribute__((always_inline)) -hypre_int hypre_cuda_get_grid_thread_id(sycl::nd_item& item) -{ - return item.get_global_linear_id(); -} - -/* return the number of sub-groups in global iteration space */ -template -static __inline__ __attribute__((always_inline)) -hypre_int hypre_cuda_get_grid_num_warps(sycl::nd_item& item) -{ - return hypre_cuda_get_num_blocks(item) * hypre_cuda_get_num_warps(item); -} - -/* return the flattened sub-group id in global iteration space */ -template -static __inline__ __attribute__((always_inline)) -hypre_int hypre_cuda_get_grid_warp_id(sycl::nd_item& item) -{ - return hypre_cuda_get_block_id(item) * hypre_cuda_get_num_warps(item) + - hypre_cuda_get_warp_id(item); -} /* device_utils.c */ sycl::range<1> hypre_GetDefaultDeviceBlockDimension(); @@ -515,8 +387,6 @@ using namespace thrust::placeholders; #define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); } #elif defined(HYPRE_USING_HIP) #define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_HIP_CALL( hipGetLastError() ); } -#elif defined(HYPRE_USING_SYCL) -/* WM: todo? 
used below in HYPRE_CUDA_LAUNCH2 */ #endif #else // #if defined(HYPRE_DEBUG) #define GPU_LAUNCH_SYNC From 3254e3187c5cf9ece1c430cdca925716a55efa25 Mon Sep 17 00:00:00 2001 From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com> Date: Thu, 4 Nov 2021 19:49:09 -0500 Subject: [PATCH 32/44] [SYCL] convert sycl::device to sycl::device* for better handling (#504) * [SYCL] convert sycl::device to sycl::device* for better handling and setting * [SYCL] fix ONEAPI warning and build issues * [SYCL] fix the setDevice method and address comments --- src/utilities/_hypre_utilities.hpp | 2 +- src/utilities/device_utils.c | 15 ++++--- src/utilities/device_utils.h | 2 +- src/utilities/general.c | 64 ++++++++++++++++++++---------- 4 files changed, 54 insertions(+), 29 deletions(-) diff --git a/src/utilities/_hypre_utilities.hpp b/src/utilities/_hypre_utilities.hpp index 1ba80c8732..b8addbad0b 100644 --- a/src/utilities/_hypre_utilities.hpp +++ b/src/utilities/_hypre_utilities.hpp @@ -273,7 +273,7 @@ struct hypre_DeviceData hypre_device_allocator device_allocator; #endif #if defined(HYPRE_USING_SYCL) - sycl::device device; + sycl::device* device; HYPRE_Int device_max_work_group_size; #else HYPRE_Int device; diff --git a/src/utilities/device_utils.c b/src/utilities/device_utils.c index 5747fca82e..b1bb63252b 100644 --- a/src/utilities/device_utils.c +++ b/src/utilities/device_utils.c @@ -964,14 +964,14 @@ hypre_DeviceDataStream(hypre_DeviceData *data, HYPRE_Int i) catch (sycl::exception const& ex) { std::cout << "Caught asynchronous SYCL exception:" << std::endl - << ex.what() << ", OpenCL code: " << ex.get_cl_code() << std::endl; + << ex.what() << ", SYCL code: " << ex.code() << std::endl; } } }; - sycl::device syclDev = data->device; - sycl::context syclctxt = sycl::context(syclDev, sycl_asynchandler); - stream = new sycl::queue(syclctxt, syclDev, sycl::property_list{sycl::property::queue::in_order{}}); + sycl::device* syclDev = data->device; + sycl::context syclctxt 
= sycl::context(*syclDev, sycl_asynchandler); + stream = new sycl::queue(syclctxt, *syclDev, sycl::property_list{sycl::property::queue::in_order{}}); data->streams[i] = stream; } #endif @@ -1019,7 +1019,7 @@ sycl::queue* hypre_DeviceDataComputeStream(hypre_DeviceData *data) { return hypre_DeviceDataStream(data, - hypre_DeviceDataComputeStreamNum(data)); + hypre_DeviceDataComputeStreamNum(data)); } #if defined(HYPRE_USING_CURAND) @@ -1228,7 +1228,9 @@ hypre_DeviceDataCreate() hypre_DeviceData *data = hypre_CTAlloc(hypre_DeviceData, 1, HYPRE_MEMORY_HOST); #if defined(HYPRE_USING_SYCL) - hypre_DeviceDataDevice(data) = sycl::device(sycl::default_selector{}); + /* WM: does the default selector get a GPU if available? Having trouble with getting the device on frank, so temporarily just passing the default selector */ + hypre_DeviceDataDevice(data) = nullptr; + hypre_DeviceDataDeviceMaxWorkGroupSize(data) = hypre_DeviceDataDevice(data).get_info(); #else hypre_DeviceDataDevice(data) = 0; @@ -1486,6 +1488,7 @@ hypre_bind_device( HYPRE_Int myid, /* get number of devices on this node */ hypre_GetDeviceCount(&nDevices); + /* TODO: ABB might need to look into this since nDevices are overwritten by 1 */ nDevices = 1; /* set device */ diff --git a/src/utilities/device_utils.h b/src/utilities/device_utils.h index 2350c8c0e6..e4e137ca14 100644 --- a/src/utilities/device_utils.h +++ b/src/utilities/device_utils.h @@ -216,7 +216,7 @@ struct hypre_DeviceData hypre_device_allocator device_allocator; #endif #if defined(HYPRE_USING_SYCL) - sycl::device device; + sycl::device* device; HYPRE_Int device_max_work_group_size; #else HYPRE_Int device; diff --git a/src/utilities/general.c b/src/utilities/general.c index 8ec1e818e1..0aed7d5252 100644 --- a/src/utilities/general.c +++ b/src/utilities/general.c @@ -100,8 +100,35 @@ hypre_SetDevice(hypre_int device_id, hypre_Handle *hypre_handle_) #endif #if defined(HYPRE_USING_SYCL) - /* sycl device set at construction of hypre_DeviceData object */ 
-#elif defined(HYPRE_USING_GPU) + HYPRE_Int nDevices=0; + hypre_GetDeviceCount(&nDevices); + if (device_id > nDevices) { + hypre_printf("ERROR: SYCL device-ID exceed the number of devices on-node... \n"); + } + + HYPRE_Int local_nDevices=0; + for (int i = 0; i < gpu_devices.size(); i++) { + // multi-tile GPUs + if (gpu_devices[i].get_info() > 0) { + auto subDevicesDomainNuma = gpu_devices[i].create_sub_devices(sycl::info::partition_affinity_domain::numa); + for (auto &tile : subDevicesDomainNuma) { + if (local_nDevices == device_id) { + hypre_HandleDevice(hypre_handle_) = &tile; + } + local_nDevices++; + } + } + // single-tile GPUs + else { + if (local_nDevices == device_id) { + hypre_HandleDevice(hypre_handle_) = &(gpu_devices[i]); + } + local_nDevices++; + } + } +#endif + +#if defined(HYPRE_USING_GPU) && !defined(HYPRE_USING_SYCL) if (hypre_handle_) { hypre_HandleDevice(hypre_handle_) = device_id; @@ -152,25 +179,20 @@ hypre_GetDeviceCount(hypre_int *device_count) #endif #if defined(HYPRE_USING_SYCL) - /* WM: todo - doesn't work on frank... 
commenting out */ - /* sycl::platform platform(sycl::gpu_selector{}); */ - /* auto const& gpu_devices = platform.get_devices(); */ - /* for (int i = 0; i < gpu_devices.size(); i++) */ - /* { */ - /* if (gpu_devices[i].is_gpu()) */ - /* { */ - /* if(gpu_devices[i].get_info() > 0) */ - /* { */ - /* auto subDevicesDomainNuma = gpu_devices[i].create_sub_devices( */ - /* sycl::info::partition_affinity_domain::numa); */ - /* (*device_count) += subDevicesDomainNuma.size(); */ - /* } */ - /* else */ - /* { */ - /* (*device_count)++; */ - /* } */ - /* } */ - /* } */ + sycl::platform platform(sycl::gpu_selector{}); + auto const& gpu_devices = platform.get_devices(sycl::info::device_type::gpu); + for (int i = 0; i < gpu_devices.size(); i++) + { + if(gpu_devices[i].get_info() > 0) + { + auto subDevicesDomainNuma = gpu_devices[i].create_sub_devices(sycl::info::partition_affinity_domain::numa); + (*device_count) += subDevicesDomainNuma.size(); + } + else + { + (*device_count)++; + } + } #endif return hypre_error_flag; From 68fc8be8dde7d244044ca5586dc6182252f744ca Mon Sep 17 00:00:00 2001 From: Abhishek Bagusetty Date: Tue, 7 Dec 2021 23:32:32 +0000 Subject: [PATCH 33/44] [SYCL] add complex types for device --- src/utilities/HYPRE_utilities.h | 10 ++++++++- src/utilities/complex.c | 36 ++++++++++++++++++++++++++------- 2 files changed, 38 insertions(+), 8 deletions(-) diff --git a/src/utilities/HYPRE_utilities.h b/src/utilities/HYPRE_utilities.h index 106e70ae5f..57b3dc11ec 100644 --- a/src/utilities/HYPRE_utilities.h +++ b/src/utilities/HYPRE_utilities.h @@ -83,7 +83,15 @@ typedef double HYPRE_Real; #endif #if defined(HYPRE_COMPLEX) -typedef double _Complex HYPRE_Complex; + +#if defined(HYPRE_USING_SYCL) + typedef std::complex HYPRE_Complex; +#elif defined(HYPRE_USING_GPU) + typedef thrust::complex HYPRE_Complex; +#else + typedef double _Complex HYPRE_Complex; +#endif + #define HYPRE_MPI_COMPLEX MPI_C_DOUBLE_COMPLEX /* or MPI_LONG_DOUBLE ? 
*/ #else /* default */ diff --git a/src/utilities/complex.c b/src/utilities/complex.c index eb8dca4f38..ba04d01577 100644 --- a/src/utilities/complex.c +++ b/src/utilities/complex.c @@ -9,30 +9,52 @@ #ifdef HYPRE_COMPLEX -#include - HYPRE_Complex hypre_conj( HYPRE_Complex value ) { - return conj(value); +#ifdef HYPRE_USING_SYCL + return std::conj(value); +#elif defined(HYPRE_USING_GPU) + return thrust::conj(value); +#else + return conj(value); +#endif } HYPRE_Real hypre_cabs( HYPRE_Complex value ) { - return cabs(value); +#ifdef HYPRE_USING_SYCL + return std::abs(value); +#elif defined(HYPRE_USING_GPU) + return thrust::abs(value); +#else + return cabs(value); +#endif } HYPRE_Real hypre_creal( HYPRE_Complex value ) { - return creal(value); +#ifdef HYPRE_USING_SYCL + return std::real(value); +#elif defined(HYPRE_USING_GPU) + return thrust::real(value); +#else + return creal(value); +#endif } HYPRE_Real hypre_cimag( HYPRE_Complex value ) { - return cimag(value); +#ifdef HYPRE_USING_SYCL + return std::imag(value); +#elif defined(HYPRE_USING_GPU) + return thrust::imag(value); +#else + return cimag(value); +#endif } -#endif +#endif // HYPRE_COMPLEX From b7ebf4eddf5643440616a8090bd4d9f98748c079 Mon Sep 17 00:00:00 2001 From: Abhishek Bagusetty Date: Wed, 8 Dec 2021 05:24:16 +0000 Subject: [PATCH 34/44] [SYCL] kernel launch macro --- src/IJ_mv/IJMatrix_parcsr_device.c | 4 +- src/IJ_mv/IJVector_parcsr_device.c | 2 +- src/parcsr_ls/ads.c | 12 +- src/parcsr_ls/ame.c | 2 +- src/parcsr_ls/ams.c | 40 +- src/parcsr_ls/par_2s_interp_device.c | 10 +- src/parcsr_ls/par_coarsen_device.c | 4 +- src/parcsr_ls/par_gauss_elim.c | 2 +- src/parcsr_ls/par_indepset_device.c | 4 +- src/parcsr_ls/par_interp_device.c | 8 +- src/parcsr_ls/par_interp_trunc_device.c | 2 +- src/parcsr_ls/par_lr_interp_device.c | 14 +- src/parcsr_ls/par_lr_restr_device.c | 2 +- src/parcsr_ls/par_mod_multi_interp_device.c | 18 +- src/parcsr_ls/par_relax_more_device.c | 2 +- src/parcsr_ls/par_strength_device.c | 4 +- 
src/parcsr_mv/par_csr_matop_device.c | 12 +- src/seq_mv/csr_matop_device.c | 2704 ++++++++++++++----- src/seq_mv/csr_matrix.c | 2 +- src/seq_mv/csr_spgemm_device_attempt.c | 6 +- src/seq_mv/csr_spgemm_device_confident.c | 4 +- src/seq_mv/csr_spgemm_device_rowbound.c | 6 +- src/seq_mv/csr_spgemm_device_rowest.c | 12 +- src/seq_mv/csr_spgemm_device_util.c | 4 +- src/seq_mv/csr_spmv_device.c | 10 +- src/struct_mv/_hypre_struct_mv.hpp | 4 +- src/struct_mv/boxloop_cuda.h | 4 +- src/utilities/_hypre_utilities.hpp | 457 +++- src/utilities/device_reducer.h | 2 +- src/utilities/device_utils.c | 24 +- src/utilities/device_utils.h | 455 +++- src/utilities/general.c | 2 + 32 files changed, 3053 insertions(+), 785 deletions(-) diff --git a/src/IJ_mv/IJMatrix_parcsr_device.c b/src/IJ_mv/IJMatrix_parcsr_device.c index 1760f3f0db..afedb24f5f 100644 --- a/src/IJ_mv/IJMatrix_parcsr_device.c +++ b/src/IJ_mv/IJMatrix_parcsr_device.c @@ -155,7 +155,7 @@ hypre_IJMatrixSetAddValuesParCSRDevice( hypre_IJMatrix *matrix, /* mark unwanted elements as -1 */ dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(len1, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IJMatrixValues_dev1, gDim, bDim, len1, indicator, (HYPRE_Int *) row_indexes, ncols, indicator ); + HYPRE_GPU_LAUNCH( hypreCUDAKernel_IJMatrixValues_dev1, gDim, bDim, len1, indicator, (HYPRE_Int *) row_indexes, ncols, indicator ); auto new_end = HYPRE_THRUST_CALL( copy_if, @@ -218,7 +218,7 @@ hypre_IJMatrixAssembleSortAndReduce1(HYPRE_Int N0, HYPRE_BigInt *I0, HYPRE_Big /* dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(N0, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IJMatrixAssembleSortAndReduce1, gDim, bDim, N0, I0, J0, X0, A0 ); + HYPRE_GPU_LAUNCH( hypreCUDAKernel_IJMatrixAssembleSortAndReduce1, gDim, bDim, N0, I0, J0, X0, A0 ); */ /* output X: 0: keep, 1: zero-out */ diff --git a/src/IJ_mv/IJVector_parcsr_device.c 
b/src/IJ_mv/IJVector_parcsr_device.c index b9afa8c67b..34cb5e8bd0 100644 --- a/src/IJ_mv/IJVector_parcsr_device.c +++ b/src/IJ_mv/IJVector_parcsr_device.c @@ -233,7 +233,7 @@ hypre_IJVectorAssembleParDevice(hypre_IJVector *vector) /* set/add to local vector */ dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(new_nnz, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IJVectorAssemblePar, gDim, bDim, new_nnz, new_data, new_i, vec_start, new_sora, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_IJVectorAssemblePar, gDim, bDim, new_nnz, new_data, new_i, vec_start, new_sora, hypre_VectorData(hypre_ParVectorLocalVector(par_vector)) ); hypre_TFree(new_i, HYPRE_MEMORY_DEVICE); diff --git a/src/parcsr_ls/ads.c b/src/parcsr_ls/ads.c index e8e87b9047..c6cdf716e1 100644 --- a/src/parcsr_ls/ads.c +++ b/src/parcsr_ls/ads.c @@ -576,12 +576,12 @@ HYPRE_Int hypre_ADSComputePi(hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(F2V_diag_nnz, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim, F2V_diag_nnz, 3, F2V_diag_J, Pi_diag_J ); gDim = hypre_GetDefaultDeviceGridDimension(F2V_diag_nrows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim, F2V_diag_nrows, 3, F2V_diag_I, NULL, RT100_data, RT010_data, RT001_data, Pi_diag_data ); } @@ -638,12 +638,12 @@ HYPRE_Int hypre_ADSComputePi(hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(F2V_offd_nnz, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim, F2V_offd_nnz, 3, F2V_offd_J, Pi_offd_J ); gDim = hypre_GetDefaultDeviceGridDimension(F2V_offd_nrows, 
"warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim, F2V_offd_nrows, 3, F2V_offd_I, NULL, RT100_data, RT010_data, RT001_data, Pi_offd_data ); } @@ -846,7 +846,7 @@ HYPRE_Int hypre_ADSComputePixyz(hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(F2V_diag_nrows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, F2V_diag_nrows, 3, F2V_diag_I, NULL, RT100_data, RT010_data, RT001_data, Pix_diag_data, Piy_diag_data, Piz_diag_data ); } @@ -926,7 +926,7 @@ HYPRE_Int hypre_ADSComputePixyz(hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(F2V_offd_nrows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, F2V_offd_nrows, 3, F2V_offd_I, NULL, RT100_data, RT010_data, RT001_data, Pix_offd_data, Piy_offd_data, Piz_offd_data ); } diff --git a/src/parcsr_ls/ame.c b/src/parcsr_ls/ame.c index eea0c6f9ae..e23f025e82 100644 --- a/src/parcsr_ls/ame.c +++ b/src/parcsr_ls/ame.c @@ -467,7 +467,7 @@ HYPRE_Int hypre_AMESetup(void *esolver) { dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(nv, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_GtEliminateBoundary, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_GtEliminateBoundary, gDim, bDim, nv, GtdI, GtdJ, GtdA, GtoI, GtoJ, GtoA, edge_bc, offd_edge_bc ); } else diff --git a/src/parcsr_ls/ams.c b/src/parcsr_ls/ams.c index 01fe07450d..470879835b 100644 --- a/src/parcsr_ls/ams.c +++ b/src/parcsr_ls/ams.c @@ -192,7 +192,7 @@ HYPRE_Int hypre_ParVectorBlockSplit(hypre_ParVector *x, { dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim 
= hypre_GetDefaultDeviceGridDimension(size_ * dim, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ParVectorBlockSplitGather<0>, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_ParVectorBlockSplitGather<0>, gDim, bDim, size_, dim, x_data_[0], x_data_[1], x_data_[2], x_data); } else @@ -235,7 +235,7 @@ HYPRE_Int hypre_ParVectorBlockGather(hypre_ParVector *x, { dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(size_ * dim, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ParVectorBlockSplitGather<1>, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_ParVectorBlockSplitGather<1>, gDim, bDim, size_, dim, x_data_[0], x_data_[1], x_data_[2], x_data); } else @@ -436,7 +436,7 @@ HYPRE_Int hypre_ParCSRMatrixFixZeroRowsDevice(hypre_ParCSRMatrix *A) bDim = hypre_GetDefaultDeviceBlockDimension(); gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); - HYPRE_CUDA_LAUNCH(hypreCUDAKernel_ParCSRMatrixFixZeroRows, gDim, bDim, + HYPRE_GPU_LAUNCH(hypreCUDAKernel_ParCSRMatrixFixZeroRows, gDim, bDim, nrows, A_diag_i, A_diag_j, A_diag_data, A_offd_i, A_offd_data, num_cols_offd); //hypre_SyncCudaComputeStream(hypre_handle()); @@ -763,7 +763,7 @@ HYPRE_Int hypre_ParCSRMatrixSetDiagRows(hypre_ParCSRMatrix *A, HYPRE_Real d) { dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_rows, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ParCSRMatrixSetDiagRows, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_ParCSRMatrixSetDiagRows, gDim, bDim, num_rows, A_diag_I, A_diag_J, A_diag_data, A_offd_I, num_cols_offd, d); } else @@ -1539,12 +1539,12 @@ HYPRE_Int hypre_AMSComputePi(hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nnz, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim, G_diag_nnz, 
dim, G_diag_J, Pi_diag_J ); gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nrows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim, G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, Gy_data, Gz_data, Pi_diag_data ); } @@ -1604,12 +1604,12 @@ HYPRE_Int hypre_AMSComputePi(hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nnz, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim, G_offd_nnz, dim, G_offd_J, Pi_offd_J ); gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nrows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim, G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, Gy_data, Gz_data, Pi_offd_data ); } @@ -1838,7 +1838,7 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nrows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, Gy_data, Gz_data, Pix_diag_data, Piy_diag_data, Piz_diag_data ); } @@ -1904,7 +1904,7 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nrows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, Gy_data, NULL, Pix_diag_data, Piy_diag_data, NULL ); } @@ -1962,7 +1962,7 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A, 
dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nrows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, NULL, NULL, Pix_diag_data, NULL, NULL ); } @@ -2039,7 +2039,7 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nrows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, Gy_data, Gz_data, Pix_offd_data, Piy_offd_data, Piz_offd_data ); } @@ -2121,7 +2121,7 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nrows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, Gy_data, NULL, Pix_offd_data, Piy_offd_data, NULL ); } @@ -2193,7 +2193,7 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nrows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, NULL, NULL, Pix_offd_data, NULL, NULL ); } @@ -2385,12 +2385,12 @@ HYPRE_Int hypre_AMSComputeGPi(hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nnz, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim, + 
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim, G_diag_nnz, dim, G_diag_J, GPi_diag_J ); gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nrows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputeGPi_copy2, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputeGPi_copy2, gDim, bDim, G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, Gy_data, Gz_data, GPi_diag_data ); } @@ -2451,12 +2451,12 @@ HYPRE_Int hypre_AMSComputeGPi(hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nnz, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim, G_offd_nnz, dim, G_offd_J, GPi_offd_J ); gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nrows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputeGPi_copy2, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputeGPi_copy2, gDim, bDim, G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, Gy_data, Gz_data, GPi_offd_data ); } @@ -2681,7 +2681,7 @@ HYPRE_Int hypre_AMSSetup(void *solver, { dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(nv, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_FixInterNodes, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_FixInterNodes, gDim, bDim, nv, G0tdI, G0tdA, G0toI, G0toA, interior_nodes_data ); } else @@ -3246,7 +3246,7 @@ HYPRE_Int hypre_AMSSetup(void *solver, { dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(Gt_num_rows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSSetupScaleGGt, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSSetupScaleGGt, gDim, bDim, Gt_num_rows, Gt_diag_I, Gt_diag_J, Gt_diag_data, Gt_offd_I, Gt_offd_data, Gx_data, Gy_data, Gz_data ); } diff --git a/src/parcsr_ls/par_2s_interp_device.c b/src/parcsr_ls/par_2s_interp_device.c index 
15a497a04b..7e602d19ca 100644 --- a/src/parcsr_ls/par_2s_interp_device.c +++ b/src/parcsr_ls/par_2s_interp_device.c @@ -93,7 +93,7 @@ hypre_BoomerAMGBuildModPartialExtInterpDevice( hypre_ParCSRMatrix *A, dim3 gDim = hypre_GetDefaultDeviceGridDimension(A_nr_local, "warp", bDim); /* only for rows corresponding to F2 (notice flag == -1) */ - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_weak_rowsums, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_weak_rowsums, gDim, bDim, A_nr_local, A_offd_nnz > 0, @@ -144,7 +144,7 @@ hypre_BoomerAMGBuildModPartialExtInterpDevice( hypre_ParCSRMatrix *A, * diagnoally scale As_F2F (from both sides) and replace the diagonal */ gDim = hypre_GetDefaultDeviceGridDimension(AF2F_nr_local, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_MMInterpScaleAFF, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_MMInterpScaleAFF, gDim, bDim, AF2F_nr_local, hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(As_F2F)), @@ -312,7 +312,7 @@ hypre_BoomerAMGBuildModPartialExtPEInterpDevice( hypre_ParCSRMatrix *A, dlam = hypre_TAlloc(HYPRE_Complex, AFC_nr_local, HYPRE_MEMORY_DEVICE); dtmp = hypre_TAlloc(HYPRE_Complex, AFC_nr_local, HYPRE_MEMORY_DEVICE); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_dlam_dtmp, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_dlam_dtmp, gDim, bDim, AFC_nr_local, hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(As_FF)), @@ -367,7 +367,7 @@ hypre_BoomerAMGBuildModPartialExtPEInterpDevice( hypre_ParCSRMatrix *A, gDim = hypre_GetDefaultDeviceGridDimension(A_nr_local, "warp", bDim); /* only for rows corresponding to F2 (notice flag == -1) */ - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_weak_rowsums, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_weak_rowsums, gDim, bDim, A_nr_local, A_offd_nnz > 0, @@ -417,7 +417,7 @@ hypre_BoomerAMGBuildModPartialExtPEInterpDevice( hypre_ParCSRMatrix *A, * diagnoally scale As_F2F (from both sides) and replace the diagonal */ gDim = hypre_GetDefaultDeviceGridDimension(AF2F_nr_local, "warp", bDim); - HYPRE_CUDA_LAUNCH( 
hypreCUDAKernel_MMPEInterpScaleAFF, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_MMPEInterpScaleAFF, gDim, bDim, AF2F_nr_local, hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(As_F2F)), diff --git a/src/parcsr_ls/par_coarsen_device.c b/src/parcsr_ls/par_coarsen_device.c index 6c30741003..cadf8d13d0 100644 --- a/src/parcsr_ls/par_coarsen_device.c +++ b/src/parcsr_ls/par_coarsen_device.c @@ -324,7 +324,7 @@ hypre_PMISCoarseningInitDevice( hypre_ParCSRMatrix *S, /* in */ HYPRE_Int *new_end; /* init CF_marker_diag and measure_diag: remove some special nodes */ - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_PMISCoarseningInit, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_PMISCoarseningInit, gDim, bDim, num_rows_diag, CF_init, S_diag_i, S_offd_i, measure_diag, CF_marker_diag ); /* communicate for measure_offd */ @@ -487,7 +487,7 @@ hypre_PMISCoarseningUpdateCFDevice( hypre_ParCSRMatrix *S, /* in bDim = hypre_GetDefaultDeviceBlockDimension(); gDim = hypre_GetDefaultDeviceGridDimension(graph_diag_size, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_PMISCoarseningUpdateCF, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_PMISCoarseningUpdateCF, gDim, bDim, graph_diag_size, graph_diag, diff --git a/src/parcsr_ls/par_gauss_elim.c b/src/parcsr_ls/par_gauss_elim.c index 2a8c9f6189..d3612b6a69 100644 --- a/src/parcsr_ls/par_gauss_elim.c +++ b/src/parcsr_ls/par_gauss_elim.c @@ -420,7 +420,7 @@ HYPRE_Int hypre_dgemv_device(HYPRE_Int m, HYPRE_Int n, HYPRE_Int lda, HYPRE_Real dim3 bDim(BLOCK_SIZE, 1, 1); dim3 gDim = hypre_GetDefaultDeviceGridDimension(m, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_dgemv, gDim, bDim, m, n, lda, a, x, y ); + HYPRE_GPU_LAUNCH( hypreCUDAKernel_dgemv, gDim, bDim, m, n, lda, a, x, y ); return hypre_error_flag; } diff --git a/src/parcsr_ls/par_indepset_device.c b/src/parcsr_ls/par_indepset_device.c index bfebafebc1..6ddea58cd2 100644 --- a/src/parcsr_ls/par_indepset_device.c +++ b/src/parcsr_ls/par_indepset_device.c @@ -170,7 +170,7 @@ hypre_BoomerAMGIndepSetDevice( 
hypre_ParCSRMatrix *S, bDim = hypre_GetDefaultDeviceBlockDimension(); gDim = hypre_GetDefaultDeviceGridDimension(graph_diag_size, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IndepSetMain, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_IndepSetMain, gDim, bDim, graph_diag_size, graph_diag, measure_diag, measure_offd, S_diag_i, S_diag_j, S_offd_i, S_offd_j, IS_marker_diag, IS_marker_offd, IS_offd_temp_mark ); @@ -186,7 +186,7 @@ hypre_BoomerAMGIndepSetDevice( hypre_ParCSRMatrix *S, /* adjust IS_marker_diag from the received */ gDim = hypre_GetDefaultDeviceGridDimension(num_elmts_send, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IndepSetFixMarker, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_IndepSetFixMarker, gDim, bDim, IS_marker_diag, num_elmts_send, send_map_elmts, int_send_buf, IS_offd_temp_mark ); diff --git a/src/parcsr_ls/par_interp_device.c b/src/parcsr_ls/par_interp_device.c index 8a2d4dc0cd..bd410cce36 100644 --- a/src/parcsr_ls/par_interp_device.c +++ b/src/parcsr_ls/par_interp_device.c @@ -178,7 +178,7 @@ hypre_BoomerAMGBuildDirInterpDevice( hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(n_fine, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypre_BoomerAMGBuildDirInterp_getnnz, gDim, bDim, + HYPRE_GPU_LAUNCH( hypre_BoomerAMGBuildDirInterp_getnnz, gDim, bDim, n_fine, S_diag_i, S_diag_j, S_offd_i, S_offd_j, CF_marker, CF_marker_offd, num_functions, dof_func_dev, dof_func_offd, P_diag_i, P_offd_i); @@ -209,7 +209,7 @@ hypre_BoomerAMGBuildDirInterpDevice( hypre_ParCSRMatrix *A, if (interp_type == 3) { - HYPRE_CUDA_LAUNCH( hypre_BoomerAMGBuildDirInterp_getcoef, gDim, bDim, + HYPRE_GPU_LAUNCH( hypre_BoomerAMGBuildDirInterp_getcoef, gDim, bDim, n_fine, A_diag_i, A_diag_j, A_diag_data, A_offd_i, A_offd_j, A_offd_data, hypre_ParCSRMatrixSocDiagJ(S), @@ -222,7 +222,7 @@ hypre_BoomerAMGBuildDirInterpDevice( hypre_ParCSRMatrix *A, } else { - HYPRE_CUDA_LAUNCH( 
hypre_BoomerAMGBuildDirInterp_getcoef_v2, gDim, bDim, + HYPRE_GPU_LAUNCH( hypre_BoomerAMGBuildDirInterp_getcoef_v2, gDim, bDim, n_fine, A_diag_i, A_diag_j, A_diag_data, A_offd_i, A_offd_j, A_offd_data, hypre_ParCSRMatrixSocDiagJ(S), @@ -1127,7 +1127,7 @@ hypre_BoomerAMGBuildInterpOnePntDevice( hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(n_fine, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypre_BoomerAMGBuildInterpOnePnt_getnnz, gDim, bDim, + HYPRE_GPU_LAUNCH( hypre_BoomerAMGBuildInterpOnePnt_getnnz, gDim, bDim, n_fine, A_diag_i, A_strong_diag_j, A_diag_a, A_offd_i, A_strong_offd_j, A_offd_a, CF_marker, CF_marker_offd, diag_compress_marker, offd_compress_marker, P_diag_i, P_diag_j_temp, P_offd_i, P_offd_j_temp); diff --git a/src/parcsr_ls/par_interp_trunc_device.c b/src/parcsr_ls/par_interp_trunc_device.c index 4524f91f9e..6b6d7a8d3f 100644 --- a/src/parcsr_ls/par_interp_trunc_device.c +++ b/src/parcsr_ls/par_interp_trunc_device.c @@ -159,7 +159,7 @@ hypre_BoomerAMGInterpTruncationDevice( hypre_ParCSRMatrix *P, HYPRE_Real trunc_f dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_InterpTruncation, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_InterpTruncation, gDim, bDim, nrows, trunc_factor, max_elmts, P_rowptr, P_j, P_a ); /* build new P_diag and P_offd */ diff --git a/src/parcsr_ls/par_lr_interp_device.c b/src/parcsr_ls/par_lr_interp_device.c index 43ac592e95..2587a9298e 100644 --- a/src/parcsr_ls/par_lr_interp_device.c +++ b/src/parcsr_ls/par_lr_interp_device.c @@ -69,7 +69,7 @@ hypre_BoomerAMGBuildExtInterpDevice(hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(A_nr_of_rows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_weak_rowsums, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_weak_rowsums, gDim, 
bDim, A_nr_of_rows, A_offd_nnz > 0, @@ -110,7 +110,7 @@ hypre_BoomerAMGBuildExtInterpDevice(hypre_ParCSRMatrix *A, /* 6. Form matrix ~{A_FC}, (return twAFC in AFC data structure) */ hypre_GpuProfilingPushRange("Compute interp matrix"); gDim = hypre_GetDefaultDeviceGridDimension(W_nr_of_rows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_aff_afc, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_aff_afc, gDim, bDim, W_nr_of_rows, hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(AFF)), @@ -255,7 +255,7 @@ hypre_BoomerAMGBuildExtPIInterpDevice( hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(A_nr_of_rows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_weak_rowsums, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_weak_rowsums, gDim, bDim, A_nr_of_rows, A_offd_nnz > 0, @@ -330,7 +330,7 @@ hypre_BoomerAMGBuildExtPIInterpDevice( hypre_ParCSRMatrix *A, hypre_GpuProfilingPushRange("Compute interp matrix"); gDim = hypre_GetDefaultDeviceGridDimension(W_nr_of_rows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_twiaff_w, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_twiaff_w, gDim, bDim, W_nr_of_rows, hypre_ParCSRMatrixFirstRowIndex(AFF), @@ -480,7 +480,7 @@ hypre_BoomerAMGBuildExtPEInterpDevice(hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(A_nr_of_rows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_weak_rowsums, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_weak_rowsums, gDim, bDim, A_nr_of_rows, A_offd_nnz > 0, @@ -523,7 +523,7 @@ hypre_BoomerAMGBuildExtPEInterpDevice(hypre_ParCSRMatrix *A, dtmp = hypre_TAlloc(HYPRE_Complex, W_nr_of_rows, HYPRE_MEMORY_DEVICE); hypre_GpuProfilingPushRange("Compute D_tmp"); gDim = hypre_GetDefaultDeviceGridDimension(W_nr_of_rows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_dlam_dtmp, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_dlam_dtmp, gDim, 
bDim, W_nr_of_rows, hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(AFF)), @@ -563,7 +563,7 @@ hypre_BoomerAMGBuildExtPEInterpDevice(hypre_ParCSRMatrix *A, /* 6. Form matrix ~{A_FC}, (return twAFC in AFC data structure) */ hypre_GpuProfilingPushRange("Compute interp matrix"); gDim = hypre_GetDefaultDeviceGridDimension(W_nr_of_rows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_aff_afc_epe, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_aff_afc_epe, gDim, bDim, W_nr_of_rows, hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(AFF)), diff --git a/src/parcsr_ls/par_lr_restr_device.c b/src/parcsr_ls/par_lr_restr_device.c index 104ec87451..60c2c4894c 100644 --- a/src/parcsr_ls/par_lr_restr_device.c +++ b/src/parcsr_ls/par_lr_restr_device.c @@ -247,7 +247,7 @@ hypre_BoomerAMGBuildRestrNeumannAIRDevice( hypre_ParCSRMatrix *A, /* assemble the diagonal part of R from Z */ dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(n_fine, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypre_BoomerAMGBuildRestrNeumannAIR_assembleRdiag, gDim, bDim, + HYPRE_GPU_LAUNCH( hypre_BoomerAMGBuildRestrNeumannAIR_assembleRdiag, gDim, bDim, n_cpts, Fmap, Cmap, Z_diag_i, Z_diag_j, Z_diag_a, R_diag_i, R_diag_j, R_diag_a); num_cols_offd_R = num_cols_offd_Z; diff --git a/src/parcsr_ls/par_mod_multi_interp_device.c b/src/parcsr_ls/par_mod_multi_interp_device.c index 3a62f6b813..25cbf2c9b3 100644 --- a/src/parcsr_ls/par_mod_multi_interp_device.c +++ b/src/parcsr_ls/par_mod_multi_interp_device.c @@ -309,7 +309,7 @@ hypre_BoomerAMGBuildModMultipassDevice( hypre_ParCSRMatrix *A, dim3 gDim = hypre_GetDefaultCUDAGridDimension(remaining, "warp", bDim); /* output diag_shifts is 0/1 indicating if points_left_dev[i] is picked in this pass */ - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_pass_order_count, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_pass_order_count, gDim, bDim, remaining, current_pass, @@ -403,7 +403,7 @@ hypre_BoomerAMGBuildModMultipassDevice( hypre_ParCSRMatrix *A, dim3 bDim = 
hypre_GetDefaultCUDABlockDimension(); dim3 gDim = hypre_GetDefaultCUDAGridDimension(n_fine, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_cfmarker_masked_rowsum, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_cfmarker_masked_rowsum, gDim, bDim, n_fine, A_diag_i, A_diag_j, A_diag_data, A_offd_i, A_offd_j, A_offd_data, CF_marker, @@ -555,7 +555,7 @@ hypre_BoomerAMGBuildModMultipassDevice( hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultCUDABlockDimension(); dim3 gDim = hypre_GetDefaultCUDAGridDimension(num_points, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_insert_remaining_weights, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_insert_remaining_weights, gDim, bDim, pass_starts[p + 1], pass_starts[p + 2], pass_order, Pi_diag_i, Pi_diag_j, Pi_diag_data, P_diag_i, P_diag_j, P_diag_data, @@ -618,7 +618,7 @@ hypre_BoomerAMGBuildModMultipassDevice( hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultCUDABlockDimension(); dim3 gDim = hypre_GetDefaultCUDAGridDimension(npoints, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_populate_big_P_offd_j, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_populate_big_P_offd_j, gDim, bDim, pass_starts[p + 1], pass_starts[p + 2], pass_order, @@ -853,7 +853,7 @@ hypre_GenerateMultipassPiDevice( hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultCUDABlockDimension(); dim3 gDim = hypre_GetDefaultCUDAGridDimension(num_points, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_generate_Pdiag_i_Poffd_i, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_generate_Pdiag_i_Poffd_i, gDim, bDim, num_points, color, pass_order, pass_marker, pass_marker_offd, S_diag_i, S_diag_j, S_offd_i, S_offd_j, P_diag_i, P_offd_i ); @@ -879,7 +879,7 @@ hypre_GenerateMultipassPiDevice( hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultCUDABlockDimension(); dim3 gDim = hypre_GetDefaultCUDAGridDimension(num_points, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_generate_Pdiag_j_Poffd_j, gDim, bDim, + HYPRE_GPU_LAUNCH( 
hypreCUDAKernel_generate_Pdiag_j_Poffd_j, gDim, bDim, num_points, color, pass_order, @@ -1101,7 +1101,7 @@ hypre_GenerateMultiPiDevice( hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultCUDABlockDimension(); dim3 gDim = hypre_GetDefaultCUDAGridDimension(num_points, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_generate_Pdiag_i_Poffd_i, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_generate_Pdiag_i_Poffd_i, gDim, bDim, num_points, color, pass_order, pass_marker, pass_marker_offd, S_diag_i, S_diag_j, S_offd_i, S_offd_j, Q_diag_i, Q_offd_i ); @@ -1128,7 +1128,7 @@ hypre_GenerateMultiPiDevice( hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultCUDABlockDimension(); dim3 gDim = hypre_GetDefaultCUDAGridDimension(num_points, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_generate_Qdiag_j_Qoffd_j, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_generate_Qdiag_j_Qoffd_j, gDim, bDim, num_points, color, pass_order, @@ -1199,7 +1199,7 @@ hypre_GenerateMultiPiDevice( hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultCUDABlockDimension(); dim3 gDim = hypre_GetDefaultCUDAGridDimension(num_points, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_mutli_pi_rowsum, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_mutli_pi_rowsum, gDim, bDim, num_points, pass_order, A_diag_i, A_diag_data, Pi_diag_i, Pi_diag_data, Pi_offd_i, Pi_offd_data, w_row_sum ); diff --git a/src/parcsr_ls/par_relax_more_device.c b/src/parcsr_ls/par_relax_more_device.c index 657905f3d9..f0a994b634 100644 --- a/src/parcsr_ls/par_relax_more_device.c +++ b/src/parcsr_ls/par_relax_more_device.c @@ -153,7 +153,7 @@ hypre_ParCSRMaxEigEstimateDevice( hypre_ParCSRMatrix *A, bDim = hypre_GetDefaultDeviceBlockDimension(); gDim = hypre_GetDefaultDeviceGridDimension(A_num_rows, "warp", bDim); - HYPRE_CUDA_LAUNCH(hypreCUDAKernel_CSRMaxEigEstimate, + HYPRE_GPU_LAUNCH(hypreCUDAKernel_CSRMaxEigEstimate, gDim, bDim, A_num_rows, diff --git a/src/parcsr_ls/par_strength_device.c b/src/parcsr_ls/par_strength_device.c 
index a2ca43fc8e..a63b8bd2b4 100644 --- a/src/parcsr_ls/par_strength_device.c +++ b/src/parcsr_ls/par_strength_device.c @@ -139,7 +139,7 @@ hypre_BoomerAMGCreateSDevice(hypre_ParCSRMatrix *A, if (abs_soc) { - HYPRE_CUDA_LAUNCH( hypre_BoomerAMGCreateSabs_rowcount, gDim, bDim, + HYPRE_GPU_LAUNCH( hypre_BoomerAMGCreateSabs_rowcount, gDim, bDim, num_variables, max_row_sum, strength_threshold, A_diag_data, A_diag_i, A_diag_j, A_offd_data, A_offd_i, A_offd_j, @@ -149,7 +149,7 @@ hypre_BoomerAMGCreateSDevice(hypre_ParCSRMatrix *A, } else { - HYPRE_CUDA_LAUNCH( hypre_BoomerAMGCreateS_rowcount, gDim, bDim, + HYPRE_GPU_LAUNCH( hypre_BoomerAMGCreateS_rowcount, gDim, bDim, num_variables, max_row_sum, strength_threshold, A_diag_data, A_diag_i, A_diag_j, A_offd_data, A_offd_i, A_offd_j, diff --git a/src/parcsr_mv/par_csr_matop_device.c b/src/parcsr_mv/par_csr_matop_device.c index 251e28d3a6..67aa26bcc7 100644 --- a/src/parcsr_mv/par_csr_matop_device.c +++ b/src/parcsr_mv/par_csr_matop_device.c @@ -620,7 +620,7 @@ hypre_ConcatDiagAndOffdDevice(hypre_ParCSRMatrix *A) const dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); const dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A_diag), "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ConcatDiagAndOffd, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_ConcatDiagAndOffd, gDim, bDim, hypre_CSRMatrixNumRows(A_diag), hypre_CSRMatrixNumCols(A_diag), @@ -735,7 +735,7 @@ hypre_ConcatDiagOffdAndExtDevice(hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_ParCSRMatrixNumRows(A), "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ConcatDiagAndOffd, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_ConcatDiagAndOffd, gDim, bDim, hypre_CSRMatrixNumRows(A_diag), hypre_CSRMatrixNumCols(A_diag), @@ -765,7 +765,7 @@ hypre_ConcatDiagOffdAndExtDevice(hypre_ParCSRMatrix *A, hypre_assert(hypre_CSRMatrixNumCols(E_diag) == hypre_CSRMatrixNumCols(A_diag)); - HYPRE_CUDA_LAUNCH( 
hypreCUDAKernel_ConcatDiagAndOffd, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_ConcatDiagAndOffd, gDim, bDim, hypre_CSRMatrixNumRows(E_diag), hypre_CSRMatrixNumCols(E_diag), @@ -1197,21 +1197,21 @@ hypre_ParCSRMatrixDropSmallEntriesDevice( hypre_ParCSRMatrix *A, if (type == -1) { - HYPRE_CUDA_LAUNCH( hypre_ParCSRMatrixDropSmallEntriesDevice_getElmtTols<-1>, gDim, bDim, + HYPRE_GPU_LAUNCH( hypre_ParCSRMatrixDropSmallEntriesDevice_getElmtTols<-1>, gDim, bDim, hypre_CSRMatrixNumRows(A_diag), tol, hypre_CSRMatrixI(A_diag), hypre_CSRMatrixJ(A_diag), hypre_CSRMatrixData(A_diag), hypre_CSRMatrixI(A_offd), hypre_CSRMatrixData(A_offd), elmt_tols_diag, elmt_tols_offd); } if (type == 1) { - HYPRE_CUDA_LAUNCH( hypre_ParCSRMatrixDropSmallEntriesDevice_getElmtTols<1>, gDim, bDim, + HYPRE_GPU_LAUNCH( hypre_ParCSRMatrixDropSmallEntriesDevice_getElmtTols<1>, gDim, bDim, hypre_CSRMatrixNumRows(A_diag), tol, hypre_CSRMatrixI(A_diag), hypre_CSRMatrixJ(A_diag), hypre_CSRMatrixData(A_diag), hypre_CSRMatrixI(A_offd), hypre_CSRMatrixData(A_offd), elmt_tols_diag, elmt_tols_offd); } if (type == 2) { - HYPRE_CUDA_LAUNCH( hypre_ParCSRMatrixDropSmallEntriesDevice_getElmtTols<2>, gDim, bDim, + HYPRE_GPU_LAUNCH( hypre_ParCSRMatrixDropSmallEntriesDevice_getElmtTols<2>, gDim, bDim, hypre_CSRMatrixNumRows(A_diag), tol, hypre_CSRMatrixI(A_diag), hypre_CSRMatrixJ(A_diag), hypre_CSRMatrixData(A_diag), hypre_CSRMatrixI(A_offd), hypre_CSRMatrixData(A_offd), elmt_tols_diag, elmt_tols_offd); diff --git a/src/seq_mv/csr_matop_device.c b/src/seq_mv/csr_matop_device.c index d4201d5a26..e76835cecb 100644 --- a/src/seq_mv/csr_matop_device.c +++ b/src/seq_mv/csr_matop_device.c @@ -109,6 +109,782 @@ hypre_GpuMatDataDestroy(hypre_GpuMatData *data) #endif /* #if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) */ +/* ABB: All the compute kernel implementations are grouped here */ +#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) + +__global__ void +hypreGPUKernel_CSRMoveDiagFirst( HYPRE_Int nrows, 
+ HYPRE_Int *ia, + HYPRE_Int *ja, + HYPRE_Complex *aa ) +{ + HYPRE_Int row = hypre_cuda_get_grid_warp_id<1,1>(); + + if (row >= nrows) + { + return; + } + + HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); + HYPRE_Int p = 0, q = 0; + + if (lane < 2) + { + p = read_only_load(ia + row + lane); + } + q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); + p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); + + for (HYPRE_Int j = p + lane + 1; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) + { + hypre_int find_diag = j < q && ja[j] == row; + + if (find_diag) + { + ja[j] = ja[p]; + ja[p] = row; + HYPRE_Complex tmp = aa[p]; + aa[p] = aa[j]; + aa[j] = tmp; + } + + if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) + { + break; + } + } +} + +/* check if diagonal entry is the first one at each row + * Return: the number of rows that do not have the first entry as diagonal + * RL: only check if it's a non-empty row + */ +__global__ void +hypreGPUKernel_CSRCheckDiagFirst( HYPRE_Int nrows, + HYPRE_Int *ia, + HYPRE_Int *ja, + HYPRE_Int *result ) +{ + const HYPRE_Int row = hypre_cuda_get_grid_thread_id<1,1>(); + if (row < nrows) + { + result[row] = (ia[row+1] > ia[row]) && (ja[ia[row]] != row); + } +} + +__global__ void +hypreGPUKernel_CSRMatrixFixZeroDiagDevice( HYPRE_Complex v, + HYPRE_Int nrows, + HYPRE_Int *ia, + HYPRE_Int *ja, + HYPRE_Complex *data, + HYPRE_Real tol, + HYPRE_Int *result ) +{ + const HYPRE_Int row = hypre_cuda_get_grid_warp_id<1,1>(); + + if (row >= nrows) + { + return; + } + + HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); + HYPRE_Int p = 0, q = 0; + bool has_diag = false; + + if (lane < 2) + { + p = read_only_load(ia + row + lane); + } + q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); + p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); + + for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) + { + hypre_int find_diag = j < q && read_only_load(&ja[j]) == row; + + if (find_diag) + { + if (fabs(data[j]) <= tol) + { + data[j] = v; + 
} + } + + if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) + { + has_diag = true; + break; + } + } + + if (result && !has_diag && lane == 0) + { + result[row] = 1; + } +} + +__global__ void +hypreGPUKernel_CSRMatrixReplaceDiagDevice( HYPRE_Complex *new_diag, + HYPRE_Complex v, + HYPRE_Int nrows, + HYPRE_Int *ia, + HYPRE_Int *ja, + HYPRE_Complex *data, + HYPRE_Real tol, + HYPRE_Int *result ) +{ + const HYPRE_Int row = hypre_cuda_get_grid_warp_id<1,1>(); + + if (row >= nrows) + { + return; + } + + HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); + HYPRE_Int p = 0, q = 0; + bool has_diag = false; + + if (lane < 2) + { + p = read_only_load(ia + row + lane); + } + q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); + p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); + + for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) + { + hypre_int find_diag = j < q && read_only_load(&ja[j]) == row; + + if (find_diag) + { + HYPRE_Complex d = read_only_load(&new_diag[row]); + if (fabs(d) <= tol) + { + d = v; + } + data[j] = d; + } + + if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) + { + has_diag = true; + break; + } + } + + if (result && !has_diag && lane == 0) + { + result[row] = 1; + } +} + +/* type == 0, sum, + * 1, abs sum (l-1) + * 2, square sum (l-2) + */ +template +__global__ void +hypreGPUKernel_CSRRowSum( HYPRE_Int nrows, + HYPRE_Int *ia, + HYPRE_Int *ja, + HYPRE_Complex *aa, + HYPRE_Int *CF_i, + HYPRE_Int *CF_j, + HYPRE_Complex *row_sum, + HYPRE_Complex scal, + HYPRE_Int set) +{ + HYPRE_Int row_i = hypre_cuda_get_grid_warp_id<1,1>(); + + if (row_i >= nrows) + { + return; + } + + HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); + HYPRE_Int p = 0, q = 0; + + if (lane < 2) + { + p = read_only_load(ia + row_i + lane); + } + + q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); + p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); + + HYPRE_Complex row_sum_i = 0.0; + + for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) { + 
if ( j >= q || (CF_i && CF_j && read_only_load(&CF_i[row_i]) != read_only_load(&CF_j[ja[j]])) ) + { + continue; + } + + HYPRE_Complex aii = aa[j]; + + if (type == 0) + { + row_sum_i += aii; + } + else if (type == 1) + { + row_sum_i += fabs(aii); + } + else if (type == 2) + { + row_sum_i += aii * aii; + } + } + + row_sum_i = warp_reduce_sum(row_sum_i); + + if (lane == 0) + { + if (set) + { + row_sum[row_i] = scal * row_sum_i; + } + else + { + row_sum[row_i] += scal * row_sum_i; + } + } +} + +/* type 0: diag + * 1: abs diag + * 2: diag inverse + * 3: diag inverse sqrt + * 4: abs diag inverse sqrt + */ +__global__ void +hypreGPUKernel_CSRExtractDiag( HYPRE_Int nrows, + HYPRE_Int *ia, + HYPRE_Int *ja, + HYPRE_Complex *aa, + HYPRE_Complex *d, + HYPRE_Int type) +{ + HYPRE_Int row = hypre_cuda_get_grid_warp_id<1,1>(); + + if (row >= nrows) + { + return; + } + + HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); + HYPRE_Int p = 0, q = 0; + + if (lane < 2) + { + p = read_only_load(ia + row + lane); + } + q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); + p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); + + HYPRE_Int has_diag = 0; + + for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) + { + hypre_int find_diag = j < q && ja[j] == row; + + if (find_diag) + { + if (type == 0) + { + d[row] = aa[j]; + } + else if (type == 1) + { + d[row] = fabs(aa[j]); + } + else if (type == 2) + { + d[row] = 1.0 / aa[j]; + } + else if (type == 3) + { + d[row] = 1.0 / sqrt(aa[j]); + } + else if (type == 4) + { + d[row] = 1.0 / sqrt(fabs(aa[j])); + } + } + + if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) + { + has_diag = 1; + break; + } + } + + if (!has_diag && lane == 0) + { + d[row] = 0.0; + } +} + +/* mark is of size nA + * diag_option: 1: special treatment for diag entries, mark as -2 + */ +__global__ void +hypreGPUKernel_CSRMatrixIntersectPattern(HYPRE_Int n, + HYPRE_Int nA, + HYPRE_Int *rowid, + HYPRE_Int *colid, + HYPRE_Int *idx, + HYPRE_Int *mark, + 
HYPRE_Int diag_option) +{ + HYPRE_Int i = hypre_cuda_get_grid_thread_id<1,1>(); + + if (i >= n) + { + return; + } + + HYPRE_Int r1 = read_only_load(&rowid[i]); + HYPRE_Int c1 = read_only_load(&colid[i]); + HYPRE_Int j = read_only_load(&idx[i]); + + if (0 == diag_option) + { + if (j < nA) + { + HYPRE_Int r2 = i < n - 1 ? read_only_load(&rowid[i + 1]) : -1; + HYPRE_Int c2 = i < n - 1 ? read_only_load(&colid[i + 1]) : -1; + if (r1 == r2 && c1 == c2) + { + mark[j] = c1; + } + else + { + mark[j] = -1; + } + } + } + else if (1 == diag_option) + { + if (j < nA) + { + if (r1 == c1) + { + mark[j] = -2; + } + else + { + HYPRE_Int r2 = i < n - 1 ? read_only_load(&rowid[i + 1]) : -1; + HYPRE_Int c2 = i < n - 1 ? read_only_load(&colid[i + 1]) : -1; + if (r1 == r2 && c1 == c2) + { + mark[j] = c1; + } + else + { + mark[j] = -1; + } + } + } + } +} + +#elif defined(HYPRE_USING_SYCL) + +void +hypreGPUKernel_CSRMoveDiagFirst( sycl::nd_item<1>& item, + HYPRE_Int nrows, + HYPRE_Int *ia, + HYPRE_Int *ja, + HYPRE_Complex *aa ) +{ + HYPRE_Int row = hypre_gpu_get_grid_warp_id<1,1>(item); + + if (row >= nrows) + { + return; + } + + HYPRE_Int lane = hypre_gpu_get_lane_id<1>(item); + HYPRE_Int p = 0, q = 0; + + if (lane < 2) + { + p = read_only_load(ia + row + lane); + } + sycl::ext::oneapi::sub_group SG = item.get_sub_group(); + q = SG.shuffle(p, 1); + p = SG.shuffle(p, 0); + + for (HYPRE_Int j = p + lane + 1; sycl::any_of_group(SG, j < q); j += HYPRE_WARP_SIZE) + { + hypre_int find_diag = j < q && ja[j] == row; + + if (find_diag) + { + ja[j] = ja[p]; + ja[p] = row; + HYPRE_Complex tmp = aa[p]; + aa[p] = aa[j]; + aa[j] = tmp; + } + + if ( sycl::any_of_group(SG, find_diag) ) + { + break; + } + } +} + +/* check if diagonal entry is the first one at each row + * Return: the number of rows that do not have the first entry as diagonal + * RL: only check if it's a non-empty row + */ +void +hypreGPUKernel_CSRCheckDiagFirst( sycl::nd_item<1>& item, + HYPRE_Int nrows, + HYPRE_Int *ia, + HYPRE_Int 
*ja, + HYPRE_Int *result ) +{ + const HYPRE_Int row = hypre_gpu_get_grid_thread_id<1,1>(item); + if (row < nrows) + { + result[row] = (ia[row+1] > ia[row]) && (ja[ia[row]] != row); + } +} + +void +hypreGPUKernel_CSRMatrixFixZeroDiagDevice( sycl::nd_item<1>& item, + HYPRE_Complex v, + HYPRE_Int nrows, + HYPRE_Int *ia, + HYPRE_Int *ja, + HYPRE_Complex *data, + HYPRE_Real tol, + HYPRE_Int *result ) +{ + const HYPRE_Int row = hypre_gpu_get_grid_warp_id<1,1>(item); + + if (row >= nrows) + { + return; + } + + HYPRE_Int lane = hypre_gpu_get_lane_id<1>(item); + HYPRE_Int p = 0, q = 0; + bool has_diag = false; + + if (lane < 2) + { + p = read_only_load(ia + row + lane); + } + sycl::ext::oneapi::sub_group SG = item.get_sub_group(); + q = SG.shuffle(p, 1); + p = SG.shuffle(p, 0); + + for (HYPRE_Int j = p + lane; sycl::any_of_group(SG, j < q); j += HYPRE_WARP_SIZE) + { + hypre_int find_diag = j < q && read_only_load(&ja[j]) == row; + + if (find_diag) + { + if (fabs(data[j]) <= tol) + { + data[j] = v; + } + } + + if ( sycl::any_of_group(SG, find_diag) ) + { + has_diag = true; + break; + } + } + + if (result && !has_diag && lane == 0) + { + result[row] = 1; + } +} + +void +hypreGPUKernel_CSRMatrixReplaceDiagDevice( sycl::nd_item<1>& item, + HYPRE_Complex *new_diag, + HYPRE_Complex v, + HYPRE_Int nrows, + HYPRE_Int *ia, + HYPRE_Int *ja, + HYPRE_Complex *data, + HYPRE_Real tol, + HYPRE_Int *result ) +{ + const HYPRE_Int row = hypre_gpu_get_grid_warp_id<1,1>(item); + + if (row >= nrows) + { + return; + } + + HYPRE_Int lane = hypre_gpu_get_lane_id<1>(item); + HYPRE_Int p = 0, q = 0; + bool has_diag = false; + + if (lane < 2) + { + p = read_only_load(ia + row + lane); + } + sycl::ext::oneapi::sub_group SG = item.get_sub_group(); + q = SG.shuffle(p, 1); + p = SG.shuffle(p, 0); + + for (HYPRE_Int j = p + lane; sycl::any_of_group(SG, j < q); j += HYPRE_WARP_SIZE) + { + hypre_int find_diag = j < q && read_only_load(&ja[j]) == row; + + if (find_diag) + { + HYPRE_Complex d = 
read_only_load(&new_diag[row]); + if (fabs(d) <= tol) + { + d = v; + } + data[j] = d; + } + + if ( sycl::any_of_group(SG, find_diag) ) + { + has_diag = true; + break; + } + } + + if (result && !has_diag && lane == 0) + { + result[row] = 1; + } +} + +/* type == 0, sum, + * 1, abs sum (l-1) + * 2, square sum (l-2) + */ +template +void +hypreGPUKernel_CSRRowSum( sycl::nd_item<1>& item, + HYPRE_Int nrows, + HYPRE_Int *ia, + HYPRE_Int *ja, + HYPRE_Complex *aa, + HYPRE_Int *CF_i, + HYPRE_Int *CF_j, + HYPRE_Complex *row_sum, + HYPRE_Complex scal, + HYPRE_Int set) +{ + HYPRE_Int row_i = hypre_gpu_get_grid_warp_id<1,1>(item); + + if (row_i >= nrows) + { + return; + } + + HYPRE_Int lane = hypre_gpu_get_lane_id<1>(item); + HYPRE_Int p = 0, q = 0; + + if (lane < 2) + { + p = read_only_load(ia + row_i + lane); + } + + sycl::ext::oneapi::sub_group SG = item.get_sub_group(); + q = SG.shuffle(p, 1); + p = SG.shuffle(p, 0); + + HYPRE_Complex row_sum_i = 0.0; + + for (HYPRE_Int j = p + lane; sycl::any_of_group(SG, j < q); j += HYPRE_WARP_SIZE) { + if ( j >= q || (CF_i && CF_j && read_only_load(&CF_i[row_i]) != read_only_load(&CF_j[ja[j]])) ) + { + continue; + } + + HYPRE_Complex aii = aa[j]; + + if (type == 0) + { + row_sum_i += aii; + } + else if (type == 1) + { + row_sum_i += fabs(aii); + } + else if (type == 2) + { + row_sum_i += aii * aii; + } + } + + row_sum_i = warp_reduce_sum(row_sum_i, item); + + if (lane == 0) + { + if (set) + { + row_sum[row_i] = scal * row_sum_i; + } + else + { + row_sum[row_i] += scal * row_sum_i; + } + } +} + +/* type 0: diag + * 1: abs diag + * 2: diag inverse + * 3: diag inverse sqrt + * 4: abs diag inverse sqrt + */ +void +hypreGPUKernel_CSRExtractDiag( sycl::nd_item<1>& item, + HYPRE_Int nrows, + HYPRE_Int *ia, + HYPRE_Int *ja, + HYPRE_Complex *aa, + HYPRE_Complex *d, + HYPRE_Int type) +{ + HYPRE_Int row = hypre_gpu_get_grid_warp_id<1,1>(item); + + if (row >= nrows) + { + return; + } + + HYPRE_Int lane = hypre_gpu_get_lane_id<1>(item); + HYPRE_Int p 
= 0, q = 0; + + if (lane < 2) + { + p = read_only_load(ia + row + lane); + } + sycl::ext::oneapi::sub_group SG = item.get_sub_group(); + q = SG.shuffle(p, 1); + p = SG.shuffle(p, 0); + + HYPRE_Int has_diag = 0; + + for (HYPRE_Int j = p + lane; sycl::any_of_group(SG, j < q); j += HYPRE_WARP_SIZE) + { + hypre_int find_diag = j < q && ja[j] == row; + + if (find_diag) + { + if (type == 0) + { + d[row] = aa[j]; + } + else if (type == 1) + { + d[row] = fabs(aa[j]); + } + else if (type == 2) + { + d[row] = 1.0 / aa[j]; + } + else if (type == 3) + { + d[row] = 1.0 / sqrt(aa[j]); + } + else if (type == 4) + { + d[row] = 1.0 / sqrt(fabs(aa[j])); + } + } + + if ( sycl::any_of_group(SG, find_diag) ) + { + has_diag = 1; + break; + } + } + + if (!has_diag && lane == 0) + { + d[row] = 0.0; + } +} + +/* mark is of size nA + * diag_option: 1: special treatment for diag entries, mark as -2 + */ +void +hypreGPUKernel_CSRMatrixIntersectPattern( sycl::nd_item<1>& item, + HYPRE_Int n, + HYPRE_Int nA, + HYPRE_Int *rowid, + HYPRE_Int *colid, + HYPRE_Int *idx, + HYPRE_Int *mark, + HYPRE_Int diag_option) +{ + HYPRE_Int i = hypre_gpu_get_grid_thread_id<1,1>(item); + + if (i >= n) + { + return; + } + + HYPRE_Int r1 = read_only_load(&rowid[i]); + HYPRE_Int c1 = read_only_load(&colid[i]); + HYPRE_Int j = read_only_load(&idx[i]); + + if (0 == diag_option) + { + if (j < nA) + { + HYPRE_Int r2 = i < n - 1 ? read_only_load(&rowid[i + 1]) : -1; + HYPRE_Int c2 = i < n - 1 ? read_only_load(&colid[i + 1]) : -1; + if (r1 == r2 && c1 == c2) + { + mark[j] = c1; + } + else + { + mark[j] = -1; + } + } + } + else if (1 == diag_option) + { + if (j < nA) + { + if (r1 == c1) + { + mark[j] = -2; + } + else + { + HYPRE_Int r2 = i < n - 1 ? read_only_load(&rowid[i + 1]) : -1; + HYPRE_Int c2 = i < n - 1 ? 
read_only_load(&colid[i + 1]) : -1; + if (r1 == r2 && c1 == c2) + { + mark[j] = c1; + } + else + { + mark[j] = -1; + } + } + } + } +} + +#endif // HYPRE_USING_SYCL + + #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) hypre_CSRMatrix* @@ -603,51 +1379,264 @@ hypre_CSRMatrixColNNzRealDevice( hypre_CSRMatrix *A, return hypre_error_flag; } -__global__ void -hypreCUDAKernel_CSRMoveDiagFirst( HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - HYPRE_Complex *aa ) + +HYPRE_Int +hypre_CSRMatrixMoveDiagFirstDevice( hypre_CSRMatrix *A ) +{ + HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + dim3 bDim, gDim; + + bDim = hypre_GetDefaultDeviceBlockDimension(); + gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); + + HYPRE_GPU_LAUNCH(hypreGPUKernel_CSRMoveDiagFirst, gDim, bDim, + nrows, A_i, A_j, A_data); + + hypre_SyncCudaComputeStream(hypre_handle()); + + return hypre_error_flag; +} + +HYPRE_Int +hypre_CSRMatrixCheckDiagFirstDevice( hypre_CSRMatrix *A ) +{ + if (hypre_CSRMatrixNumRows(A) != hypre_CSRMatrixNumCols(A)) + { + return 0; + } + + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "thread", bDim); + + HYPRE_Int *result = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRCheckDiagFirst, gDim, bDim, + hypre_CSRMatrixNumRows(A), + hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), result ); + + HYPRE_Int ierr = HYPRE_THRUST_CALL( reduce, + result, + result + hypre_CSRMatrixNumRows(A) ); + + hypre_TFree(result, HYPRE_MEMORY_DEVICE); + + hypre_SyncCudaComputeStream(hypre_handle()); + + return ierr; +} + +/* For square A, find numerical zeros (absolute values <= tol) on its diagonal and replace with v + * Does NOT assume diagonal is the first entry of each row of A + * In debug mode: + * Returns 
the number of rows that do not have diag in the pattern + * (i.e., structural zeroes on the diagonal) + */ +HYPRE_Int +hypre_CSRMatrixFixZeroDiagDevice( hypre_CSRMatrix *A, + HYPRE_Complex v, + HYPRE_Real tol ) +{ + HYPRE_Int ierr = 0; + + if (hypre_CSRMatrixNumRows(A) != hypre_CSRMatrixNumCols(A)) + { + return ierr; + } + + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "warp", bDim); + +#if HYPRE_DEBUG + HYPRE_Int *result = hypre_CTAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); +#else + HYPRE_Int *result = NULL; +#endif + + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRMatrixFixZeroDiagDevice, gDim, bDim, + v, hypre_CSRMatrixNumRows(A), + hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), hypre_CSRMatrixData(A), + tol, result ); + +#if HYPRE_DEBUG + ierr = HYPRE_THRUST_CALL( reduce, + result, + result + hypre_CSRMatrixNumRows(A) ); + + hypre_TFree(result, HYPRE_MEMORY_DEVICE); +#endif + + hypre_SyncCudaComputeStream(hypre_handle()); + + return ierr; +} + +HYPRE_Int +hypre_CSRMatrixReplaceDiagDevice( hypre_CSRMatrix *A, + HYPRE_Complex *new_diag, + HYPRE_Complex v, + HYPRE_Real tol ) +{ + HYPRE_Int ierr = 0; + + if (hypre_CSRMatrixNumRows(A) != hypre_CSRMatrixNumCols(A)) + { + return ierr; + } + + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "warp", bDim); + +#if HYPRE_DEBUG + HYPRE_Int *result = hypre_CTAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); +#else + HYPRE_Int *result = NULL; +#endif + + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRMatrixReplaceDiagDevice, gDim, bDim, + new_diag, v, hypre_CSRMatrixNumRows(A), + hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), hypre_CSRMatrixData(A), + tol, result ); + +#if HYPRE_DEBUG + ierr = HYPRE_THRUST_CALL( reduce, + result, + result + hypre_CSRMatrixNumRows(A) ); + + hypre_TFree(result, HYPRE_MEMORY_DEVICE); +#endif + + 
hypre_SyncCudaComputeStream(hypre_handle()); + + return ierr; +} + +typedef thrust::tuple Int2; +struct Int2Unequal : public thrust::unary_function +{ + __host__ __device__ + bool operator()(const Int2& t) const + { + return (thrust::get<0>(t) != thrust::get<1>(t)); + } +}; + +HYPRE_Int +hypre_CSRMatrixRemoveDiagonalDevice(hypre_CSRMatrix *A) +{ + HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); + HYPRE_Int nnz = hypre_CSRMatrixNumNonzeros(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_ii = hypreDevice_CsrRowPtrsToIndices(nrows, nnz, A_i); + HYPRE_Int new_nnz; + HYPRE_Int *new_ii; + HYPRE_Int *new_j; + HYPRE_Complex *new_data; + + new_nnz = HYPRE_THRUST_CALL( count_if, + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)) + nnz, + Int2Unequal() ); + + if (new_nnz == nnz) + { + /* no diagonal entries found */ + hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); + return hypre_error_flag; + } + + new_ii = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); + new_j = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); + + if (A_data) + { + new_data = hypre_TAlloc(HYPRE_Complex, new_nnz, HYPRE_MEMORY_DEVICE); + + thrust::zip_iterator< thrust::tuple > new_end; + + new_end = HYPRE_THRUST_CALL( copy_if, + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)), + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)) + nnz, + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), + thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j, new_data)), + Int2Unequal() ); + + hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == new_ii + new_nnz ); + } + else + { + new_data = NULL; + + thrust::zip_iterator< thrust::tuple > new_end; + + new_end = HYPRE_THRUST_CALL( copy_if, + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), + thrust::make_zip_iterator(thrust::make_tuple(A_ii, 
A_j)) + nnz, + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), + thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j)), + Int2Unequal() ); + + hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == new_ii + new_nnz ); + } + + hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_i, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_j, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_data, HYPRE_MEMORY_DEVICE); + + hypre_CSRMatrixNumNonzeros(A) = new_nnz; + hypre_CSRMatrixI(A) = hypreDevice_CsrRowIndicesToPtrs(nrows, new_nnz, new_ii); + hypre_CSRMatrixJ(A) = new_j; + hypre_CSRMatrixData(A) = new_data; + hypre_TFree(new_ii, HYPRE_MEMORY_DEVICE); + + return hypre_error_flag; +} + +void +hypre_CSRMatrixComputeRowSumDevice( hypre_CSRMatrix *A, + HYPRE_Int *CF_i, + HYPRE_Int *CF_j, + HYPRE_Complex *row_sum, + HYPRE_Int type, + HYPRE_Complex scal, + const char *set_or_add) { - HYPRE_Int row = hypre_cuda_get_grid_warp_id<1,1>(); + HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + dim3 bDim, gDim; - if (row >= nrows) + bDim = hypre_GetDefaultDeviceBlockDimension(); + gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); + + if (type == 0) { - return; + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRRowSum<0>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j, + row_sum, scal, set_or_add[0] == 's' ); } - - HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); - HYPRE_Int p = 0, q = 0; - - if (lane < 2) + else if (type == 1) { - p = read_only_load(ia + row + lane); + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRRowSum<1>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j, + row_sum, scal, set_or_add[0] == 's' ); } - q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); - p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); - - for (HYPRE_Int j = p + lane + 1; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) + else if (type == 2) { - hypre_int find_diag = j < q && ja[j] == 
row; - - if (find_diag) - { - ja[j] = ja[p]; - ja[p] = row; - HYPRE_Complex tmp = aa[p]; - aa[p] = aa[j]; - aa[j] = tmp; - } - - if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) - { - break; - } + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRRowSum<2>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j, + row_sum, scal, set_or_add[0] == 's' ); } + + hypre_SyncCudaComputeStream(hypre_handle()); } -HYPRE_Int -hypre_CSRMatrixMoveDiagFirstDevice( hypre_CSRMatrix *A ) +void +hypre_CSRMatrixExtractDiagonalDevice( hypre_CSRMatrix *A, + HYPRE_Complex *d, + HYPRE_Int type) { HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); HYPRE_Complex *A_data = hypre_CSRMatrixData(A); @@ -658,516 +1647,957 @@ hypre_CSRMatrixMoveDiagFirstDevice( hypre_CSRMatrix *A ) bDim = hypre_GetDefaultDeviceBlockDimension(); gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); - HYPRE_CUDA_LAUNCH(hypreCUDAKernel_CSRMoveDiagFirst, gDim, bDim, - nrows, A_i, A_j, A_data); + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRExtractDiag, gDim, bDim, nrows, A_i, A_j, A_data, d, type ); hypre_SyncCudaComputeStream(hypre_handle()); +} - return hypre_error_flag; +/* return C = [A; B] */ +hypre_CSRMatrix* +hypre_CSRMatrixStack2Device(hypre_CSRMatrix *A, hypre_CSRMatrix *B) +{ + hypre_assert( hypre_CSRMatrixNumCols(A) == hypre_CSRMatrixNumCols(B) ); + + hypre_CSRMatrix *C = hypre_CSRMatrixCreate( hypre_CSRMatrixNumRows(A) + hypre_CSRMatrixNumRows(B), + hypre_CSRMatrixNumCols(A), + hypre_CSRMatrixNumNonzeros(A) + hypre_CSRMatrixNumNonzeros(B) ); + + HYPRE_Int *C_i = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(C) + 1, HYPRE_MEMORY_DEVICE); + HYPRE_Int *C_j = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumNonzeros(C), HYPRE_MEMORY_DEVICE); + HYPRE_Complex *C_a = hypre_TAlloc(HYPRE_Complex, hypre_CSRMatrixNumNonzeros(C), HYPRE_MEMORY_DEVICE); + + hypre_TMemcpy(C_i, hypre_CSRMatrixI(A), HYPRE_Int, hypre_CSRMatrixNumRows(A) + 1, + HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(C_i + hypre_CSRMatrixNumRows(A) + 1, 
hypre_CSRMatrixI(B) + 1, HYPRE_Int, hypre_CSRMatrixNumRows(B), + HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + HYPRE_THRUST_CALL( transform, + C_i + hypre_CSRMatrixNumRows(A) + 1, + C_i + hypre_CSRMatrixNumRows(C) + 1, + thrust::make_constant_iterator(hypre_CSRMatrixNumNonzeros(A)), + C_i + hypre_CSRMatrixNumRows(A) + 1, + thrust::plus() ); + + hypre_TMemcpy(C_j, hypre_CSRMatrixJ(A), HYPRE_Int, hypre_CSRMatrixNumNonzeros(A), + HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(C_j + hypre_CSRMatrixNumNonzeros(A), hypre_CSRMatrixJ(B), HYPRE_Int, hypre_CSRMatrixNumNonzeros(B), + HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + + hypre_TMemcpy(C_a, hypre_CSRMatrixData(A), HYPRE_Complex, hypre_CSRMatrixNumNonzeros(A), + HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(C_a + hypre_CSRMatrixNumNonzeros(A), hypre_CSRMatrixData(B), HYPRE_Complex, hypre_CSRMatrixNumNonzeros(B), + HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + + hypre_CSRMatrixI(C) = C_i; + hypre_CSRMatrixJ(C) = C_j; + hypre_CSRMatrixData(C) = C_a; + hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE; + + return C; } -/* check if diagonal entry is the first one at each row - * Return: the number of rows that do not have the first entry as diagonal - * RL: only check if it's a non-empty row - */ -__global__ void -hypreCUDAKernel_CSRCheckDiagFirst( HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - HYPRE_Int *result ) +/* A = alp * I */ +hypre_CSRMatrix * +hypre_CSRMatrixIdentityDevice(HYPRE_Int n, HYPRE_Complex alp) { - const HYPRE_Int row = hypre_cuda_get_grid_thread_id<1,1>(); - if (row < nrows) + hypre_CSRMatrix *A = hypre_CSRMatrixCreate(n, n, n); + + hypre_CSRMatrixInitialize_v2(A, 0, HYPRE_MEMORY_DEVICE); + + HYPRE_THRUST_CALL( sequence, + hypre_CSRMatrixI(A), + hypre_CSRMatrixI(A) + n + 1, + 0 ); + + HYPRE_THRUST_CALL( sequence, + hypre_CSRMatrixJ(A), + hypre_CSRMatrixJ(A) + n, + 0 ); + + HYPRE_THRUST_CALL( fill, + hypre_CSRMatrixData(A), + hypre_CSRMatrixData(A) + n, + alp ); + + 
return A; +} + +/* this predicate compares first and second element in a tuple in absolute value */ +/* first is assumed to be complex, second to be real > 0 */ +struct cabsfirst_greaterthan_second_pred : public thrust::unary_function,bool> +{ + __host__ __device__ + bool operator()(const thrust::tuple& t) const { - result[row] = (ia[row+1] > ia[row]) && (ja[ia[row]] != row); + const HYPRE_Complex i = thrust::get<0>(t); + const HYPRE_Real j = thrust::get<1>(t); + + return hypre_cabs(i) > j; } -} +}; +/* drop the entries that are smaller than: + * tol if elmt_tols == null, + * elmt_tols[j] otherwise where j = 0...NumNonzeros(A) */ HYPRE_Int -hypre_CSRMatrixCheckDiagFirstDevice( hypre_CSRMatrix *A ) +hypre_CSRMatrixDropSmallEntriesDevice( hypre_CSRMatrix *A, + HYPRE_Real tol, + HYPRE_Real *elmt_tols) { - if (hypre_CSRMatrixNumRows(A) != hypre_CSRMatrixNumCols(A)) + HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); + HYPRE_Int nnz = hypre_CSRMatrixNumNonzeros(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_ii = NULL; + HYPRE_Int new_nnz = 0; + HYPRE_Int *new_ii; + HYPRE_Int *new_j; + HYPRE_Complex *new_data; + + if (elmt_tols == NULL) { - return 0; + new_nnz = HYPRE_THRUST_CALL( count_if, + A_data, + A_data + nnz, + thrust::not1(less_than(tol)) ); + } + else + { + new_nnz = HYPRE_THRUST_CALL( count_if, + thrust::make_zip_iterator(thrust::make_tuple(A_data, elmt_tols)), + thrust::make_zip_iterator(thrust::make_tuple(A_data, elmt_tols)) + nnz, + cabsfirst_greaterthan_second_pred() ); } - dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); - dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "thread", bDim); + if (new_nnz == nnz) + { + hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); + return hypre_error_flag; + } - HYPRE_Int *result = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRCheckDiagFirst, 
gDim, bDim, - hypre_CSRMatrixNumRows(A), - hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), result ); + if (!A_ii) + { + A_ii = hypreDevice_CsrRowPtrsToIndices(nrows, nnz, A_i); + } + new_ii = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); + new_j = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); + new_data = hypre_TAlloc(HYPRE_Complex, new_nnz, HYPRE_MEMORY_DEVICE); - HYPRE_Int ierr = HYPRE_THRUST_CALL( reduce, - result, - result + hypre_CSRMatrixNumRows(A) ); + thrust::zip_iterator< thrust::tuple > new_end; - hypre_TFree(result, HYPRE_MEMORY_DEVICE); + if (elmt_tols == NULL) + { + new_end = HYPRE_THRUST_CALL( copy_if, + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)), + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)) + nnz, + A_data, + thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j, new_data)), + thrust::not1(less_than(tol)) ); + } + else + { + new_end = HYPRE_THRUST_CALL( copy_if, + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)), + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)) + nnz, + thrust::make_zip_iterator(thrust::make_tuple(A_data, elmt_tols)), + thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j, new_data)), + cabsfirst_greaterthan_second_pred() ); + } - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == new_ii + new_nnz ); - return ierr; + hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_i, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_j, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_data, HYPRE_MEMORY_DEVICE); + + hypre_CSRMatrixNumNonzeros(A) = new_nnz; + hypre_CSRMatrixI(A) = hypreDevice_CsrRowIndicesToPtrs(nrows, new_nnz, new_ii); + hypre_CSRMatrixJ(A) = new_j; + hypre_CSRMatrixData(A) = new_data; + hypre_TFree(new_ii, HYPRE_MEMORY_DEVICE); + + return hypre_error_flag; } -__global__ void -hypreCUDAKernel_CSRMatrixFixZeroDiagDevice( HYPRE_Complex v, - HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - 
HYPRE_Complex *data, - HYPRE_Real tol, - HYPRE_Int *result ) +/* markA: array of size nnz(A), for pattern of (A and B), markA is the column indices as in A_J + * Otherwise, mark pattern not in A-B as -1 in markA + * Note the special treatment for diagonal entries of A (marked as -2) */ +HYPRE_Int +hypre_CSRMatrixIntersectPattern(hypre_CSRMatrix *A, + hypre_CSRMatrix *B, + HYPRE_Int *markA, + HYPRE_Int diag_opt) { - const HYPRE_Int row = hypre_cuda_get_grid_warp_id<1,1>(); + HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); + HYPRE_Int nnzA = hypre_CSRMatrixNumNonzeros(A); + HYPRE_Int nnzB = hypre_CSRMatrixNumNonzeros(B); - if (row >= nrows) - { - return; - } + HYPRE_Int *Cii = hypre_TAlloc(HYPRE_Int, nnzA + nnzB, HYPRE_MEMORY_DEVICE); + HYPRE_Int *Cjj = hypre_TAlloc(HYPRE_Int, nnzA + nnzB, HYPRE_MEMORY_DEVICE); + HYPRE_Int *idx = hypre_TAlloc(HYPRE_Int, nnzA + nnzB, HYPRE_MEMORY_DEVICE); - HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); - HYPRE_Int p = 0, q = 0; - bool has_diag = false; + hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnzA, hypre_CSRMatrixI(A), Cii); + hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnzB, hypre_CSRMatrixI(B), Cii + nnzA); + hypre_TMemcpy(Cjj, hypre_CSRMatrixJ(A), HYPRE_Int, nnzA, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(Cjj + nnzA, hypre_CSRMatrixJ(B), HYPRE_Int, nnzB, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + HYPRE_THRUST_CALL( sequence, idx, idx + nnzA + nnzB ); - if (lane < 2) - { - p = read_only_load(ia + row + lane); - } - q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); - p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); + HYPRE_THRUST_CALL( stable_sort_by_key, + thrust::make_zip_iterator(thrust::make_tuple(Cii, Cjj)), + thrust::make_zip_iterator(thrust::make_tuple(Cii, Cjj)) + nnzA + nnzB, + idx ); - for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) - { - hypre_int find_diag = j < q && read_only_load(&ja[j]) == row; + hypre_TMemcpy(markA, hypre_CSRMatrixJ(A), HYPRE_Int, nnzA, 
HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - if (find_diag) - { - if (fabs(data[j]) <= tol) - { - data[j] = v; - } - } + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(nnzA + nnzB, "thread", bDim); - if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) - { - has_diag = true; - break; - } - } + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRMatrixIntersectPattern, gDim, bDim, + nnzA + nnzB, nnzA, Cii, Cjj, idx, markA, diag_opt ); - if (result && !has_diag && lane == 0) - { - result[row] = 1; - } -} + hypre_TFree(Cii, HYPRE_MEMORY_DEVICE); + hypre_TFree(Cjj, HYPRE_MEMORY_DEVICE); + hypre_TFree(idx, HYPRE_MEMORY_DEVICE); -/* For square A, find numerical zeros (absolute values <= tol) on its diagonal and replace with v - * Does NOT assume diagonal is the first entry of each row of A - * In debug mode: - * Returns the number of rows that do not have diag in the pattern - * (i.e., structural zeroes on the diagonal) - */ -HYPRE_Int -hypre_CSRMatrixFixZeroDiagDevice( hypre_CSRMatrix *A, - HYPRE_Complex v, - HYPRE_Real tol ) -{ - HYPRE_Int ierr = 0; + return hypre_error_flag; +} - if (hypre_CSRMatrixNumRows(A) != hypre_CSRMatrixNumCols(A)) - { - return ierr; - } +#endif /* HYPRE_USING_CUDA || defined(HYPRE_USING_HIP) */ - dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); - dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "warp", bDim); +#if defined(HYPRE_USING_SYCL) -#if HYPRE_DEBUG - HYPRE_Int *result = hypre_CTAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); -#else - HYPRE_Int *result = NULL; -#endif +hypre_CSRMatrix* +hypre_CSRMatrixAddDevice ( HYPRE_Complex alpha, + hypre_CSRMatrix *A, + HYPRE_Complex beta, + hypre_CSRMatrix *B ) +{ + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + HYPRE_Int nrows_A = hypre_CSRMatrixNumRows(A); + HYPRE_Int ncols_A = hypre_CSRMatrixNumCols(A); + HYPRE_Int nnz_A = 
hypre_CSRMatrixNumNonzeros(A); + HYPRE_Complex *B_data = hypre_CSRMatrixData(B); + HYPRE_Int *B_i = hypre_CSRMatrixI(B); + HYPRE_Int *B_j = hypre_CSRMatrixJ(B); + HYPRE_Int nrows_B = hypre_CSRMatrixNumRows(B); + HYPRE_Int ncols_B = hypre_CSRMatrixNumCols(B); + HYPRE_Int nnz_B = hypre_CSRMatrixNumNonzeros(B); + HYPRE_Complex *C_data; + HYPRE_Int *C_i; + HYPRE_Int *C_j; + HYPRE_Int nnzC; + hypre_CSRMatrix *C; + + if (nrows_A != nrows_B || ncols_A != ncols_B) + { + hypre_error_w_msg(HYPRE_ERROR_GENERIC,"Warning! Incompatible matrix dimensions!\n"); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRMatrixFixZeroDiagDevice, gDim, bDim, - v, hypre_CSRMatrixNumRows(A), - hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), hypre_CSRMatrixData(A), - tol, result ); + return NULL; + } -#if HYPRE_DEBUG - ierr = HYPRE_THRUST_CALL( reduce, - result, - result + hypre_CSRMatrixNumRows(A) ); + hypreDevice_CSRSpAdd(nrows_A, nrows_B, ncols_A, nnz_A, nnz_B, + A_i, A_j, alpha, A_data, NULL, B_i, B_j, beta, B_data, NULL, NULL, + &nnzC, &C_i, &C_j, &C_data); - hypre_TFree(result, HYPRE_MEMORY_DEVICE); -#endif + C = hypre_CSRMatrixCreate(nrows_A, ncols_B, nnzC); + hypre_CSRMatrixI(C) = C_i; + hypre_CSRMatrixJ(C) = C_j; + hypre_CSRMatrixData(C) = C_data; + hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE; - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncCudaComputeStream(hypre_handle()); - return ierr; + return C; } -__global__ void -hypreCUDAKernel_CSRMatrixReplaceDiagDevice( HYPRE_Complex *new_diag, - HYPRE_Complex v, - HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - HYPRE_Complex *data, - HYPRE_Real tol, - HYPRE_Int *result ) +hypre_CSRMatrix* +hypre_CSRMatrixMultiplyDevice( hypre_CSRMatrix *A, + hypre_CSRMatrix *B) { - const HYPRE_Int row = hypre_cuda_get_grid_warp_id<1,1>(); + HYPRE_Int ncols_A = hypre_CSRMatrixNumCols(A); + HYPRE_Int nrows_B = hypre_CSRMatrixNumRows(B); + hypre_CSRMatrix *C; - if (row >= nrows) - { - return; - } + if (ncols_A != nrows_B) + { + hypre_printf("Warning! 
incompatible matrix dimensions!\n"); + hypre_error_w_msg(HYPRE_ERROR_GENERIC,"Warning! incompatible matrix dimensions!\n"); - HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); - HYPRE_Int p = 0, q = 0; - bool has_diag = false; + return NULL; + } - if (lane < 2) - { - p = read_only_load(ia + row + lane); - } - q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); - p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); + hypreDevice_CSRSpGemm(A, B, &C); - for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) - { - hypre_int find_diag = j < q && read_only_load(&ja[j]) == row; + hypre_SyncCudaComputeStream(hypre_handle()); - if (find_diag) - { - HYPRE_Complex d = read_only_load(&new_diag[row]); - if (fabs(d) <= tol) - { - d = v; - } - data[j] = d; - } + return C; +} - if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) - { - has_diag = true; - break; - } - } +hypre_CSRMatrix* +hypre_CSRMatrixTripleMultiplyDevice ( hypre_CSRMatrix *A, + hypre_CSRMatrix *B, + hypre_CSRMatrix *C ) +{ + hypre_CSRMatrix *BC = hypre_CSRMatrixMultiplyDevice(B, C); + hypre_CSRMatrix *ABC = hypre_CSRMatrixMultiplyDevice(A, BC); - if (result && !has_diag && lane == 0) - { - result[row] = 1; - } + hypre_CSRMatrixDestroy(BC); + + return ABC; } HYPRE_Int -hypre_CSRMatrixReplaceDiagDevice( hypre_CSRMatrix *A, - HYPRE_Complex *new_diag, - HYPRE_Complex v, - HYPRE_Real tol ) +hypre_CSRMatrixTriLowerUpperSolveDevice(char uplo, + hypre_CSRMatrix *A, + HYPRE_Real *l1_norms, + hypre_Vector *f, + hypre_Vector *u ) { - HYPRE_Int ierr = 0; +#if defined(HYPRE_USING_CUSPARSE) + hypre_CSRMatrixTriLowerUpperSolveCusparse(uplo, A, l1_norms, f, u); +#elif defined(HYPRE_USING_ROCSPARSE) + hypre_CSRMatrixTriLowerUpperSolveRocsparse(uplo, A, l1_norms, f, u); +#else + hypre_error_w_msg(HYPRE_ERROR_GENERIC, "hypre_CSRMatrixTriLowerUpperSolveDevice requires configuration with either cusparse or rocsparse\n"); +#endif + return hypre_error_flag; +} - if (hypre_CSRMatrixNumRows(A) != 
hypre_CSRMatrixNumCols(A)) - { - return ierr; - } +/* split CSR matrix B_ext (extended rows of parcsr B) into diag part and offd part + * corresponding to B. + * Input col_map_offd_B: + * Output col_map_offd_C: union of col_map_offd_B and offd-indices of Bext_offd + * map_B_to_C: mapping from col_map_offd_B to col_map_offd_C + */ - dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); - dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "warp", bDim); +HYPRE_Int +hypre_CSRMatrixSplitDevice( hypre_CSRMatrix *B_ext, + HYPRE_BigInt first_col_diag_B, + HYPRE_BigInt last_col_diag_B, + HYPRE_Int num_cols_offd_B, + HYPRE_BigInt *col_map_offd_B, + HYPRE_Int **map_B_to_C_ptr, + HYPRE_Int *num_cols_offd_C_ptr, + HYPRE_BigInt **col_map_offd_C_ptr, + hypre_CSRMatrix **B_ext_diag_ptr, + hypre_CSRMatrix **B_ext_offd_ptr ) +{ + HYPRE_Int num_rows = hypre_CSRMatrixNumRows(B_ext); + HYPRE_Int B_ext_nnz = hypre_CSRMatrixNumNonzeros(B_ext); + + HYPRE_Int *B_ext_ii = hypre_TAlloc(HYPRE_Int, B_ext_nnz, HYPRE_MEMORY_DEVICE); + hypreDevice_CsrRowPtrsToIndices_v2(num_rows, B_ext_nnz, hypre_CSRMatrixI(B_ext), B_ext_ii); + + HYPRE_Int B_ext_diag_nnz; + HYPRE_Int B_ext_offd_nnz; + HYPRE_Int ierr; + + ierr = hypre_CSRMatrixSplitDevice_core( 0, + num_rows, + B_ext_nnz, + NULL, + hypre_CSRMatrixBigJ(B_ext), + NULL, + NULL, + first_col_diag_B, + last_col_diag_B, + num_cols_offd_B, + NULL, + NULL, + NULL, + NULL, + &B_ext_diag_nnz, + NULL, + NULL, + NULL, + NULL, + &B_ext_offd_nnz, + NULL, + NULL, + NULL, + NULL ); + + HYPRE_Int *B_ext_diag_ii = hypre_TAlloc(HYPRE_Int, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); + HYPRE_Int *B_ext_diag_j = hypre_TAlloc(HYPRE_Int, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); + HYPRE_Complex *B_ext_diag_a = hypre_TAlloc(HYPRE_Complex, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); + + HYPRE_Int *B_ext_offd_ii = hypre_TAlloc(HYPRE_Int, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); + HYPRE_Int *B_ext_offd_j = hypre_TAlloc(HYPRE_Int, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); + 
HYPRE_Complex *B_ext_offd_a = hypre_TAlloc(HYPRE_Complex, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); + + ierr = hypre_CSRMatrixSplitDevice_core( 1, + num_rows, + B_ext_nnz, + B_ext_ii, + hypre_CSRMatrixBigJ(B_ext), + hypre_CSRMatrixData(B_ext), + NULL, + first_col_diag_B, + last_col_diag_B, + num_cols_offd_B, + col_map_offd_B, + map_B_to_C_ptr, + num_cols_offd_C_ptr, + col_map_offd_C_ptr, + &B_ext_diag_nnz, + B_ext_diag_ii, + B_ext_diag_j, + B_ext_diag_a, + NULL, + &B_ext_offd_nnz, + B_ext_offd_ii, + B_ext_offd_j, + B_ext_offd_a, + NULL ); + + hypre_TFree(B_ext_ii, HYPRE_MEMORY_DEVICE); + + /* convert to row ptrs */ + HYPRE_Int *B_ext_diag_i = hypreDevice_CsrRowIndicesToPtrs(num_rows, B_ext_diag_nnz, B_ext_diag_ii); + HYPRE_Int *B_ext_offd_i = hypreDevice_CsrRowIndicesToPtrs(num_rows, B_ext_offd_nnz, B_ext_offd_ii); + + hypre_TFree(B_ext_diag_ii, HYPRE_MEMORY_DEVICE); + hypre_TFree(B_ext_offd_ii, HYPRE_MEMORY_DEVICE); + + /* create diag and offd CSR */ + hypre_CSRMatrix *B_ext_diag = hypre_CSRMatrixCreate(num_rows, last_col_diag_B - first_col_diag_B + 1, B_ext_diag_nnz); + hypre_CSRMatrix *B_ext_offd = hypre_CSRMatrixCreate(num_rows, *num_cols_offd_C_ptr, B_ext_offd_nnz); + + hypre_CSRMatrixI(B_ext_diag) = B_ext_diag_i; + hypre_CSRMatrixJ(B_ext_diag) = B_ext_diag_j; + hypre_CSRMatrixData(B_ext_diag) = B_ext_diag_a; + hypre_CSRMatrixNumNonzeros(B_ext_diag) = B_ext_diag_nnz; + hypre_CSRMatrixMemoryLocation(B_ext_diag) = HYPRE_MEMORY_DEVICE; + + hypre_CSRMatrixI(B_ext_offd) = B_ext_offd_i; + hypre_CSRMatrixJ(B_ext_offd) = B_ext_offd_j; + hypre_CSRMatrixData(B_ext_offd) = B_ext_offd_a; + hypre_CSRMatrixNumNonzeros(B_ext_offd) = B_ext_offd_nnz; + hypre_CSRMatrixMemoryLocation(B_ext_offd) = HYPRE_MEMORY_DEVICE; + + *B_ext_diag_ptr = B_ext_diag; + *B_ext_offd_ptr = B_ext_offd; + + hypre_SyncCudaComputeStream(hypre_handle()); + + return ierr; +} -#if HYPRE_DEBUG - HYPRE_Int *result = hypre_CTAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); +HYPRE_Int 
+hypre_CSRMatrixSplitDevice_core( HYPRE_Int job, /* 0: query B_ext_diag_nnz and B_ext_offd_nnz; 1: the real computation */ + HYPRE_Int num_rows, + HYPRE_Int B_ext_nnz, + HYPRE_Int *B_ext_ii, /* Note: this is NOT row pointers as in CSR but row indices as in COO */ + HYPRE_BigInt *B_ext_bigj, /* Note: [BigInt] global column indices */ + HYPRE_Complex *B_ext_data, + char *B_ext_xata, /* companion data with B_ext_data; NULL if none */ + HYPRE_BigInt first_col_diag_B, + HYPRE_BigInt last_col_diag_B, + HYPRE_Int num_cols_offd_B, + HYPRE_BigInt *col_map_offd_B, + HYPRE_Int **map_B_to_C_ptr, + HYPRE_Int *num_cols_offd_C_ptr, + HYPRE_BigInt **col_map_offd_C_ptr, + HYPRE_Int *B_ext_diag_nnz_ptr, + HYPRE_Int *B_ext_diag_ii, /* memory allocated outside */ + HYPRE_Int *B_ext_diag_j, + HYPRE_Complex *B_ext_diag_data, + char *B_ext_diag_xata, /* companion with B_ext_diag_data_ptr; NULL if none */ + HYPRE_Int *B_ext_offd_nnz_ptr, + HYPRE_Int *B_ext_offd_ii, /* memory allocated outside */ + HYPRE_Int *B_ext_offd_j, + HYPRE_Complex *B_ext_offd_data, + char *B_ext_offd_xata /* companion with B_ext_offd_data_ptr; NULL if none */ ) +{ + HYPRE_Int B_ext_diag_nnz; + HYPRE_Int B_ext_offd_nnz; + HYPRE_BigInt *B_ext_diag_bigj = NULL; + HYPRE_BigInt *B_ext_offd_bigj = NULL; + HYPRE_BigInt *col_map_offd_C; + HYPRE_Int *map_B_to_C = NULL; + HYPRE_Int num_cols_offd_C; + + in_range pred1(first_col_diag_B, last_col_diag_B); + + /* get diag and offd nnz */ + if (job == 0) { + /* query the nnz's */ + B_ext_diag_nnz = HYPRE_ONEDPL_CALL( std::count_if, + B_ext_bigj, + B_ext_bigj + B_ext_nnz, + pred1 ); + B_ext_offd_nnz = B_ext_nnz - B_ext_diag_nnz; + + *B_ext_diag_nnz_ptr = B_ext_diag_nnz; + *B_ext_offd_nnz_ptr = B_ext_offd_nnz; + + return hypre_error_flag; + } + else { + B_ext_diag_nnz = *B_ext_diag_nnz_ptr; + B_ext_offd_nnz = *B_ext_offd_nnz_ptr; + } + + /* copy to diag */ + B_ext_diag_bigj = hypre_TAlloc(HYPRE_BigInt, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); + + if (B_ext_diag_xata) { + auto first = 
oneapi::dpl::make_zip_iterator(B_ext_ii, B_ext_bigj, B_ext_data, B_ext_xata); + auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, + first, /* first */ + first + B_ext_nnz, /* last */ + B_ext_bigj, /* stencil */ + oneapi::dpl::make_zip_iterator(B_ext_diag_ii, B_ext_diag_bigj, B_ext_diag_data, B_ext_diag_xata),/* result */ + pred1 ); + + //hypre_assert( std::get<0>(new_end.get_iterator_tuple() == B_ext_diag_ii + B_ext_diag_nnz ); + } + else { + auto first = oneapi::dpl::make_zip_iterator(B_ext_ii, B_ext_bigj, B_ext_data); + auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, + first, /* first */ + first + B_ext_nnz, /* last */ + B_ext_bigj, /* stencil */ + oneapi::dpl::make_zip_iterator(B_ext_diag_ii, B_ext_diag_bigj, B_ext_diag_data), /* result */ + pred1 ); + + //hypre_assert( std::get<0>(new_end.get_iterator_tuple()) == B_ext_diag_ii + B_ext_diag_nnz ); + } + + HYPRE_BigInt *const_iterator = hypre_TAlloc(HYPRE_BigInt, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); + hypre_DeviceDataComputeStream(hypre_handle())->fill(const_iterator, first_col_diag_B, B_ext_diag_nnz*sizeof(HYPRE_BigInt)).wait(); + HYPRE_ONEDPL_CALL( std::transform, + B_ext_diag_bigj, + B_ext_diag_bigj + B_ext_diag_nnz, + const_iterator, //dpct::make_constant_iterator(first_col_diag_B), + B_ext_diag_j, + std::minus() ); + hypre_TFree(const_iterator, HYPRE_MEMORY_DEVICE); + + hypre_TFree(B_ext_diag_bigj, HYPRE_MEMORY_DEVICE); + + /* copy to offd */ + B_ext_offd_bigj = hypre_TAlloc(HYPRE_BigInt, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); + + if (B_ext_offd_xata) { + auto first = oneapi::dpl::make_zip_iterator(B_ext_ii, B_ext_bigj, B_ext_data, B_ext_xata); + auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, + first, /* first */ + first + B_ext_nnz, /* last */ + B_ext_bigj, /* stencil */ + oneapi::dpl::make_zip_iterator(B_ext_offd_ii, B_ext_offd_bigj, B_ext_offd_data, B_ext_offd_xata), /* result */ + std::not1(pred1) ); + + // hypre_assert( std::get<0>(new_end.get_iterator_tuple()) == B_ext_offd_ii + B_ext_offd_nnz ); + } 
+ else { + auto first = oneapi::dpl::make_zip_iterator(B_ext_ii, B_ext_bigj, B_ext_data); + auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, + first, /* first */ + first + B_ext_nnz, /* last */ + B_ext_bigj, /* stencil */ + oneapi::dpl::make_zip_iterator(B_ext_offd_ii, B_ext_offd_bigj, B_ext_offd_data), /* result */ + std::not1(pred1) ); + + // hypre_assert( std::get<0>(new_end.get_iterator_tuple()) == B_ext_offd_ii + B_ext_offd_nnz ); + } + + /* offd map of B_ext_offd Union col_map_offd_B */ + col_map_offd_C = hypre_TAlloc(HYPRE_BigInt, B_ext_offd_nnz + num_cols_offd_B, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(col_map_offd_C, B_ext_offd_bigj, HYPRE_BigInt, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(col_map_offd_C + B_ext_offd_nnz, col_map_offd_B, HYPRE_BigInt, num_cols_offd_B, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + + HYPRE_ONEDPL_CALL( std::sort, + col_map_offd_C, + col_map_offd_C + B_ext_offd_nnz + num_cols_offd_B ); + + HYPRE_BigInt *new_end = HYPRE_ONEDPL_CALL( std::unique, + col_map_offd_C, + col_map_offd_C + B_ext_offd_nnz + num_cols_offd_B ); + + num_cols_offd_C = new_end - col_map_offd_C; + +#if 1 + HYPRE_BigInt *tmp = hypre_TAlloc(HYPRE_BigInt, num_cols_offd_C, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(tmp, col_map_offd_C, HYPRE_BigInt, num_cols_offd_C, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + hypre_TFree(col_map_offd_C, HYPRE_MEMORY_DEVICE); + col_map_offd_C = tmp; #else - HYPRE_Int *result = NULL; + col_map_offd_C = hypre_TReAlloc_v2(col_map_offd_C, HYPRE_BigInt, B_ext_offd_nnz + num_cols_offd_B, HYPRE_Int, num_cols_offd_C, HYPRE_MEMORY_DEVICE); #endif - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRMatrixReplaceDiagDevice, gDim, bDim, - new_diag, v, hypre_CSRMatrixNumRows(A), - hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), hypre_CSRMatrixData(A), - tol, result ); + /* create map from col_map_offd_B */ + if (num_cols_offd_B) { + map_B_to_C = hypre_TAlloc(HYPRE_Int, num_cols_offd_B, HYPRE_MEMORY_DEVICE); + HYPRE_ONEDPL_CALL( 
oneapi::dpl::lower_bound, + col_map_offd_C, + col_map_offd_C + num_cols_offd_C, + col_map_offd_B, + col_map_offd_B + num_cols_offd_B, + map_B_to_C ); + } + + HYPRE_ONEDPL_CALL( oneapi::dpl::lower_bound, + col_map_offd_C, + col_map_offd_C + num_cols_offd_C, + B_ext_offd_bigj, + B_ext_offd_bigj + B_ext_offd_nnz, + B_ext_offd_j ); + + hypre_TFree(B_ext_offd_bigj, HYPRE_MEMORY_DEVICE); + + if (map_B_to_C_ptr) { + *map_B_to_C_ptr = map_B_to_C; + } + *num_cols_offd_C_ptr = num_cols_offd_C; + *col_map_offd_C_ptr = col_map_offd_C; + + return hypre_error_flag; +} -#if HYPRE_DEBUG - ierr = HYPRE_THRUST_CALL( reduce, - result, - result + hypre_CSRMatrixNumRows(A) ); +/*-------------------------------------------------------------------------- + * hypre_CSRMatrixAddPartial: + * adds matrix rows in the CSR matrix B to the CSR Matrix A, where row_nums[i] + * defines to which row of A the i-th row of B is added, and returns a CSR Matrix C; + * Repeated row indices are allowed in row_nums + * Note: The routine does not check for 0-elements which might be generated + * through cancellation of elements in A and B or already contained + * in A and B. 
To remove those, use hypre_CSRMatrixDeleteZeros + *--------------------------------------------------------------------------*/ + +hypre_CSRMatrix* +hypre_CSRMatrixAddPartialDevice( hypre_CSRMatrix *A, + hypre_CSRMatrix *B, + HYPRE_Int *row_nums) +{ + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + HYPRE_Int nrows_A = hypre_CSRMatrixNumRows(A); + HYPRE_Int ncols_A = hypre_CSRMatrixNumCols(A); + HYPRE_Int nnz_A = hypre_CSRMatrixNumNonzeros(A); + HYPRE_Complex *B_data = hypre_CSRMatrixData(B); + HYPRE_Int *B_i = hypre_CSRMatrixI(B); + HYPRE_Int *B_j = hypre_CSRMatrixJ(B); + HYPRE_Int nrows_B = hypre_CSRMatrixNumRows(B); + HYPRE_Int ncols_B = hypre_CSRMatrixNumCols(B); + HYPRE_Int nnz_B = hypre_CSRMatrixNumNonzeros(B); + HYPRE_Complex *C_data; + HYPRE_Int *C_i; + HYPRE_Int *C_j; + HYPRE_Int nnzC; + hypre_CSRMatrix *C; + + if (ncols_A != ncols_B) + { + hypre_error_w_msg(HYPRE_ERROR_GENERIC,"Warning! incompatible matrix dimensions!\n"); + + return NULL; + } - hypre_TFree(result, HYPRE_MEMORY_DEVICE); -#endif + hypreDevice_CSRSpAdd(nrows_A, nrows_B, ncols_A, nnz_A, nnz_B, A_i, A_j, 1.0, A_data, NULL, B_i, B_j, 1.0, B_data, NULL, row_nums, + &nnzC, &C_i, &C_j, &C_data); - hypre_SyncCudaComputeStream(hypre_handle()); + C = hypre_CSRMatrixCreate(nrows_A, ncols_B, nnzC); + hypre_CSRMatrixI(C) = C_i; + hypre_CSRMatrixJ(C) = C_j; + hypre_CSRMatrixData(C) = C_data; + hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE; - return ierr; -} + hypre_SyncCudaComputeStream(hypre_handle()); -typedef thrust::tuple Int2; -struct Int2Unequal : public thrust::unary_function -{ - __host__ __device__ - bool operator()(const Int2& t) const - { - return (thrust::get<0>(t) != thrust::get<1>(t)); - } -}; + return C; +} HYPRE_Int -hypre_CSRMatrixRemoveDiagonalDevice(hypre_CSRMatrix *A) +hypre_CSRMatrixColNNzRealDevice( hypre_CSRMatrix *A, + HYPRE_Real *colnnz) { - HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); - 
HYPRE_Int nnz = hypre_CSRMatrixNumNonzeros(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_ii = hypreDevice_CsrRowPtrsToIndices(nrows, nnz, A_i); - HYPRE_Int new_nnz; - HYPRE_Int *new_ii; - HYPRE_Int *new_j; - HYPRE_Complex *new_data; + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + HYPRE_Int ncols_A = hypre_CSRMatrixNumCols(A); + HYPRE_Int nnz_A = hypre_CSRMatrixNumNonzeros(A); + HYPRE_Int *A_j_sorted; + HYPRE_Int num_reduced_col_indices; + HYPRE_Int *reduced_col_indices; + HYPRE_Int *reduced_col_nnz; + + A_j_sorted = hypre_TAlloc(HYPRE_Int, nnz_A, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(A_j_sorted, A_j, HYPRE_Int, nnz_A, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + HYPRE_ONEDPL_CALL(std::sort, A_j_sorted, A_j_sorted + nnz_A); + + reduced_col_indices = hypre_TAlloc(HYPRE_Int, ncols_A, HYPRE_MEMORY_DEVICE); + reduced_col_nnz = hypre_TAlloc(HYPRE_Int, ncols_A, HYPRE_MEMORY_DEVICE); + + // ABB: Replace values in-place with dpct::make_constant_iterator(1) + HYPRE_Int* values = hypre_TAlloc(HYPRE_Int, nnz_A, hypre_MEMORY_UNIFIED); + hypre_DeviceDataComputeStream(hypre_handle())->fill(values, 1, nnz_A*sizeof(HYPRE_Int)).wait(); + std::pair new_end = + HYPRE_ONEDPL_CALL( oneapi::dpl::reduce_by_segment, A_j_sorted, A_j_sorted + nnz_A, + values, + reduced_col_indices, + reduced_col_nnz ); + + hypre_assert(new_end.first - reduced_col_indices == new_end.second - reduced_col_nnz); + + num_reduced_col_indices = new_end.first - reduced_col_indices; + + hypre_Memset(colnnz, 0, ncols_A * sizeof(HYPRE_Real), HYPRE_MEMORY_DEVICE); + HYPRE_ONEDPL_CALL( oneapi::dpl::copy, reduced_col_nnz, reduced_col_nnz + num_reduced_col_indices, + oneapi::dpl::make_permutation_iterator(colnnz, reduced_col_indices) ); + + hypre_TFree(A_j_sorted, HYPRE_MEMORY_DEVICE); + hypre_TFree(reduced_col_indices, HYPRE_MEMORY_DEVICE); + hypre_TFree(reduced_col_nnz, HYPRE_MEMORY_DEVICE); + hypre_TFree(values, 
HYPRE_MEMORY_UNIFIED); + + hypre_SyncCudaComputeStream(hypre_handle()); + + return hypre_error_flag; +} - new_nnz = HYPRE_THRUST_CALL( count_if, - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)) + nnz, - Int2Unequal() ); - if (new_nnz == nnz) - { - /* no diagonal entries found */ - hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); - return hypre_error_flag; - } +HYPRE_Int +hypre_CSRMatrixMoveDiagFirstDevice( hypre_CSRMatrix *A ) +{ + HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + sycl::range<1> bDim, gDim; - new_ii = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); - new_j = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); + bDim = hypre_GetDefaultDeviceBlockDimension(); + gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); - if (A_data) - { - new_data = hypre_TAlloc(HYPRE_Complex, new_nnz, HYPRE_MEMORY_DEVICE); + HYPRE_GPU_LAUNCH(hypreGPUKernel_CSRMoveDiagFirst, gDim, bDim, + nrows, A_i, A_j, A_data); - thrust::zip_iterator< thrust::tuple > new_end; + hypre_SyncCudaComputeStream(hypre_handle()); - new_end = HYPRE_THRUST_CALL( copy_if, - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)), - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)) + nnz, - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), - thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j, new_data)), - Int2Unequal() ); + return hypre_error_flag; +} - hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == new_ii + new_nnz ); - } - else - { - new_data = NULL; +HYPRE_Int +hypre_CSRMatrixCheckDiagFirstDevice( hypre_CSRMatrix *A ) +{ + if (hypre_CSRMatrixNumRows(A) != hypre_CSRMatrixNumCols(A)) + { + return 0; + } - thrust::zip_iterator< thrust::tuple > new_end; + sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension(); + sycl::range<1> gDim 
= hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "thread", bDim); - new_end = HYPRE_THRUST_CALL( copy_if, - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)) + nnz, - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), - thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j)), - Int2Unequal() ); + HYPRE_Int *result = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRCheckDiagFirst, gDim, bDim, + hypre_CSRMatrixNumRows(A), + hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), result ); - hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == new_ii + new_nnz ); - } + HYPRE_Int ierr = HYPRE_ONEDPL_CALL( oneapi::dpl::reduce, + result, + result + hypre_CSRMatrixNumRows(A) ); - hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); - hypre_TFree(A_i, HYPRE_MEMORY_DEVICE); - hypre_TFree(A_j, HYPRE_MEMORY_DEVICE); - hypre_TFree(A_data, HYPRE_MEMORY_DEVICE); + hypre_TFree(result, HYPRE_MEMORY_DEVICE); - hypre_CSRMatrixNumNonzeros(A) = new_nnz; - hypre_CSRMatrixI(A) = hypreDevice_CsrRowIndicesToPtrs(nrows, new_nnz, new_ii); - hypre_CSRMatrixJ(A) = new_j; - hypre_CSRMatrixData(A) = new_data; - hypre_TFree(new_ii, HYPRE_MEMORY_DEVICE); + hypre_SyncCudaComputeStream(hypre_handle()); - return hypre_error_flag; + return ierr; } -/* type == 0, sum, - * 1, abs sum (l-1) - * 2, square sum (l-2) +/* For square A, find numerical zeros (absolute values <= tol) on its diagonal and replace with v + * Does NOT assume diagonal is the first entry of each row of A + * In debug mode: + * Returns the number of rows that do not have diag in the pattern + * (i.e., structural zeroes on the diagonal) */ -template -__global__ void -hypreCUDAKernel_CSRRowSum( HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - HYPRE_Complex *aa, - HYPRE_Int *CF_i, - HYPRE_Int *CF_j, - HYPRE_Complex *row_sum, - HYPRE_Complex scal, - HYPRE_Int set) +HYPRE_Int 
+hypre_CSRMatrixFixZeroDiagDevice( hypre_CSRMatrix *A, + HYPRE_Complex v, + HYPRE_Real tol ) { - HYPRE_Int row_i = hypre_cuda_get_grid_warp_id<1,1>(); - - if (row_i >= nrows) - { - return; - } - - HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); - HYPRE_Int p = 0, q = 0; - - if (lane < 2) - { - p = read_only_load(ia + row_i + lane); - } - q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); - p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); - - HYPRE_Complex row_sum_i = 0.0; - - for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) - { - if ( j >= q || (CF_i && CF_j && read_only_load(&CF_i[row_i]) != read_only_load(&CF_j[ja[j]])) ) - { - continue; - } + HYPRE_Int ierr = 0; - HYPRE_Complex aii = aa[j]; + if (hypre_CSRMatrixNumRows(A) != hypre_CSRMatrixNumCols(A)) + { + return ierr; + } - if (type == 0) - { - row_sum_i += aii; - } - else if (type == 1) - { - row_sum_i += fabs(aii); - } - else if (type == 2) - { - row_sum_i += aii * aii; - } - } + sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension(); + sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "warp", bDim); - row_sum_i = warp_reduce_sum(row_sum_i); +#if HYPRE_DEBUG + HYPRE_Int *result = hypre_CTAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); +#else + HYPRE_Int *result = NULL; +#endif - if (lane == 0) - { - if (set) - { - row_sum[row_i] = scal * row_sum_i; - } - else - { - row_sum[row_i] += scal * row_sum_i; - } - } -} + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRMatrixFixZeroDiagDevice, gDim, bDim, + v, hypre_CSRMatrixNumRows(A), + hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), hypre_CSRMatrixData(A), + tol, result ); -void -hypre_CSRMatrixComputeRowSumDevice( hypre_CSRMatrix *A, - HYPRE_Int *CF_i, - HYPRE_Int *CF_j, - HYPRE_Complex *row_sum, - HYPRE_Int type, - HYPRE_Complex scal, - const char *set_or_add) -{ - HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_i = 
hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - dim3 bDim, gDim; +#if HYPRE_DEBUG + ierr = HYPRE_ONEDPL_CALL( oneapi::dpl::reduce, + result, + result + hypre_CSRMatrixNumRows(A) ); - bDim = hypre_GetDefaultDeviceBlockDimension(); - gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); + hypre_TFree(result, HYPRE_MEMORY_DEVICE); +#endif - if (type == 0) - { - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRRowSum<0>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j, - row_sum, scal, set_or_add[0] == 's' ); - } - else if (type == 1) - { - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRRowSum<1>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j, - row_sum, scal, set_or_add[0] == 's' ); - } - else if (type == 2) - { - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRRowSum<2>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j, - row_sum, scal, set_or_add[0] == 's' ); - } + hypre_SyncCudaComputeStream(hypre_handle()); - hypre_SyncCudaComputeStream(hypre_handle()); + return ierr; } -/* type 0: diag - * 1: abs diag - * 2: diag inverse - * 3: diag inverse sqrt - * 4: abs diag inverse sqrt - */ -__global__ void -hypreCUDAKernel_CSRExtractDiag( HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - HYPRE_Complex *aa, - HYPRE_Complex *d, - HYPRE_Int type) +HYPRE_Int +hypre_CSRMatrixReplaceDiagDevice( hypre_CSRMatrix *A, + HYPRE_Complex *new_diag, + HYPRE_Complex v, + HYPRE_Real tol ) { - HYPRE_Int row = hypre_cuda_get_grid_warp_id<1,1>(); - - if (row >= nrows) - { - return; - } + HYPRE_Int ierr = 0; - HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); - HYPRE_Int p = 0, q = 0; - - if (lane < 2) - { - p = read_only_load(ia + row + lane); - } - q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); - p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); + if (hypre_CSRMatrixNumRows(A) != hypre_CSRMatrixNumCols(A)) + { + return ierr; + } - HYPRE_Int has_diag = 0; + sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension(); + sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), 
"warp", bDim); - for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) - { - hypre_int find_diag = j < q && ja[j] == row; +#if HYPRE_DEBUG + HYPRE_Int *result = hypre_CTAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); +#else + HYPRE_Int *result = NULL; +#endif - if (find_diag) - { - if (type == 0) - { - d[row] = aa[j]; - } - else if (type == 1) - { - d[row] = fabs(aa[j]); - } - else if (type == 2) - { - d[row] = 1.0 / aa[j]; - } - else if (type == 3) - { - d[row] = 1.0 / sqrt(aa[j]); - } - else if (type == 4) - { - d[row] = 1.0 / sqrt(fabs(aa[j])); - } - } + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRMatrixReplaceDiagDevice, gDim, bDim, + new_diag, v, hypre_CSRMatrixNumRows(A), + hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), hypre_CSRMatrixData(A), + tol, result ); - if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) - { - has_diag = 1; - break; - } - } +#if HYPRE_DEBUG + ierr = HYPRE_ONEDPL_CALL( oneapi::dpl::reduce, + result, + result + hypre_CSRMatrixNumRows(A) ); - if (!has_diag && lane == 0) - { - d[row] = 0.0; - } + hypre_TFree(result, HYPRE_MEMORY_DEVICE); +#endif + + hypre_SyncCudaComputeStream(hypre_handle()); + + return ierr; +} + +HYPRE_Int +hypre_CSRMatrixRemoveDiagonalDevice(hypre_CSRMatrix *A) +{ + HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); + HYPRE_Int nnz = hypre_CSRMatrixNumNonzeros(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_ii = hypreDevice_CsrRowPtrsToIndices(nrows, nnz, A_i); + HYPRE_Int new_nnz; + HYPRE_Int *new_ii; + HYPRE_Int *new_j; + HYPRE_Complex *new_data; + + auto zipped_begin = oneapi::dpl::make_zip_iterator(A_ii, A_j); + new_nnz = HYPRE_ONEDPL_CALL( std::count_if, + zipped_begin, zipped_begin + nnz, + [](auto t) { return std::get<0>(t) != std::get<1>(t); } ); + + if (new_nnz == nnz) + { + /* no diagonal entries found */ + hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); + return hypre_error_flag; 
+ } + + new_ii = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); + new_j = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); + + if (A_data) + { + new_data = hypre_TAlloc(HYPRE_Complex, new_nnz, HYPRE_MEMORY_DEVICE); + + auto first = oneapi::dpl::make_zip_iterator(A_ii, A_j, A_data); + auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, + first, first + nnz, + oneapi::dpl::make_zip_iterator(A_ii, A_j), + oneapi::dpl::make_zip_iterator(new_ii, new_j, new_data), + [](auto t) { return std::get<0>(t) != std::get<1>(t); } ); + + // todo: fix this + // hypre_assert( std::get<0>(*new_end) == new_ii + new_nnz ); + } + else + { + new_data = NULL; + + auto first = oneapi::dpl::make_zip_iterator(A_ii, A_j); + auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, + first, first + nnz, + first, + oneapi::dpl::make_zip_iterator(new_ii, new_j), + [](auto t) { return std::get<0>(t) != std::get<1>(t); } ); + + // todo: fix this + // hypre_assert( std::get<0>(*new_end) == new_ii + new_nnz ); + } + + hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_i, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_j, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_data, HYPRE_MEMORY_DEVICE); + + hypre_CSRMatrixNumNonzeros(A) = new_nnz; + hypre_CSRMatrixI(A) = hypreDevice_CsrRowIndicesToPtrs(nrows, new_nnz, new_ii); + hypre_CSRMatrixJ(A) = new_j; + hypre_CSRMatrixData(A) = new_data; + hypre_TFree(new_ii, HYPRE_MEMORY_DEVICE); + + return hypre_error_flag; +} + +void +hypre_CSRMatrixComputeRowSumDevice( hypre_CSRMatrix *A, + HYPRE_Int *CF_i, + HYPRE_Int *CF_j, + HYPRE_Complex *row_sum, + HYPRE_Int type, + HYPRE_Complex scal, + const char *set_or_add) +{ + HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + sycl::range<1> bDim, gDim; + + bDim = hypre_GetDefaultDeviceBlockDimension(); + gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); + + if (type == 0) + { + 
HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRRowSum<0>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j, + row_sum, scal, set_or_add[0] == 's' ); + } + else if (type == 1) + { + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRRowSum<1>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j, + row_sum, scal, set_or_add[0] == 's' ); + } + else if (type == 2) + { + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRRowSum<2>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j, + row_sum, scal, set_or_add[0] == 's' ); + } + + hypre_SyncCudaComputeStream(hypre_handle()); } void @@ -1175,101 +2605,107 @@ hypre_CSRMatrixExtractDiagonalDevice( hypre_CSRMatrix *A, HYPRE_Complex *d, HYPRE_Int type) { - HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - dim3 bDim, gDim; + HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + sycl::range<1> bDim, gDim; - bDim = hypre_GetDefaultDeviceBlockDimension(); - gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); + bDim = hypre_GetDefaultDeviceBlockDimension(); + gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRExtractDiag, gDim, bDim, nrows, A_i, A_j, A_data, d, type ); + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRExtractDiag, gDim, bDim, nrows, A_i, A_j, A_data, d, type ); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncCudaComputeStream(hypre_handle()); } /* return C = [A; B] */ hypre_CSRMatrix* hypre_CSRMatrixStack2Device(hypre_CSRMatrix *A, hypre_CSRMatrix *B) { - hypre_assert( hypre_CSRMatrixNumCols(A) == hypre_CSRMatrixNumCols(B) ); - - hypre_CSRMatrix *C = hypre_CSRMatrixCreate( hypre_CSRMatrixNumRows(A) + hypre_CSRMatrixNumRows(B), - hypre_CSRMatrixNumCols(A), - hypre_CSRMatrixNumNonzeros(A) + hypre_CSRMatrixNumNonzeros(B) ); - - HYPRE_Int *C_i = 
hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(C) + 1, HYPRE_MEMORY_DEVICE); - HYPRE_Int *C_j = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumNonzeros(C), HYPRE_MEMORY_DEVICE); - HYPRE_Complex *C_a = hypre_TAlloc(HYPRE_Complex, hypre_CSRMatrixNumNonzeros(C), HYPRE_MEMORY_DEVICE); - - hypre_TMemcpy(C_i, hypre_CSRMatrixI(A), HYPRE_Int, hypre_CSRMatrixNumRows(A) + 1, - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(C_i + hypre_CSRMatrixNumRows(A) + 1, hypre_CSRMatrixI(B) + 1, HYPRE_Int, hypre_CSRMatrixNumRows(B), - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - HYPRE_THRUST_CALL( transform, - C_i + hypre_CSRMatrixNumRows(A) + 1, - C_i + hypre_CSRMatrixNumRows(C) + 1, - thrust::make_constant_iterator(hypre_CSRMatrixNumNonzeros(A)), - C_i + hypre_CSRMatrixNumRows(A) + 1, - thrust::plus() ); - - hypre_TMemcpy(C_j, hypre_CSRMatrixJ(A), HYPRE_Int, hypre_CSRMatrixNumNonzeros(A), - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(C_j + hypre_CSRMatrixNumNonzeros(A), hypre_CSRMatrixJ(B), HYPRE_Int, hypre_CSRMatrixNumNonzeros(B), - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - - hypre_TMemcpy(C_a, hypre_CSRMatrixData(A), HYPRE_Complex, hypre_CSRMatrixNumNonzeros(A), - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(C_a + hypre_CSRMatrixNumNonzeros(A), hypre_CSRMatrixData(B), HYPRE_Complex, hypre_CSRMatrixNumNonzeros(B), - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - - hypre_CSRMatrixI(C) = C_i; - hypre_CSRMatrixJ(C) = C_j; - hypre_CSRMatrixData(C) = C_a; - hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE; - - return C; + hypre_assert( hypre_CSRMatrixNumCols(A) == hypre_CSRMatrixNumCols(B) ); + + hypre_CSRMatrix *C = hypre_CSRMatrixCreate( hypre_CSRMatrixNumRows(A) + hypre_CSRMatrixNumRows(B), + hypre_CSRMatrixNumCols(A), + hypre_CSRMatrixNumNonzeros(A) + hypre_CSRMatrixNumNonzeros(B) ); + + HYPRE_Int *C_i = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(C) + 1, HYPRE_MEMORY_DEVICE); + HYPRE_Int *C_j = hypre_TAlloc(HYPRE_Int, 
hypre_CSRMatrixNumNonzeros(C), HYPRE_MEMORY_DEVICE);
+   HYPRE_Complex *C_a = hypre_TAlloc(HYPRE_Complex, hypre_CSRMatrixNumNonzeros(C), HYPRE_MEMORY_DEVICE);
+
+   hypre_TMemcpy(C_i, hypre_CSRMatrixI(A), HYPRE_Int, hypre_CSRMatrixNumRows(A) + 1,
+                 HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE);
+   hypre_TMemcpy(C_i + hypre_CSRMatrixNumRows(A) + 1, hypre_CSRMatrixI(B) + 1, HYPRE_Int, hypre_CSRMatrixNumRows(B),
+                 HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE);
+
+
+   /* shift the trailing numRows(B) row pointers of C by nnz(A); note queue::fill counts elements, not bytes */
+   HYPRE_Int *const_iterator = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(B), HYPRE_MEMORY_DEVICE);
+   hypre_DeviceDataComputeStream(hypre_handle())->fill(const_iterator, hypre_CSRMatrixNumNonzeros(A), hypre_CSRMatrixNumRows(B)).wait();
+   HYPRE_ONEDPL_CALL( std::transform,
+                      C_i + hypre_CSRMatrixNumRows(A) + 1,
+                      C_i + hypre_CSRMatrixNumRows(C) + 1,
+                      const_iterator, //dpct::make_constant_iterator(hypre_CSRMatrixNumNonzeros(A)),
+                      C_i + hypre_CSRMatrixNumRows(A) + 1,
+                      std::plus<HYPRE_Int>() );
+   hypre_TFree(const_iterator, HYPRE_MEMORY_DEVICE);
+
+
+   hypre_TMemcpy(C_j, hypre_CSRMatrixJ(A), HYPRE_Int, hypre_CSRMatrixNumNonzeros(A),
+                 HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE);
+   hypre_TMemcpy(C_j + hypre_CSRMatrixNumNonzeros(A), hypre_CSRMatrixJ(B), HYPRE_Int, hypre_CSRMatrixNumNonzeros(B),
+                 HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE);
+
+   hypre_TMemcpy(C_a, hypre_CSRMatrixData(A), HYPRE_Complex, hypre_CSRMatrixNumNonzeros(A),
+                 HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE);
+   hypre_TMemcpy(C_a + hypre_CSRMatrixNumNonzeros(A), hypre_CSRMatrixData(B), HYPRE_Complex, hypre_CSRMatrixNumNonzeros(B),
+                 HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE);
+
+   hypre_CSRMatrixI(C) = C_i;
+   hypre_CSRMatrixJ(C) = C_j;
+   hypre_CSRMatrixData(C) = C_a;
+   hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE;
+
+   return C;
 }
 
 /* A = alp * I */
 hypre_CSRMatrix *
 hypre_CSRMatrixIdentityDevice(HYPRE_Int n, HYPRE_Complex alp)
 {
-   hypre_CSRMatrix *A = hypre_CSRMatrixCreate(n, n, n);
+   hypre_CSRMatrix *A = 
hypre_CSRMatrixCreate(n, n, n); - hypre_CSRMatrixInitialize_v2(A, 0, HYPRE_MEMORY_DEVICE); + hypre_CSRMatrixInitialize_v2(A, 0, HYPRE_MEMORY_DEVICE); - HYPRE_THRUST_CALL( sequence, - hypre_CSRMatrixI(A), - hypre_CSRMatrixI(A) + n + 1, - 0 ); + HYPRE_ONEDPL_CALL( dpct::iota, + hypre_CSRMatrixI(A), + hypre_CSRMatrixI(A) + n + 1, + 0 ); - HYPRE_THRUST_CALL( sequence, - hypre_CSRMatrixJ(A), - hypre_CSRMatrixJ(A) + n, - 0 ); + HYPRE_ONEDPL_CALL( dpct::iota, + hypre_CSRMatrixJ(A), + hypre_CSRMatrixJ(A) + n, + 0 ); - HYPRE_THRUST_CALL( fill, - hypre_CSRMatrixData(A), - hypre_CSRMatrixData(A) + n, - alp ); + HYPRE_ONEDPL_CALL( std::fill, + hypre_CSRMatrixData(A), + hypre_CSRMatrixData(A) + n, + alp ); - return A; + return A; } /* this predicate compares first and second element in a tuple in absolute value */ /* first is assumed to be complex, second to be real > 0 */ -struct cabsfirst_greaterthan_second_pred : public thrust::unary_function,bool> +struct cabsfirst_greaterthan_second_pred { - __host__ __device__ - bool operator()(const thrust::tuple& t) const - { - const HYPRE_Complex i = thrust::get<0>(t); - const HYPRE_Real j = thrust::get<1>(t); + bool operator()(const std::tuple& t) const + { + const HYPRE_Complex i = std::get<0>(t); + const HYPRE_Real j = std::get<1>(t); - return hypre_cabs(i) > j; - } + return hypre_cabs(i) > j; + } }; /* drop the entries that are smaller than: @@ -1280,145 +2716,84 @@ hypre_CSRMatrixDropSmallEntriesDevice( hypre_CSRMatrix *A, HYPRE_Real tol, HYPRE_Real *elmt_tols) { - HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); - HYPRE_Int nnz = hypre_CSRMatrixNumNonzeros(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_ii = NULL; - HYPRE_Int new_nnz = 0; - HYPRE_Int *new_ii; - HYPRE_Int *new_j; - HYPRE_Complex *new_data; - - if (elmt_tols == NULL) - { - new_nnz = HYPRE_THRUST_CALL( count_if, + HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); + HYPRE_Int 
nnz = hypre_CSRMatrixNumNonzeros(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_ii = NULL; + HYPRE_Int new_nnz = 0; + HYPRE_Int *new_ii; + HYPRE_Int *new_j; + HYPRE_Complex *new_data; + + if (elmt_tols == NULL) + { + // abb TODO: issue with working here + new_nnz = HYPRE_ONEDPL_CALL( std::count_if, A_data, A_data + nnz, - thrust::not1(less_than(tol)) ); - } - else - { - new_nnz = HYPRE_THRUST_CALL( count_if, - thrust::make_zip_iterator(thrust::make_tuple(A_data, elmt_tols)), - thrust::make_zip_iterator(thrust::make_tuple(A_data, elmt_tols)) + nnz, + std::not_fn(less_than(tol)) ); + } + else + { + auto first = oneapi::dpl::make_zip_iterator(A_data, elmt_tols); + new_nnz = HYPRE_ONEDPL_CALL( std::count_if, + first, + first + nnz, cabsfirst_greaterthan_second_pred() ); - } + } - if (new_nnz == nnz) - { + if (new_nnz == nnz) + { hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); return hypre_error_flag; - } + } - if (!A_ii) - { + if (!A_ii) + { A_ii = hypreDevice_CsrRowPtrsToIndices(nrows, nnz, A_i); - } - new_ii = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); - new_j = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); - new_data = hypre_TAlloc(HYPRE_Complex, new_nnz, HYPRE_MEMORY_DEVICE); - - thrust::zip_iterator< thrust::tuple > new_end; - - if (elmt_tols == NULL) - { - new_end = HYPRE_THRUST_CALL( copy_if, - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)), - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)) + nnz, + } + new_ii = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); + new_j = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); + new_data = hypre_TAlloc(HYPRE_Complex, new_nnz, HYPRE_MEMORY_DEVICE); + + oneapi::dpl::zip_iterator< HYPRE_Int*, HYPRE_Int*, HYPRE_Complex* > new_end; + + if (elmt_tols == NULL) + { + auto first = oneapi::dpl::make_zip_iterator(A_ii, A_j, A_data); + new_end = HYPRE_ONEDPL_CALL( 
dpct::copy_if, + first, first + nnz, A_data, - thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j, new_data)), - thrust::not1(less_than(tol)) ); - } - else - { - new_end = HYPRE_THRUST_CALL( copy_if, - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)), - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)) + nnz, - thrust::make_zip_iterator(thrust::make_tuple(A_data, elmt_tols)), - thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j, new_data)), + oneapi::dpl::make_zip_iterator(new_ii, new_j, new_data), + std::not_fn(less_than(tol)) ); + } + else + { + auto first = oneapi::dpl::make_zip_iterator(A_ii, A_j, A_data); + new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, + first, first + nnz, + oneapi::dpl::make_zip_iterator(A_data, elmt_tols), + oneapi::dpl::make_zip_iterator(new_ii, new_j, new_data), cabsfirst_greaterthan_second_pred() ); - } - - hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == new_ii + new_nnz ); - - hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); - hypre_TFree(A_i, HYPRE_MEMORY_DEVICE); - hypre_TFree(A_j, HYPRE_MEMORY_DEVICE); - hypre_TFree(A_data, HYPRE_MEMORY_DEVICE); - - hypre_CSRMatrixNumNonzeros(A) = new_nnz; - hypre_CSRMatrixI(A) = hypreDevice_CsrRowIndicesToPtrs(nrows, new_nnz, new_ii); - hypre_CSRMatrixJ(A) = new_j; - hypre_CSRMatrixData(A) = new_data; - hypre_TFree(new_ii, HYPRE_MEMORY_DEVICE); - - return hypre_error_flag; -} + } -/* mark is of size nA - * diag_option: 1: special treatment for diag entries, mark as -2 - */ -__global__ void -hypreCUDAKernel_CSRMatrixIntersectPattern(HYPRE_Int n, - HYPRE_Int nA, - HYPRE_Int *rowid, - HYPRE_Int *colid, - HYPRE_Int *idx, - HYPRE_Int *mark, - HYPRE_Int diag_option) -{ - HYPRE_Int i = hypre_cuda_get_grid_thread_id<1,1>(); + // todo: fix this + // hypre_assert( thrust::get<0>(*new_end) == new_ii + new_nnz ); - if (i >= n) - { - return; - } + hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_i, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_j, 
HYPRE_MEMORY_DEVICE); + hypre_TFree(A_data, HYPRE_MEMORY_DEVICE); - HYPRE_Int r1 = read_only_load(&rowid[i]); - HYPRE_Int c1 = read_only_load(&colid[i]); - HYPRE_Int j = read_only_load(&idx[i]); + hypre_CSRMatrixNumNonzeros(A) = new_nnz; + hypre_CSRMatrixI(A) = hypreDevice_CsrRowIndicesToPtrs(nrows, new_nnz, new_ii); + hypre_CSRMatrixJ(A) = new_j; + hypre_CSRMatrixData(A) = new_data; + hypre_TFree(new_ii, HYPRE_MEMORY_DEVICE); - if (0 == diag_option) - { - if (j < nA) - { - HYPRE_Int r2 = i < n - 1 ? read_only_load(&rowid[i + 1]) : -1; - HYPRE_Int c2 = i < n - 1 ? read_only_load(&colid[i + 1]) : -1; - if (r1 == r2 && c1 == c2) - { - mark[j] = c1; - } - else - { - mark[j] = -1; - } - } - } - else if (1 == diag_option) - { - if (j < nA) - { - if (r1 == c1) - { - mark[j] = -2; - } - else - { - HYPRE_Int r2 = i < n - 1 ? read_only_load(&rowid[i + 1]) : -1; - HYPRE_Int c2 = i < n - 1 ? read_only_load(&colid[i + 1]) : -1; - if (r1 == r2 && c1 == c2) - { - mark[j] = c1; - } - else - { - mark[j] = -1; - } - } - } - } + return hypre_error_flag; } /* markA: array of size nnz(A), for pattern of (A and B), markA is the column indices as in A_J @@ -1430,41 +2805,42 @@ hypre_CSRMatrixIntersectPattern(hypre_CSRMatrix *A, HYPRE_Int *markA, HYPRE_Int diag_opt) { - HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); - HYPRE_Int nnzA = hypre_CSRMatrixNumNonzeros(A); - HYPRE_Int nnzB = hypre_CSRMatrixNumNonzeros(B); + HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); + HYPRE_Int nnzA = hypre_CSRMatrixNumNonzeros(A); + HYPRE_Int nnzB = hypre_CSRMatrixNumNonzeros(B); - HYPRE_Int *Cii = hypre_TAlloc(HYPRE_Int, nnzA + nnzB, HYPRE_MEMORY_DEVICE); - HYPRE_Int *Cjj = hypre_TAlloc(HYPRE_Int, nnzA + nnzB, HYPRE_MEMORY_DEVICE); - HYPRE_Int *idx = hypre_TAlloc(HYPRE_Int, nnzA + nnzB, HYPRE_MEMORY_DEVICE); + HYPRE_Int *Cii = hypre_TAlloc(HYPRE_Int, nnzA + nnzB, HYPRE_MEMORY_DEVICE); + HYPRE_Int *Cjj = hypre_TAlloc(HYPRE_Int, nnzA + nnzB, HYPRE_MEMORY_DEVICE); + HYPRE_Int *idx = hypre_TAlloc(HYPRE_Int, nnzA + 
nnzB, HYPRE_MEMORY_DEVICE); - hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnzA, hypre_CSRMatrixI(A), Cii); - hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnzB, hypre_CSRMatrixI(B), Cii + nnzA); - hypre_TMemcpy(Cjj, hypre_CSRMatrixJ(A), HYPRE_Int, nnzA, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(Cjj + nnzA, hypre_CSRMatrixJ(B), HYPRE_Int, nnzB, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - HYPRE_THRUST_CALL( sequence, idx, idx + nnzA + nnzB ); + hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnzA, hypre_CSRMatrixI(A), Cii); + hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnzB, hypre_CSRMatrixI(B), Cii + nnzA); + hypre_TMemcpy(Cjj, hypre_CSRMatrixJ(A), HYPRE_Int, nnzA, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(Cjj + nnzA, hypre_CSRMatrixJ(B), HYPRE_Int, nnzB, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + HYPRE_ONEDPL_CALL( dpct::iota, idx, idx + nnzA + nnzB, 0 ); - HYPRE_THRUST_CALL( stable_sort_by_key, - thrust::make_zip_iterator(thrust::make_tuple(Cii, Cjj)), - thrust::make_zip_iterator(thrust::make_tuple(Cii, Cjj)) + nnzA + nnzB, - idx ); + auto keys_begin = oneapi::dpl::make_zip_iterator(Cii, Cjj); + auto zipped_begin = oneapi::dpl::make_zip_iterator(keys_begin, idx); + HYPRE_ONEDPL_CALL( std::stable_sort, zipped_begin, zipped_begin + nnzA + nnzB, + [](auto lhs, auto rhs) { return std::get<0>(lhs) < std::get<0>(rhs); } ); - hypre_TMemcpy(markA, hypre_CSRMatrixJ(A), HYPRE_Int, nnzA, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(markA, hypre_CSRMatrixJ(A), HYPRE_Int, nnzA, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); - dim3 gDim = hypre_GetDefaultDeviceGridDimension(nnzA + nnzB, "thread", bDim); + sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension(); + sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(nnzA + nnzB, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRMatrixIntersectPattern, gDim, bDim, - nnzA + nnzB, nnzA, Cii, Cjj, idx, markA, diag_opt ); + 
HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRMatrixIntersectPattern, gDim, bDim, + nnzA + nnzB, nnzA, Cii, Cjj, idx, markA, diag_opt ); - hypre_TFree(Cii, HYPRE_MEMORY_DEVICE); - hypre_TFree(Cjj, HYPRE_MEMORY_DEVICE); - hypre_TFree(idx, HYPRE_MEMORY_DEVICE); + hypre_TFree(Cii, HYPRE_MEMORY_DEVICE); + hypre_TFree(Cjj, HYPRE_MEMORY_DEVICE); + hypre_TFree(idx, HYPRE_MEMORY_DEVICE); - return hypre_error_flag; + return hypre_error_flag; } -#endif /* HYPRE_USING_CUDA || defined(HYPRE_USING_HIP) */ +#endif /* HYPRE_USING_SYCL */ + #if defined(HYPRE_USING_GPU) diff --git a/src/seq_mv/csr_matrix.c b/src/seq_mv/csr_matrix.c index f387de02e2..7ea19c3b56 100644 --- a/src/seq_mv/csr_matrix.c +++ b/src/seq_mv/csr_matrix.c @@ -44,7 +44,7 @@ hypre_CSRMatrixCreate( HYPRE_Int num_rows, /* set defaults */ hypre_CSRMatrixOwnsData(matrix) = 1; -#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) +#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) || defined(HYPRE_USING_ONEMKLSPARSE) hypre_CSRMatrixSortedJ(matrix) = NULL; hypre_CSRMatrixSortedData(matrix) = NULL; hypre_CSRMatrixCsrsvData(matrix) = NULL; diff --git a/src/seq_mv/csr_spgemm_device_attempt.c b/src/seq_mv/csr_spgemm_device_attempt.c index d6b23a99d8..8fd268de09 100644 --- a/src/seq_mv/csr_spgemm_device_attempt.c +++ b/src/seq_mv/csr_spgemm_device_attempt.c @@ -501,7 +501,7 @@ hypre_spgemm_numerical_with_rowest( HYPRE_Int m, // for cases where one WARP works on a row dim3 gDim( (m + bDim.z - 1) / bDim.z ); - HYPRE_CUDA_LAUNCH ( (hypre_spgemm_attempt), + HYPRE_GPU_LAUNCH ( (hypre_spgemm_attempt), gDim, bDim, /* shmem_size, */ m, NULL, d_ia, d_ja, d_a, d_ib, d_jb, d_b, d_js, d_as, d_ghash1_i, d_ghash1_j, d_ghash1_a, d_rc, d_rf ); @@ -537,7 +537,7 @@ hypre_spgemm_numerical_with_rowest( HYPRE_Int m, // for cases where one WARP works on a row dim3 gDim( (num_failed_rows + bDim.z - 1) / bDim.z ); - HYPRE_CUDA_LAUNCH ( (hypre_spgemm_attempt), + HYPRE_GPU_LAUNCH ( (hypre_spgemm_attempt), gDim, bDim, /* 
shmem_size, */ num_failed_rows, rf_ind, d_ia, d_ja, d_a, d_ib, d_jb, d_b, d_js, d_as, d_ghash2_i, d_ghash2_j, d_ghash2_a, d_rc, NULL ); @@ -557,7 +557,7 @@ hypre_spgemm_numerical_with_rowest( HYPRE_Int m, // for cases where one WARP works on a row dim3 gDim( (m + bDim.z - 1) / bDim.z ); - HYPRE_CUDA_LAUNCH( (hypre_spgemm_copy_from_hash_into_C), gDim, bDim, + HYPRE_GPU_LAUNCH( (hypre_spgemm_copy_from_hash_into_C), gDim, bDim, m, d_rf, d_js, d_as, d_ghash1_i, d_ghash1_j, d_ghash1_a, diff --git a/src/seq_mv/csr_spgemm_device_confident.c b/src/seq_mv/csr_spgemm_device_confident.c index 86633323bc..a9b5a494df 100644 --- a/src/seq_mv/csr_spgemm_device_confident.c +++ b/src/seq_mv/csr_spgemm_device_confident.c @@ -461,7 +461,7 @@ hypre_spgemm_numerical_with_rownnz( HYPRE_Int m, hypre_create_ija(m, d_rc, d_ic, &d_jc, &d_c, &nnzC_nume); - HYPRE_CUDA_LAUNCH ( (hypre_spgemm_numeric), + HYPRE_GPU_LAUNCH ( (hypre_spgemm_numeric), gDim, bDim, /* shmem_size, */ m, /* k, n, */ d_ia, d_ja, d_a, d_ib, d_jb, d_b, d_ic, d_jc, d_c, d_rc, d_ghash_i, d_ghash_j, d_ghash_a ); @@ -486,7 +486,7 @@ hypre_spgemm_numerical_with_rownnz( HYPRE_Int m, /* copy to the final C */ dim3 gDim( (m + bDim.z - 1) / bDim.z ); - HYPRE_CUDA_LAUNCH( (hypre_spgemm_copy_from_Cext_into_C), gDim, bDim, + HYPRE_GPU_LAUNCH( (hypre_spgemm_copy_from_Cext_into_C), gDim, bDim, m, d_ic, d_jc, d_c, d_ic_new, d_jc_new, d_c_new ); hypre_TFree(d_ic, HYPRE_MEMORY_DEVICE); diff --git a/src/seq_mv/csr_spgemm_device_rowbound.c b/src/seq_mv/csr_spgemm_device_rowbound.c index d3dce3e62c..c2703eb00e 100644 --- a/src/seq_mv/csr_spgemm_device_rowbound.c +++ b/src/seq_mv/csr_spgemm_device_rowbound.c @@ -311,17 +311,17 @@ hypre_spgemm_rownnz_attempt(HYPRE_Int m, * ---------------------------------------------------------------------------*/ if (hash_type == 'L') { - HYPRE_CUDA_LAUNCH( (hypre_spgemm_symbolic), gDim, bDim, + HYPRE_GPU_LAUNCH( (hypre_spgemm_symbolic), gDim, bDim, m, rf_ind, /*k, n,*/ d_ia, d_ja, d_ib, d_jb, d_ghash_i, 
d_ghash_j, d_rc, d_rf ); } else if (hash_type == 'Q') { - HYPRE_CUDA_LAUNCH( (hypre_spgemm_symbolic), gDim, bDim, + HYPRE_GPU_LAUNCH( (hypre_spgemm_symbolic), gDim, bDim, m, rf_ind, /*k, n,*/ d_ia, d_ja, d_ib, d_jb, d_ghash_i, d_ghash_j, d_rc, d_rf ); } else if (hash_type == 'D') { - HYPRE_CUDA_LAUNCH( (hypre_spgemm_symbolic), gDim, bDim, + HYPRE_GPU_LAUNCH( (hypre_spgemm_symbolic), gDim, bDim, m, rf_ind, /*k, n,*/ d_ia, d_ja, d_ib, d_jb, d_ghash_i, d_ghash_j, d_rc, d_rf ); } else diff --git a/src/seq_mv/csr_spgemm_device_rowest.c b/src/seq_mv/csr_spgemm_device_rowest.c index f058d1d7ed..d94744bcfc 100644 --- a/src/seq_mv/csr_spgemm_device_rowest.c +++ b/src/seq_mv/csr_spgemm_device_rowest.c @@ -284,11 +284,11 @@ void csr_spmm_rownnz_cohen(HYPRE_Int M, HYPRE_Int K, HYPRE_Int N, HYPRE_Int *d_i dim3 gDim( (nsamples * N + bDim.z * HYPRE_WARP_SIZE - 1) / (bDim.z * HYPRE_WARP_SIZE) ); - HYPRE_CUDA_LAUNCH( expdistfromuniform, gDim, bDim, nsamples * N, d_V1 ); + HYPRE_GPU_LAUNCH( expdistfromuniform, gDim, bDim, nsamples * N, d_V1 ); /* step-1: layer 3-2 */ gDim.x = (K + bDim.z - 1) / bDim.z; - HYPRE_CUDA_LAUNCH( (cohen_rowest_kernel), gDim, bDim, + HYPRE_GPU_LAUNCH( (cohen_rowest_kernel), gDim, bDim, K, d_ib, d_jb, d_V1, d_V2, NULL, nsamples, NULL, NULL, -1.0); //hypre_TFree(d_V1, HYPRE_MEMORY_DEVICE); @@ -297,7 +297,7 @@ void csr_spmm_rownnz_cohen(HYPRE_Int M, HYPRE_Int K, HYPRE_Int N, HYPRE_Int *d_i d_V3 = (T*) d_rc; gDim.x = (M + bDim.z - 1) / bDim.z; - HYPRE_CUDA_LAUNCH( (cohen_rowest_kernel), gDim, bDim, + HYPRE_GPU_LAUNCH( (cohen_rowest_kernel), gDim, bDim, M, d_ia, d_ja, d_V2, d_V3, d_rc, nsamples, d_low, d_upp, mult_factor); /* done */ @@ -331,13 +331,13 @@ hypreDevice_CSRSpGemmRownnzEstimate(HYPRE_Int m, HYPRE_Int k, HYPRE_Int n, if (row_est_mtd == 1) { /* naive overestimate */ - HYPRE_CUDA_LAUNCH( (csr_spmm_rownnz_naive<'U', num_warps_per_block>), gDim, bDim, + HYPRE_GPU_LAUNCH( (csr_spmm_rownnz_naive<'U', num_warps_per_block>), gDim, bDim, m, /*k,*/ n, d_ia, 
d_ja, d_ib, d_jb, NULL, d_rc ); } else if (row_est_mtd == 2) { /* naive underestimate */ - HYPRE_CUDA_LAUNCH( (csr_spmm_rownnz_naive<'L', num_warps_per_block>), gDim, bDim, + HYPRE_GPU_LAUNCH( (csr_spmm_rownnz_naive<'L', num_warps_per_block>), gDim, bDim, m, /*k,*/ n, d_ia, d_ja, d_ib, d_jb, d_rc, NULL ); } else if (row_est_mtd == 3) @@ -354,7 +354,7 @@ hypreDevice_CSRSpGemmRownnzEstimate(HYPRE_Int m, HYPRE_Int k, HYPRE_Int n, HYPRE_Int *d_low = d_low_upp; HYPRE_Int *d_upp = d_low_upp + m; - HYPRE_CUDA_LAUNCH( (csr_spmm_rownnz_naive<'B', num_warps_per_block>), gDim, bDim, + HYPRE_GPU_LAUNCH( (csr_spmm_rownnz_naive<'B', num_warps_per_block>), gDim, bDim, m, /*k,*/ n, d_ia, d_ja, d_ib, d_jb, d_low, d_upp ); /* Cohen's algorithm, stochastic approach */ diff --git a/src/seq_mv/csr_spgemm_device_util.c b/src/seq_mv/csr_spgemm_device_util.c index 9514be1f1a..a3cf7cd951 100644 --- a/src/seq_mv/csr_spgemm_device_util.c +++ b/src/seq_mv/csr_spgemm_device_util.c @@ -103,14 +103,14 @@ hypre_SpGemmCreateGlobalHashTable( HYPRE_Int num_rows, /* number of { ghash_i = hypre_TAlloc(HYPRE_Int, num_ghash + 1, HYPRE_MEMORY_DEVICE); dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_ghash, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypre_SpGemmGhashSize1, gDim, bDim, + HYPRE_GPU_LAUNCH( hypre_SpGemmGhashSize1, gDim, bDim, num_rows, row_id, num_ghash, row_sizes, ghash_i, SHMEM_HASH_SIZE ); } else if (type == 2) { ghash_i = hypre_CTAlloc(HYPRE_Int, num_ghash + 1, HYPRE_MEMORY_DEVICE); dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_rows, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypre_SpGemmGhashSize2, gDim, bDim, + HYPRE_GPU_LAUNCH( hypre_SpGemmGhashSize2, gDim, bDim, num_rows, row_id, num_ghash, row_sizes, ghash_i, SHMEM_HASH_SIZE ); } diff --git a/src/seq_mv/csr_spmv_device.c b/src/seq_mv/csr_spmv_device.c index ba0a185761..bfe691669e 100644 --- a/src/seq_mv/csr_spmv_device.c +++ b/src/seq_mv/csr_spmv_device.c @@ -170,7 +170,7 @@ hypreDevice_CSRMatrixMatvec( HYPRE_Int nrows, const 
HYPRE_Int group_size = 32; const HYPRE_Int num_groups_per_block = SPMV_BLOCKDIM / group_size; const dim3 gDim((nrows + num_groups_per_block - 1) / num_groups_per_block); - HYPRE_CUDA_LAUNCH( (hypre_csr_v_k_shuffle), gDim, bDim, + HYPRE_GPU_LAUNCH( (hypre_csr_v_k_shuffle), gDim, bDim, nrows, alpha, d_ia, d_ja, d_a, d_x, beta, d_y ); } else if (rownnz >= 32) @@ -178,7 +178,7 @@ hypreDevice_CSRMatrixMatvec( HYPRE_Int nrows, const HYPRE_Int group_size = 16; const HYPRE_Int num_groups_per_block = SPMV_BLOCKDIM / group_size; const dim3 gDim((nrows + num_groups_per_block - 1) / num_groups_per_block); - HYPRE_CUDA_LAUNCH( (hypre_csr_v_k_shuffle), gDim, bDim, + HYPRE_GPU_LAUNCH( (hypre_csr_v_k_shuffle), gDim, bDim, nrows, alpha, d_ia, d_ja, d_a, d_x, beta, d_y ); } else if (rownnz >= 16) @@ -186,7 +186,7 @@ hypreDevice_CSRMatrixMatvec( HYPRE_Int nrows, const HYPRE_Int group_size = 8; const HYPRE_Int num_groups_per_block = SPMV_BLOCKDIM / group_size; const dim3 gDim((nrows + num_groups_per_block - 1) / num_groups_per_block); - HYPRE_CUDA_LAUNCH( (hypre_csr_v_k_shuffle), gDim, bDim, + HYPRE_GPU_LAUNCH( (hypre_csr_v_k_shuffle), gDim, bDim, nrows, alpha, d_ia, d_ja, d_a, d_x, beta, d_y ); } else if (rownnz >= 8) @@ -194,7 +194,7 @@ hypreDevice_CSRMatrixMatvec( HYPRE_Int nrows, const HYPRE_Int group_size = 4; const HYPRE_Int num_groups_per_block = SPMV_BLOCKDIM / group_size; const dim3 gDim((nrows + num_groups_per_block - 1) / num_groups_per_block); - HYPRE_CUDA_LAUNCH( (hypre_csr_v_k_shuffle), gDim, bDim, + HYPRE_GPU_LAUNCH( (hypre_csr_v_k_shuffle), gDim, bDim, nrows, alpha, d_ia, d_ja, d_a, d_x, beta, d_y ); } else @@ -202,7 +202,7 @@ hypreDevice_CSRMatrixMatvec( HYPRE_Int nrows, const HYPRE_Int group_size = 4; const HYPRE_Int num_groups_per_block = SPMV_BLOCKDIM / group_size; const dim3 gDim((nrows + num_groups_per_block - 1) / num_groups_per_block); - HYPRE_CUDA_LAUNCH( (hypre_csr_v_k_shuffle), gDim, bDim, + HYPRE_GPU_LAUNCH( (hypre_csr_v_k_shuffle), gDim, bDim, nrows, 
alpha, d_ia, d_ja, d_a, d_x, beta, d_y ); } diff --git a/src/struct_mv/_hypre_struct_mv.hpp b/src/struct_mv/_hypre_struct_mv.hpp index 5224d03cab..72edfc6527 100644 --- a/src/struct_mv/_hypre_struct_mv.hpp +++ b/src/struct_mv/_hypre_struct_mv.hpp @@ -797,7 +797,7 @@ BoxLoopforall( HYPRE_Int length, const dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); const dim3 gDim = hypre_GetDefaultDeviceGridDimension(length, "thread", bDim); - HYPRE_CUDA_LAUNCH( forall_kernel, gDim, bDim, loop_body, length ); + HYPRE_GPU_LAUNCH( forall_kernel, gDim, bDim, loop_body, length ); } } @@ -858,7 +858,7 @@ ReductionBoxLoopforall( HYPRE_Int length, hypre_printf("length= %d, blocksize = %d, gridsize = %d\n", length, bDim.x, gDim.x); */ - HYPRE_CUDA_LAUNCH( reductionforall_kernel, gDim, bDim, length, reducer, loop_body ); + HYPRE_GPU_LAUNCH( reductionforall_kernel, gDim, bDim, length, reducer, loop_body ); } } diff --git a/src/struct_mv/boxloop_cuda.h b/src/struct_mv/boxloop_cuda.h index cd477fe2eb..e78234c6d4 100644 --- a/src/struct_mv/boxloop_cuda.h +++ b/src/struct_mv/boxloop_cuda.h @@ -73,7 +73,7 @@ BoxLoopforall( HYPRE_Int length, const dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); const dim3 gDim = hypre_GetDefaultDeviceGridDimension(length, "thread", bDim); - HYPRE_CUDA_LAUNCH( forall_kernel, gDim, bDim, loop_body, length ); + HYPRE_GPU_LAUNCH( forall_kernel, gDim, bDim, loop_body, length ); } } @@ -134,7 +134,7 @@ ReductionBoxLoopforall( HYPRE_Int length, hypre_printf("length= %d, blocksize = %d, gridsize = %d\n", length, bDim.x, gDim.x); */ - HYPRE_CUDA_LAUNCH( reductionforall_kernel, gDim, bDim, length, reducer, loop_body ); + HYPRE_GPU_LAUNCH( reductionforall_kernel, gDim, bDim, length, reducer, loop_body ); } } diff --git a/src/utilities/_hypre_utilities.hpp b/src/utilities/_hypre_utilities.hpp index b8addbad0b..8c1ff91322 100644 --- a/src/utilities/_hypre_utilities.hpp +++ b/src/utilities/_hypre_utilities.hpp @@ -449,7 +449,7 @@ using namespace 
thrust::placeholders; #define GPU_LAUNCH_SYNC #endif // defined(HYPRE_DEBUG) -#define HYPRE_CUDA_LAUNCH2(kernel_name, gridsize, blocksize, shmem_size, ...) \ +#define HYPRE_GPU_LAUNCH2(kernel_name, gridsize, blocksize, shmem_size, ...) \ { \ if ( gridsize.x == 0 || gridsize.y == 0 || gridsize.z == 0 || \ blocksize.x == 0 || blocksize.y == 0 || blocksize.z == 0 ) \ @@ -460,22 +460,22 @@ using namespace thrust::placeholders; } \ else \ { \ - (kernel_name) <<< (gridsize), (blocksize), shmem_size, hypre_HandleCudaComputeStream(hypre_handle()) >>> (__VA_ARGS__); \ + (kernel_name) <<< (gridsize), (blocksize), shmem_size, hypre_DeviceDataComputeStream(hypre_handle()) >>> (__VA_ARGS__); \ GPU_LAUNCH_SYNC; \ } \ } -#define HYPRE_CUDA_LAUNCH(kernel_name, gridsize, blocksize, ...) HYPRE_CUDA_LAUNCH2(kernel_name, gridsize, blocksize, 0, __VA_ARGS__) +#define HYPRE_GPU_LAUNCH(kernel_name, gridsize, blocksize, ...) HYPRE_GPU_LAUNCH2(kernel_name, gridsize, blocksize, 0, __VA_ARGS__) /* RL: TODO Want macro HYPRE_THRUST_CALL to return value but I don't know how to do it right * The following one works OK for now */ #if defined(HYPRE_USING_CUDA) #define HYPRE_THRUST_CALL(func_name, ...) \ - thrust::func_name(thrust::cuda::par(hypre_HandleDeviceAllocator(hypre_handle())).on(hypre_HandleCudaComputeStream(hypre_handle())), __VA_ARGS__); + thrust::func_name(thrust::cuda::par(hypre_HandleDeviceAllocator(hypre_handle())).on(hypre_DeviceDataComputeStream(hypre_handle())), __VA_ARGS__); #elif defined(HYPRE_USING_HIP) #define HYPRE_THRUST_CALL(func_name, ...) 
\ - thrust::func_name(thrust::hip::par(hypre_HandleDeviceAllocator(hypre_handle())).on(hypre_HandleCudaComputeStream(hypre_handle())), __VA_ARGS__); + thrust::func_name(thrust::hip::par(hypre_HandleDeviceAllocator(hypre_handle())).on(hypre_DeviceDataComputeStream(hypre_handle())), __VA_ARGS__); #endif /* return the number of threads in block */ @@ -1040,6 +1040,451 @@ void hypre_DeviceDataCubCachingAllocatorDestroy(hypre_DeviceData *data); #endif // #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) +//////////////////////////////////////////////////////////////////////////////////////////////// + +#if defined(HYPRE_USING_SYCL) + +#define PSTL_USE_PARALLEL_POLICIES 0 // for libstdc++ 9 +#define _GLIBCXX_USE_TBB_PAR_BACKEND 0 // for libstdc++ 10 + +// #include +// #include +// #include +// #include + +//#include // dpct::remove_if, remove_copy_if, copy_if + +// #include +// #include +// #include +// #include + +#define __forceinline__ __inline__ __attribute__((always_inline)) + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + * macro for launching SYCL kernels, SYCL, oneDPL, oneMKL calls + * NOTE: IN HYPRE'S DEFAULT STREAM + * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + */ + +#if defined(HYPRE_DEBUG) +#if defined(HYPRE_USING_CUDA) +#define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); } +#endif +#else // #if defined(HYPRE_DEBUG) +#define GPU_LAUNCH_SYNC +#endif // defined(HYPRE_DEBUG) + +#define HYPRE_GPU_LAUNCH(kernel_name, gridsize, blocksize, ...) 
\
+{ \
+   if ( gridsize[0] == 0 || blocksize[0] == 0 ) \
+   { \
+      hypre_printf("Error %s %d: Invalid SYCL 1D launch parameters grid/block (%d) (%d)\n", \
+                   __FILE__, __LINE__, \
+                   gridsize[0], blocksize[0]); \
+      assert(0); exit(1); \
+   } \
+   else \
+   { \
+      hypre_DeviceDataComputeStream(hypre_handle())->parallel_for(sycl::nd_range<1>(gridsize*blocksize, blocksize), \
+         [=] (sycl::nd_item<1> item) [[intel::reqd_sub_group_size(HYPRE_WARP_SIZE)]] { \
+            (kernel_name)(item, __VA_ARGS__); \
+      }); \
+   } \
+}
+
+/* RL: TODO Want macro HYPRE_ONEDPL_CALL to return value but I don't know how to do it right
+ * The following one works OK for now */
+
+#define HYPRE_ONEDPL_CALL(func_name, ...) \
+  func_name(oneapi::dpl::execution::make_device_policy(*hypre_DeviceDataComputeStream(hypre_handle())), __VA_ARGS__);
+
+// /* return the number of threads in block */
+// template 
+// static __forceinline__
+// hypre_int hypre_gpu_get_num_threads()
+// {
+//    switch (dim)
+//    {
+//       case 1:
+//          return (blockDim.x);
+//       case 2:
+//          return (blockDim.x * blockDim.y);
+//       case 3:
+//          return (blockDim.x * blockDim.y * blockDim.z);
+//    }
+
+//    return -1;
+// }
+
+/* return the number of (sub_groups) warps in (work-group) block */
+template <hypre_int dim>
+static __forceinline__
+hypre_int hypre_gpu_get_num_warps(sycl::nd_item<1>& item)
+{
+   return item.get_sub_group().get_group_range().get(0);
+}
+
+/* return the thread lane id in warp */
+template <hypre_int dim>
+static __forceinline__
+hypre_int hypre_gpu_get_lane_id(sycl::nd_item<1>& item)
+{
+   return item.get_local_linear_id() & (HYPRE_WARP_SIZE-1);
+}
+
+// /* return the number of threads in grid */
+// template 
+// static __forceinline__
+// hypre_int hypre_gpu_get_grid_num_threads()
+// {
+//    return hypre_gpu_get_num_blocks() * hypre_gpu_get_num_threads();
+// }
+
+/* return the flattened work-item/thread id in global work space */
+template <hypre_int dim>
+static __forceinline__
+hypre_int hypre_gpu_get_grid_thread_id(sycl::nd_item<1>& item)
+{
+   return item.get_global_id(0);
+}
+
+// /* return the number of warps in grid */ +// template +// static __forceinline__ +// hypre_int hypre_gpu_get_grid_num_warps() +// { +// return hypre_gpu_get_num_blocks() * hypre_gpu_get_num_warps(); +// } + +/* return the flattened warp id in grid */ +template +static __forceinline__ +hypre_int hypre_gpu_get_grid_warp_id(sycl::nd_item<1>& item) +{ + return item.get_group(0) * hypre_gpu_get_num_warps(item) + + item.get_sub_group().get_group_linear_id(); +} + +// #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 +// static __forceinline__ +// hypre_double atomicAdd(hypre_double* address, hypre_double val) +// { +// hypre_ulonglongint* address_as_ull = (hypre_ulonglongint*) address; +// hypre_ulonglongint old = *address_as_ull, assumed; + +// do { +// assumed = old; +// old = atomicCAS(address_as_ull, assumed, +// __double_as_longlong(val + +// __longlong_as_double(assumed))); + +// // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) +// } while (assumed != old); + +// return __longlong_as_double(old); +// } +// #endif + +template +static __forceinline__ +T read_only_load( const T *ptr ) +{ + return *ptr; +} + +// /* exclusive prefix scan */ +// template +// static __forceinline__ +// T warp_prefix_sum(hypre_int lane_id, T in, T &all_sum) +// { +// #pragma unroll +// for (hypre_int d = 2; d <=HYPRE_WARP_SIZE; d <<= 1) +// { +// T t = __shfl_up_sync(HYPRE_WARP_FULL_MASK, in, d >> 1); +// if ( (lane_id & (d - 1)) == (d - 1) ) +// { +// in += t; +// } +// } + +// all_sum = __shfl_sync(HYPRE_WARP_FULL_MASK, in, HYPRE_WARP_SIZE-1); + +// if (lane_id == HYPRE_WARP_SIZE-1) +// { +// in = 0; +// } + +// #pragma unroll +// for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1) +// { +// T t = __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, d); + +// if ( (lane_id & (d - 1)) == (d - 1)) +// { +// if ( (lane_id & ((d << 1) - 1)) == ((d << 1) - 1) ) +// { +// in += t; +// } +// else +// { +// in = t; +// } +// } +// } +// return in; +// } + +template 
+static __forceinline__ +T warp_reduce_sum(T in, sycl::nd_item<1>& item) +{ + sycl::ext::oneapi::sub_group SG = item.get_sub_group(); + //sycl::ext::oneapi::reduce(SG, in, std::plus()); +#pragma unroll + for (hypre_int d = SG.get_local_range().get(0)/2; d > 0; d >>= 1) + { + in += SG.shuffle_down(in, d); + } + return in; +} + +// template +// static __forceinline__ +// T warp_allreduce_sum(T in) +// { +// #pragma unroll +// for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1) +// { +// in += __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, d); +// } +// return in; +// } + +// template +// static __forceinline__ +// T warp_reduce_max(T in) +// { +// #pragma unroll +// for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1) +// { +// in = max(in, __shfl_down_sync(HYPRE_WARP_FULL_MASK, in, d)); +// } +// return in; +// } + +// template +// static __forceinline__ +// T warp_allreduce_max(T in) +// { +// #pragma unroll +// for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1) +// { +// in = max(in, __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, d)); +// } +// return in; +// } + +// template +// static __forceinline__ +// T warp_reduce_min(T in) +// { +// #pragma unroll +// for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1) +// { +// in = min(in, __shfl_down_sync(HYPRE_WARP_FULL_MASK, in, d)); +// } +// return in; +// } + +// template +// static __forceinline__ +// T warp_allreduce_min(T in) +// { +// #pragma unroll +// for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1) +// { +// in = min(in, __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, d)); +// } +// return in; +// } + +// static __forceinline__ +// hypre_int next_power_of_2(hypre_int n) +// { +// if (n <= 0) +// { +// return 0; +// } + +// /* if n is power of 2, return itself */ +// if ( (n & (n - 1)) == 0 ) +// { +// return n; +// } + +// n |= (n >> 1); +// n |= (n >> 2); +// n |= (n >> 4); +// n |= (n >> 8); +// n |= (n >> 16); +// n ^= (n >> 1); +// n = (n << 1); + +// return n; +// } + +// template +// struct absolute_value 
: public thrust::unary_function +// { +// T operator()(const T &x) const +// { +// return x < T(0) ? -x : x; +// } +// }; + +// template +// struct TupleComp2 +// { +// typedef thrust::tuple Tuple; + +// bool operator()(const Tuple& t1, const Tuple& t2) +// { +// if (thrust::get<0>(t1) < thrust::get<0>(t2)) +// { +// return true; +// } +// if (thrust::get<0>(t1) > thrust::get<0>(t2)) +// { +// return false; +// } +// return hypre_abs(thrust::get<1>(t1)) > hypre_abs(thrust::get<1>(t2)); +// } +// }; + +// template +// struct TupleComp3 +// { +// typedef thrust::tuple Tuple; + +// bool operator()(const Tuple& t1, const Tuple& t2) +// { +// if (thrust::get<0>(t1) < thrust::get<0>(t2)) +// { +// return true; +// } +// if (thrust::get<0>(t1) > thrust::get<0>(t2)) +// { +// return false; +// } +// if (thrust::get<0>(t2) == thrust::get<1>(t2)) +// { +// return false; +// } +// return thrust::get<0>(t1) == thrust::get<1>(t1) || thrust::get<1>(t1) < thrust::get<1>(t2); +// } +// }; + +// template +// struct is_negative : public thrust::unary_function +// { +// bool operator()(const T &x) +// { +// return (x < 0); +// } +// }; + +// template +// struct is_positive : public thrust::unary_function +// { +// bool operator()(const T &x) +// { +// return (x > 0); +// } +// }; + +// template +// struct is_nonnegative : public thrust::unary_function +// { +// bool operator()(const T &x) +// { +// return (x >= 0); +// } +// }; + +template +struct in_range : public std::unary_function +{ + T low, up; + + in_range(T low_, T up_) { low = low_; up = up_; } + + bool operator()(const T &x) const + { + return (x >= low && x <= up); + } +}; + +// template +// struct out_of_range : public thrust::unary_function +// { +// T low, up; + +// out_of_range(T low_, T up_) { low = low_; up = up_; } + +// bool operator()(const T &x) +// { +// return (x < low || x > up); +// } +// }; + +template +struct less_than : std::unary_function +{ + T val; + less_than(T val_) { val = val_; } + + bool 
operator()(const T &x) const { return (x < val); } +}; + +// template +// struct modulo : public thrust::unary_function +// { +// T val; + +// modulo(T val_) { val = val_; } + +// T operator()(const T &x) +// { +// return (x % val); +// } +// }; + +// template +// struct equal : public thrust::unary_function +// { +// T val; + +// equal(T val_) { val = val_; } + +// bool operator()(const T &x) +// { +// return (x == val); +// } +// }; + +// struct print_functor +// { +// void operator()(HYPRE_Real val) +// { +// printf("%f\n", val); +// } +// }; + +#endif // #if defined(HYPRE_USING_SYCL) + +//////////////////////////////////////////////////////////////////////////////////////// + #if defined(HYPRE_USING_CUSPARSE) cudaDataType hypre_HYPREComplexToCudaDataType(); @@ -1315,7 +1760,7 @@ struct ReduceSum /* 2nd reduction with only *one* block */ hypre_assert(nblocks >= 0 && nblocks <= 1024); const dim3 gDim(1), bDim(1024); - HYPRE_CUDA_LAUNCH( OneBlockReduceKernel, gDim, bDim, d_buf, nblocks ); + HYPRE_GPU_LAUNCH( OneBlockReduceKernel, gDim, bDim, d_buf, nblocks ); hypre_TMemcpy(&val, d_buf, T, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE); val += init; } diff --git a/src/utilities/device_reducer.h b/src/utilities/device_reducer.h index 729bbce535..8953dec5d3 100644 --- a/src/utilities/device_reducer.h +++ b/src/utilities/device_reducer.h @@ -264,7 +264,7 @@ struct ReduceSum /* 2nd reduction with only *one* block */ hypre_assert(nblocks >= 0 && nblocks <= 1024); const dim3 gDim(1), bDim(1024); - HYPRE_CUDA_LAUNCH( OneBlockReduceKernel, gDim, bDim, d_buf, nblocks ); + HYPRE_GPU_LAUNCH( OneBlockReduceKernel, gDim, bDim, d_buf, nblocks ); hypre_TMemcpy(&val, d_buf, T, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE); val += init; } diff --git a/src/utilities/device_utils.c b/src/utilities/device_utils.c index b1bb63252b..d108ba9041 100644 --- a/src/utilities/device_utils.c +++ b/src/utilities/device_utils.c @@ -83,7 +83,7 @@ void hypre_CudaCompileFlagCheck() //cuda_arch_compile_d = 
hypre_TAlloc(hypre_int, 1, HYPRE_MEMORY_DEVICE); HYPRE_CUDA_CALL( cudaMalloc(&cuda_arch_compile_d, sizeof(hypre_int)) ); hypre_TMemcpy(cuda_arch_compile_d, &cuda_arch_compile, hypre_int, 1, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CompileFlagSafetyCheck, gDim, bDim, cuda_arch_compile_d ); + HYPRE_GPU_LAUNCH( hypreCUDAKernel_CompileFlagSafetyCheck, gDim, bDim, cuda_arch_compile_d ); hypre_TMemcpy(&cuda_arch_compile, cuda_arch_compile_d, hypre_int, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE); //hypre_TFree(cuda_arch_compile_d, HYPRE_MEMORY_DEVICE); HYPRE_CUDA_CALL( cudaFree(cuda_arch_compile_d) ); @@ -190,7 +190,7 @@ hypreDevice_GetRowNnz(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int *d_di return hypre_error_flag; } - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_GetRowNnz, gDim, bDim, nrows, d_row_indices, d_diag_ia, d_offd_ia, d_rownnz ); + HYPRE_GPU_LAUNCH( hypreCUDAKernel_GetRowNnz, gDim, bDim, nrows, d_row_indices, d_diag_ia, d_offd_ia, d_rownnz ); return hypre_error_flag; } @@ -329,7 +329,7 @@ hypreDevice_CopyParCSRRows(HYPRE_Int nrows, } */ - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CopyParCSRRows, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_CopyParCSRRows, gDim, bDim, nrows, d_row_indices, has_offd, first_col, d_col_map_offd_A, d_diag_i, d_diag_j, d_diag_a, d_offd_i, d_offd_j, d_offd_a, @@ -533,7 +533,7 @@ hypreDevice_GenScatterAdd(HYPRE_Real *x, HYPRE_Int ny, HYPRE_Int *map, HYPRE_Rea /* trivial cases, n = 1, 2 */ dim3 bDim = 1; dim3 gDim = 1; - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ScatterAddTrivial, gDim, bDim, ny, x, map, y ); + HYPRE_GPU_LAUNCH( hypreCUDAKernel_ScatterAddTrivial, gDim, bDim, ny, x, map, y ); } else { @@ -572,7 +572,7 @@ hypreDevice_GenScatterAdd(HYPRE_Real *x, HYPRE_Int ny, HYPRE_Int *map, HYPRE_Rea dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(reduced_n, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ScatterAdd, gDim, bDim, + HYPRE_GPU_LAUNCH( 
hypreCUDAKernel_ScatterAdd, gDim, bDim, reduced_n, x, reduced_map, reduced_y ); if (!work) @@ -615,7 +615,7 @@ hypreDevice_ScatterConstant(T *x, HYPRE_Int n, HYPRE_Int *map, T v) dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ScatterConstant, gDim, bDim, x, n, map, v ); + HYPRE_GPU_LAUNCH( hypreCUDAKernel_ScatterConstant, gDim, bDim, x, n, map, v ); return hypre_error_flag; } @@ -647,7 +647,7 @@ hypreDevice_IVAXPY(HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex *x, HYPRE_Comple dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IVAXPY, gDim, bDim, n, a, x, y ); + HYPRE_GPU_LAUNCH( hypreCUDAKernel_IVAXPY, gDim, bDim, n, a, x, y ); return hypre_error_flag; } @@ -679,7 +679,7 @@ hypreDevice_IVAXPYMarked(HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex *x, HYPRE_ dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IVAXPYMarked, gDim, bDim, n, a, x, y, marker, marker_val ); + HYPRE_GPU_LAUNCH( hypreCUDAKernel_IVAXPYMarked, gDim, bDim, n, a, x, y, marker, marker_val ); return hypre_error_flag; } @@ -716,7 +716,7 @@ hypreDevice_DiagScaleVector(HYPRE_Int n, HYPRE_Int *A_i, HYPRE_Complex *A_data, dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_DiagScaleVector, gDim, bDim, n, A_i, A_data, x, beta, y ); + HYPRE_GPU_LAUNCH( hypreCUDAKernel_DiagScaleVector, gDim, bDim, n, A_i, A_data, x, beta, y ); return hypre_error_flag; } @@ -749,7 +749,7 @@ hypreDevice_DiagScaleVector2(HYPRE_Int n, HYPRE_Int *A_i, HYPRE_Complex *A_data, dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim); - HYPRE_CUDA_LAUNCH( 
hypreCUDAKernel_DiagScaleVector2, gDim, bDim, n, A_i, A_data, x, beta, y, z ); + HYPRE_GPU_LAUNCH( hypreCUDAKernel_DiagScaleVector2, gDim, bDim, n, A_i, A_data, x, beta, y, z ); return hypre_error_flag; } @@ -773,7 +773,7 @@ hypreDevice_BigToSmallCopy(HYPRE_Int *tgt, const HYPRE_BigInt *src, HYPRE_Int si dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(size, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_BigToSmallCopy, gDim, bDim, tgt, src, size); + HYPRE_GPU_LAUNCH( hypreCUDAKernel_BigToSmallCopy, gDim, bDim, tgt, src, size); return hypre_error_flag; } @@ -1231,7 +1231,7 @@ hypre_DeviceDataCreate() /* WM: does the default selector get a GPU if available? Having trouble with getting the device on frank, so temporarily just passing the default selector */ hypre_DeviceDataDevice(data) = nullptr; - hypre_DeviceDataDeviceMaxWorkGroupSize(data) = hypre_DeviceDataDevice(data).get_info(); + hypre_DeviceDataDeviceMaxWorkGroupSize(data) = hypre_DeviceDataDevice(data)->get_info(); #else hypre_DeviceDataDevice(data) = 0; #endif diff --git a/src/utilities/device_utils.h b/src/utilities/device_utils.h index e4e137ca14..96d14a1435 100644 --- a/src/utilities/device_utils.h +++ b/src/utilities/device_utils.h @@ -392,7 +392,7 @@ using namespace thrust::placeholders; #define GPU_LAUNCH_SYNC #endif // defined(HYPRE_DEBUG) -#define HYPRE_CUDA_LAUNCH2(kernel_name, gridsize, blocksize, shmem_size, ...) \ +#define HYPRE_GPU_LAUNCH2(kernel_name, gridsize, blocksize, shmem_size, ...) 
\ { \ if ( gridsize.x == 0 || gridsize.y == 0 || gridsize.z == 0 || \ blocksize.x == 0 || blocksize.y == 0 || blocksize.z == 0 ) \ @@ -403,22 +403,22 @@ using namespace thrust::placeholders; } \ else \ { \ - (kernel_name) <<< (gridsize), (blocksize), shmem_size, hypre_HandleCudaComputeStream(hypre_handle()) >>> (__VA_ARGS__); \ + (kernel_name) <<< (gridsize), (blocksize), shmem_size, hypre_DeviceDataComputeStream(hypre_handle()) >>> (__VA_ARGS__); \ GPU_LAUNCH_SYNC; \ } \ } -#define HYPRE_CUDA_LAUNCH(kernel_name, gridsize, blocksize, ...) HYPRE_CUDA_LAUNCH2(kernel_name, gridsize, blocksize, 0, __VA_ARGS__) +#define HYPRE_GPU_LAUNCH(kernel_name, gridsize, blocksize, ...) HYPRE_GPU_LAUNCH2(kernel_name, gridsize, blocksize, 0, __VA_ARGS__) /* RL: TODO Want macro HYPRE_THRUST_CALL to return value but I don't know how to do it right * The following one works OK for now */ #if defined(HYPRE_USING_CUDA) #define HYPRE_THRUST_CALL(func_name, ...) \ - thrust::func_name(thrust::cuda::par(hypre_HandleDeviceAllocator(hypre_handle())).on(hypre_HandleCudaComputeStream(hypre_handle())), __VA_ARGS__); + thrust::func_name(thrust::cuda::par(hypre_HandleDeviceAllocator(hypre_handle())).on(hypre_DeviceDataComputeStream(hypre_handle())), __VA_ARGS__); #elif defined(HYPRE_USING_HIP) #define HYPRE_THRUST_CALL(func_name, ...) 
\ - thrust::func_name(thrust::hip::par(hypre_HandleDeviceAllocator(hypre_handle())).on(hypre_HandleCudaComputeStream(hypre_handle())), __VA_ARGS__); + thrust::func_name(thrust::hip::par(hypre_HandleDeviceAllocator(hypre_handle())).on(hypre_DeviceDataComputeStream(hypre_handle())), __VA_ARGS__); #endif /* return the number of threads in block */ @@ -983,6 +983,451 @@ void hypre_DeviceDataCubCachingAllocatorDestroy(hypre_DeviceData *data); #endif // #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) +//////////////////////////////////////////////////////////////////////////////////////////////// + +#if defined(HYPRE_USING_SYCL) + +#define PSTL_USE_PARALLEL_POLICIES 0 // for libstdc++ 9 +#define _GLIBCXX_USE_TBB_PAR_BACKEND 0 // for libstdc++ 10 + +// #include +// #include +// #include +// #include + +//#include // dpct::remove_if, remove_copy_if, copy_if + +// #include +// #include +// #include +// #include + +#define __forceinline__ __inline__ __attribute__((always_inline)) + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + * macro for launching SYCL kernels, SYCL, oneDPL, oneMKL calls + * NOTE: IN HYPRE'S DEFAULT STREAM + * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + */ + +#if defined(HYPRE_DEBUG) +#if defined(HYPRE_USING_CUDA) +#define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); } +#endif +#else // #if defined(HYPRE_DEBUG) +#define GPU_LAUNCH_SYNC +#endif // defined(HYPRE_DEBUG) + +#define HYPRE_GPU_LAUNCH(kernel_name, gridsize, blocksize, ...) 
\ +{ \ + if ( gridsize[0] == 0 || blocksize[0] == 0 ) \ + { \ + hypre_printf("Error %s %d: Invalid SYCL 1D launch parameters grid/block (%d) (%d)\n", \ + __FILE__, __LINE__, \ + gridsize[0], blocksize[0]); \ + assert(0); exit(1); \ + } \ + else \ + { \ + hypre_DeviceDataComputeStream(hypre_handle())->parallel_for(sycl::nd_range<1>(gridsize*blocksize, blocksize), \ + [=] (sycl::nd_item<1> item) [[intel::reqd_sub_group_size(HYPRE_WARP_SIZE)]] { \ + (kernel_name)(item, __VA_ARGS__); \ + }); \ + } \ +} + +/* RL: TODO Want macro HYPRE_ONEDPL_CALL to return value but I don't know how to do it right + * The following one works OK for now */ + +#define HYPRE_ONEDPL_CALL(func_name, ...) \ + func_name(oneapi::dpl::execution::make_device_policy(*hypre_DeviceDataComputeStream(hypre_handle()), __VA_ARGS__); + +// /* return the number of threads in block */ +// template +// static __forceinline__ +// hypre_int hypre_gpu_get_num_threads() +// { +// switch (dim) +// { +// case 1: +// return (blockDim.x); +// case 2: +// return (blockDim.x * blockDim.y); +// case 3: +// return (blockDim.x * blockDim.y * blockDim.z); +// } + +// return -1; +// } + +/* return the number of (sub_groups) warps in (work-group) block */ +template +static __forceinline__ +hypre_int hypre_gpu_get_num_warps(sycl::nd_item<1>& item) +{ + return item.get_sub_group().get_group_range().get(0); +} + +/* return the thread lane id in warp */ +template +static __forceinline__ +hypre_int hypre_gpu_get_lane_id(sycl::nd_item<1>& item) +{ + return item.get_local_linear_id() & (HYPRE_WARP_SIZE-1); +} + +// /* return the number of threads in grid */ +// template +// static __forceinline__ +// hypre_int hypre_gpu_get_grid_num_threads() +// { +// return hypre_gpu_get_num_blocks() * hypre_gpu_get_num_threads(); +// } + +/* return the flattened work-item/thread id in global work space */ +template +static __forceinline__ +hypre_int hypre_gpu_get_grid_thread_id(sycl::nd_item<1>& item) +{ + return item.get_global_id(0); +} + 
+// /* return the number of warps in grid */ +// template +// static __forceinline__ +// hypre_int hypre_gpu_get_grid_num_warps() +// { +// return hypre_gpu_get_num_blocks() * hypre_gpu_get_num_warps(); +// } + +/* return the flattened warp id in grid */ +template +static __forceinline__ +hypre_int hypre_gpu_get_grid_warp_id(sycl::nd_item<1>& item) +{ + return item.get_group(0) * hypre_gpu_get_num_warps(item) + + item.get_sub_group().get_group_linear_id(); +} + +// #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 +// static __forceinline__ +// hypre_double atomicAdd(hypre_double* address, hypre_double val) +// { +// hypre_ulonglongint* address_as_ull = (hypre_ulonglongint*) address; +// hypre_ulonglongint old = *address_as_ull, assumed; + +// do { +// assumed = old; +// old = atomicCAS(address_as_ull, assumed, +// __double_as_longlong(val + +// __longlong_as_double(assumed))); + +// // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) +// } while (assumed != old); + +// return __longlong_as_double(old); +// } +// #endif + +template +static __forceinline__ +T read_only_load( const T *ptr ) +{ + return *ptr; +} + +// /* exclusive prefix scan */ +// template +// static __forceinline__ +// T warp_prefix_sum(hypre_int lane_id, T in, T &all_sum) +// { +// #pragma unroll +// for (hypre_int d = 2; d <=HYPRE_WARP_SIZE; d <<= 1) +// { +// T t = __shfl_up_sync(HYPRE_WARP_FULL_MASK, in, d >> 1); +// if ( (lane_id & (d - 1)) == (d - 1) ) +// { +// in += t; +// } +// } + +// all_sum = __shfl_sync(HYPRE_WARP_FULL_MASK, in, HYPRE_WARP_SIZE-1); + +// if (lane_id == HYPRE_WARP_SIZE-1) +// { +// in = 0; +// } + +// #pragma unroll +// for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1) +// { +// T t = __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, d); + +// if ( (lane_id & (d - 1)) == (d - 1)) +// { +// if ( (lane_id & ((d << 1) - 1)) == ((d << 1) - 1) ) +// { +// in += t; +// } +// else +// { +// in = t; +// } +// } +// } +// return in; +// } + +template 
+static __forceinline__ +T warp_reduce_sum(T in, sycl::nd_item<1>& item) +{ + sycl::ext::oneapi::sub_group SG = item.get_sub_group(); + //sycl::ext::oneapi::reduce(SG, in, std::plus()); +#pragma unroll + for (hypre_int d = SG.get_local_range().get(0)/2; d > 0; d >>= 1) + { + in += SG.shuffle_down(in, d); + } + return in; +} + +// template +// static __forceinline__ +// T warp_allreduce_sum(T in) +// { +// #pragma unroll +// for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1) +// { +// in += __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, d); +// } +// return in; +// } + +// template +// static __forceinline__ +// T warp_reduce_max(T in) +// { +// #pragma unroll +// for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1) +// { +// in = max(in, __shfl_down_sync(HYPRE_WARP_FULL_MASK, in, d)); +// } +// return in; +// } + +// template +// static __forceinline__ +// T warp_allreduce_max(T in) +// { +// #pragma unroll +// for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1) +// { +// in = max(in, __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, d)); +// } +// return in; +// } + +// template +// static __forceinline__ +// T warp_reduce_min(T in) +// { +// #pragma unroll +// for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1) +// { +// in = min(in, __shfl_down_sync(HYPRE_WARP_FULL_MASK, in, d)); +// } +// return in; +// } + +// template +// static __forceinline__ +// T warp_allreduce_min(T in) +// { +// #pragma unroll +// for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1) +// { +// in = min(in, __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, d)); +// } +// return in; +// } + +// static __forceinline__ +// hypre_int next_power_of_2(hypre_int n) +// { +// if (n <= 0) +// { +// return 0; +// } + +// /* if n is power of 2, return itself */ +// if ( (n & (n - 1)) == 0 ) +// { +// return n; +// } + +// n |= (n >> 1); +// n |= (n >> 2); +// n |= (n >> 4); +// n |= (n >> 8); +// n |= (n >> 16); +// n ^= (n >> 1); +// n = (n << 1); + +// return n; +// } + +// template +// struct absolute_value 
: public thrust::unary_function +// { +// T operator()(const T &x) const +// { +// return x < T(0) ? -x : x; +// } +// }; + +// template +// struct TupleComp2 +// { +// typedef thrust::tuple Tuple; + +// bool operator()(const Tuple& t1, const Tuple& t2) +// { +// if (thrust::get<0>(t1) < thrust::get<0>(t2)) +// { +// return true; +// } +// if (thrust::get<0>(t1) > thrust::get<0>(t2)) +// { +// return false; +// } +// return hypre_abs(thrust::get<1>(t1)) > hypre_abs(thrust::get<1>(t2)); +// } +// }; + +// template +// struct TupleComp3 +// { +// typedef thrust::tuple Tuple; + +// bool operator()(const Tuple& t1, const Tuple& t2) +// { +// if (thrust::get<0>(t1) < thrust::get<0>(t2)) +// { +// return true; +// } +// if (thrust::get<0>(t1) > thrust::get<0>(t2)) +// { +// return false; +// } +// if (thrust::get<0>(t2) == thrust::get<1>(t2)) +// { +// return false; +// } +// return thrust::get<0>(t1) == thrust::get<1>(t1) || thrust::get<1>(t1) < thrust::get<1>(t2); +// } +// }; + +// template +// struct is_negative : public thrust::unary_function +// { +// bool operator()(const T &x) +// { +// return (x < 0); +// } +// }; + +// template +// struct is_positive : public thrust::unary_function +// { +// bool operator()(const T &x) +// { +// return (x > 0); +// } +// }; + +// template +// struct is_nonnegative : public thrust::unary_function +// { +// bool operator()(const T &x) +// { +// return (x >= 0); +// } +// }; + +template +struct in_range : public std::unary_function +{ + T low, up; + + in_range(T low_, T up_) { low = low_; up = up_; } + + bool operator()(const T &x) const + { + return (x >= low && x <= up); + } +}; + +// template +// struct out_of_range : public thrust::unary_function +// { +// T low, up; + +// out_of_range(T low_, T up_) { low = low_; up = up_; } + +// bool operator()(const T &x) +// { +// return (x < low || x > up); +// } +// }; + +template +struct less_than : std::unary_function +{ + T val; + less_than(T val_) { val = val_; } + + bool 
operator()(const T &x) const { return (x < val); } +}; + +// template +// struct modulo : public thrust::unary_function +// { +// T val; + +// modulo(T val_) { val = val_; } + +// T operator()(const T &x) +// { +// return (x % val); +// } +// }; + +// template +// struct equal : public thrust::unary_function +// { +// T val; + +// equal(T val_) { val = val_; } + +// bool operator()(const T &x) +// { +// return (x == val); +// } +// }; + +// struct print_functor +// { +// void operator()(HYPRE_Real val) +// { +// printf("%f\n", val); +// } +// }; + +#endif // #if defined(HYPRE_USING_SYCL) + +//////////////////////////////////////////////////////////////////////////////////////// + #if defined(HYPRE_USING_CUSPARSE) cudaDataType hypre_HYPREComplexToCudaDataType(); diff --git a/src/utilities/general.c b/src/utilities/general.c index 0aed7d5252..2cfec7ab23 100644 --- a/src/utilities/general.c +++ b/src/utilities/general.c @@ -106,6 +106,8 @@ hypre_SetDevice(hypre_int device_id, hypre_Handle *hypre_handle_) hypre_printf("ERROR: SYCL device-ID exceed the number of devices on-node... 
\n"); } + sycl::platform platform(sycl::gpu_selector{}); + auto gpu_devices = platform.get_devices(sycl::info::device_type::gpu); HYPRE_Int local_nDevices=0; for (int i = 0; i < gpu_devices.size(); i++) { // multi-tile GPUs From 35fa901d14c77be76efc1b1919bf93a400b64d8c Mon Sep 17 00:00:00 2001 From: Abhishek Bagusetty Date: Wed, 8 Dec 2021 19:57:24 +0000 Subject: [PATCH 35/44] [SYCL] changes to function, var names from _cuda_ to _device_ for unified --- .../distributed_matrix_parcsr.c | 2 +- src/parcsr_ls/ams.c | 2 +- src/parcsr_ls/par_relax.c | 8 +-- src/parcsr_ls/par_relax_more_device.c | 4 +- src/parcsr_mv/par_csr_communication.c | 2 +- src/parcsr_mv/par_csr_matop.c | 2 +- src/parcsr_mv/par_csr_matop_device.c | 6 +-- src/parcsr_mv/par_csr_matvec.c | 16 +++--- src/parcsr_mv/par_csr_triplemat_device.c | 4 +- src/seq_mv/csr_matop_device.c | 46 ++++++++-------- src/seq_mv/csr_matvec_device.c | 4 +- src/seq_mv/csr_spgemm_device.c | 14 ++--- src/seq_mv/csr_sptrans_device.c | 2 +- src/seq_mv/vector.c | 14 ++--- src/sstruct_mv/sstruct_matrix.c | 2 +- src/sstruct_mv/sstruct_vector.c | 2 +- src/test/ij.c | 10 ++-- src/test/ij_assembly.c | 20 +++---- src/test/ij_mm.c | 6 +-- src/utilities/_hypre_utilities.h | 12 ++--- src/utilities/_hypre_utilities.hpp | 6 +-- src/utilities/device_utils.c | 52 ++++++++++--------- src/utilities/device_utils.h | 6 +-- src/utilities/int_array.c | 2 +- src/utilities/protos.h | 12 ++--- 25 files changed, 130 insertions(+), 126 deletions(-) diff --git a/src/distributed_matrix/distributed_matrix_parcsr.c b/src/distributed_matrix/distributed_matrix_parcsr.c index 0df9ae59e8..e6d986dddb 100644 --- a/src/distributed_matrix/distributed_matrix_parcsr.c +++ b/src/distributed_matrix/distributed_matrix_parcsr.c @@ -102,7 +102,7 @@ hypre_DistributedMatrixGetRowParCSR( hypre_DistributedMatrix *matrix, // RL: if HYPRE_ParCSRMatrixGetRow was on device, need the next line to guarantee it's done #if defined(HYPRE_USING_GPU) - 
hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); #endif return(ierr); diff --git a/src/parcsr_ls/ams.c b/src/parcsr_ls/ams.c index 25549b54e9..9a90c0a71c 100644 --- a/src/parcsr_ls/ams.c +++ b/src/parcsr_ls/ams.c @@ -459,7 +459,7 @@ HYPRE_Int hypre_ParCSRMatrixFixZeroRowsDevice(hypre_ParCSRMatrix *A) HYPRE_GPU_LAUNCH(hypreCUDAKernel_ParCSRMatrixFixZeroRows, gDim, bDim, nrows, A_diag_i, A_diag_j, A_diag_data, A_offd_i, A_offd_data, num_cols_offd); - //hypre_SyncCudaComputeStream(hypre_handle()); + //hypre_SyncDeviceComputeStream(hypre_handle()); return hypre_error_flag; } diff --git a/src/parcsr_ls/par_relax.c b/src/parcsr_ls/par_relax.c index 608bc4209d..63d6b7df03 100644 --- a/src/parcsr_ls/par_relax.c +++ b/src/parcsr_ls/par_relax.c @@ -1117,8 +1117,8 @@ hypre_BoomerAMGRelax7Jacobi( hypre_ParCSRMatrix *A, #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) HYPRE_Int sync_stream; - hypre_GetSyncCudaCompute(&sync_stream); - hypre_SetSyncCudaCompute(0); + hypre_GetSyncDeviceCompute(&sync_stream); + hypre_SetSyncDeviceCompute(0); #endif /*----------------------------------------------------------------- @@ -1144,8 +1144,8 @@ hypre_BoomerAMGRelax7Jacobi( hypre_ParCSRMatrix *A, } #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) - hypre_SetSyncCudaCompute(sync_stream); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SetSyncDeviceCompute(sync_stream); + hypre_SyncDeviceComputeStream(hypre_handle()); #endif return hypre_error_flag; diff --git a/src/parcsr_ls/par_relax_more_device.c b/src/parcsr_ls/par_relax_more_device.c index c3cf1ce9fb..3388da1f82 100644 --- a/src/parcsr_ls/par_relax_more_device.c +++ b/src/parcsr_ls/par_relax_more_device.c @@ -169,7 +169,7 @@ hypre_ParCSRMaxEigEstimateDevice( hypre_ParCSRMatrix *A, rowsums_upper, scale); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); e_min = HYPRE_THRUST_CALL(reduce, rowsums_lower, rowsums_lower + 
A_num_rows, (HYPRE_Real)0, thrust::minimum()); @@ -323,7 +323,7 @@ hypre_ParCSRMaxEigEstimateCGDevice(hypre_ParCSRMatrix *A, /* matrix to relax /* set residual to random */ hypre_CurandUniform(local_size, r_data, 0, 0, 0, 0); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); HYPRE_THRUST_CALL(transform, r_data, r_data + local_size, r_data, diff --git a/src/parcsr_mv/par_csr_communication.c b/src/parcsr_mv/par_csr_communication.c index 35fef28c8d..9786d21d31 100644 --- a/src/parcsr_mv/par_csr_communication.c +++ b/src/parcsr_mv/par_csr_communication.c @@ -434,7 +434,7 @@ hypre_ParCSRCommHandleCreate_v2 ( HYPRE_Int job, recv_data = recv_data_in; // TODO RL: it seems that we need to sync the CUDA stream before doing GPU-GPU MPI. // Need to check MPI documentation whether this is acutally true - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); #endif num_requests = num_sends + num_recvs; diff --git a/src/parcsr_mv/par_csr_matop.c b/src/parcsr_mv/par_csr_matop.c index 8eeb6dcf4c..97552f4aa1 100644 --- a/src/parcsr_mv/par_csr_matop.c +++ b/src/parcsr_mv/par_csr_matop.c @@ -4113,7 +4113,7 @@ hypre_ParTMatmul( hypre_ParCSRMatrix *A, if ( hypre_GetExecPolicy2(memory_location_A, memory_location_B) == HYPRE_EXEC_DEVICE ) { hypre_CSRMatrixMoveDiagFirstDevice(hypre_ParCSRMatrixDiag(C)); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); } #endif diff --git a/src/parcsr_mv/par_csr_matop_device.c b/src/parcsr_mv/par_csr_matop_device.c index 31bb4afb89..992dea4964 100644 --- a/src/parcsr_mv/par_csr_matop_device.c +++ b/src/parcsr_mv/par_csr_matop_device.c @@ -306,7 +306,7 @@ hypre_MergeDiagAndOffdDevice(hypre_ParCSRMatrix *A) hypre_CSRMatrixData(B) = B_a; hypre_CSRMatrixMemoryLocation(B) = HYPRE_MEMORY_DEVICE; - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); return B; } @@ -1044,7 +1044,7 @@ 
hypre_ParCSRMatrixGetRowDevice( hypre_ParCSRMatrix *mat, *values = hypre_ParCSRMatrixRowvalues(mat); } - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); return hypre_error_flag; } @@ -1603,7 +1603,7 @@ hypre_ParCSRDiagScale( HYPRE_ParCSRMatrix HA, HYPRE_Int ierr = 0; #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) hypreDevice_DiagScaleVector(local_size, A_i, A_data, y_data, 0.0, x_data); - //hypre_SyncCudaComputeStream(hypre_handle()); + //hypre_SyncDeviceComputeStream(hypre_handle()); #else /* #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */ HYPRE_Int i; #if defined(HYPRE_USING_DEVICE_OPENMP) diff --git a/src/parcsr_mv/par_csr_matvec.c b/src/parcsr_mv/par_csr_matvec.c index 30921fe960..d53f74a9d8 100644 --- a/src/parcsr_mv/par_csr_matvec.c +++ b/src/parcsr_mv/par_csr_matvec.c @@ -56,8 +56,8 @@ hypre_ParCSRMatrixMatvecOutOfPlace( HYPRE_Complex alpha, #if defined(HYPRE_USING_GPU) HYPRE_Int sync_stream; - hypre_GetSyncCudaCompute(&sync_stream); - hypre_SetSyncCudaCompute(0); + hypre_GetSyncDeviceCompute(&sync_stream); + hypre_SetSyncDeviceCompute(0); #endif HYPRE_ANNOTATE_FUNC_BEGIN; @@ -348,8 +348,8 @@ hypre_ParCSRMatrixMatvecOutOfPlace( HYPRE_Complex alpha, } #if defined(HYPRE_USING_GPU) - hypre_SetSyncCudaCompute(sync_stream); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SetSyncDeviceCompute(sync_stream); + hypre_SyncDeviceComputeStream(hypre_handle()); #endif #ifdef HYPRE_PROFILE @@ -415,8 +415,8 @@ hypre_ParCSRMatrixMatvecT( HYPRE_Complex alpha, #if defined(HYPRE_USING_GPU) HYPRE_Int sync_stream; - hypre_GetSyncCudaCompute(&sync_stream); - hypre_SetSyncCudaCompute(0); + hypre_GetSyncDeviceCompute(&sync_stream); + hypre_SetSyncDeviceCompute(0); #endif HYPRE_ANNOTATE_FUNC_BEGIN; @@ -724,8 +724,8 @@ hypre_ParCSRMatrixMatvecT( HYPRE_Complex alpha, } #if defined(HYPRE_USING_GPU) - hypre_SetSyncCudaCompute(sync_stream); - hypre_SyncCudaComputeStream(hypre_handle()); + 
hypre_SetSyncDeviceCompute(sync_stream); + hypre_SyncDeviceComputeStream(hypre_handle()); #endif #ifdef HYPRE_PROFILE diff --git a/src/parcsr_mv/par_csr_triplemat_device.c b/src/parcsr_mv/par_csr_triplemat_device.c index 0b8a67fd63..5c77572e04 100644 --- a/src/parcsr_mv/par_csr_triplemat_device.c +++ b/src/parcsr_mv/par_csr_triplemat_device.c @@ -497,7 +497,7 @@ hypre_ParCSRTMatMatKTDevice( hypre_ParCSRMatrix *A, hypre_assert(!hypre_CSRMatrixCheckDiagFirstDevice(hypre_ParCSRMatrixDiag(C))); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); return C; } @@ -817,7 +817,7 @@ hypre_ParCSRMatrixRAPKTDevice( hypre_ParCSRMatrix *R, hypre_assert(!hypre_CSRMatrixCheckDiagFirstDevice(hypre_ParCSRMatrixDiag(C))); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); return C; } diff --git a/src/seq_mv/csr_matop_device.c b/src/seq_mv/csr_matop_device.c index 0410ff7474..ff79ec97f4 100644 --- a/src/seq_mv/csr_matop_device.c +++ b/src/seq_mv/csr_matop_device.c @@ -927,7 +927,7 @@ hypre_CSRMatrixAddDevice ( HYPRE_Complex alpha, hypre_CSRMatrixData(C) = C_data; hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE; - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); return C; } @@ -950,7 +950,7 @@ hypre_CSRMatrixMultiplyDevice( hypre_CSRMatrix *A, hypreDevice_CSRSpGemm(A, B, &C); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); return C; } @@ -1100,7 +1100,7 @@ hypre_CSRMatrixSplitDevice( hypre_CSRMatrix *B_ext, *B_ext_diag_ptr = B_ext_diag; *B_ext_offd_ptr = B_ext_offd; - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); return ierr; } @@ -1331,7 +1331,7 @@ hypre_CSRMatrixAddPartialDevice( hypre_CSRMatrix *A, hypre_CSRMatrixData(C) = C_data; hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE; - hypre_SyncCudaComputeStream(hypre_handle()); + 
hypre_SyncDeviceComputeStream(hypre_handle()); return C; } @@ -1373,7 +1373,7 @@ hypre_CSRMatrixColNNzRealDevice( hypre_CSRMatrix *A, hypre_TFree(reduced_col_indices, HYPRE_MEMORY_DEVICE); hypre_TFree(reduced_col_nnz, HYPRE_MEMORY_DEVICE); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); return hypre_error_flag; } @@ -1394,7 +1394,7 @@ hypre_CSRMatrixMoveDiagFirstDevice( hypre_CSRMatrix *A ) HYPRE_GPU_LAUNCH(hypreGPUKernel_CSRMoveDiagFirst, gDim, bDim, nrows, A_i, A_j, A_data); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); return hypre_error_flag; } @@ -1421,7 +1421,7 @@ hypre_CSRMatrixCheckDiagFirstDevice( hypre_CSRMatrix *A ) hypre_TFree(result, HYPRE_MEMORY_DEVICE); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); return ierr; } @@ -1466,7 +1466,7 @@ hypre_CSRMatrixFixZeroDiagDevice( hypre_CSRMatrix *A, hypre_TFree(result, HYPRE_MEMORY_DEVICE); #endif - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); return ierr; } @@ -1506,7 +1506,7 @@ hypre_CSRMatrixReplaceDiagDevice( hypre_CSRMatrix *A, hypre_TFree(result, HYPRE_MEMORY_DEVICE); #endif - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); return ierr; } @@ -1629,7 +1629,7 @@ hypre_CSRMatrixComputeRowSumDevice( hypre_CSRMatrix *A, row_sum, scal, set_or_add[0] == 's' ); } - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); } void @@ -1648,7 +1648,7 @@ hypre_CSRMatrixExtractDiagonalDevice( hypre_CSRMatrix *A, HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRExtractDiag, gDim, bDim, nrows, A_i, A_j, A_data, d, type ); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); } /* return C = [A; B] */ @@ -1907,7 +1907,7 @@ hypre_CSRMatrixAddDevice ( HYPRE_Complex alpha, hypre_CSRMatrixData(C) = C_data; 
hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE; - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); return C; } @@ -1930,7 +1930,7 @@ hypre_CSRMatrixMultiplyDevice( hypre_CSRMatrix *A, hypreDevice_CSRSpGemm(A, B, &C); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); return C; } @@ -2080,7 +2080,7 @@ hypre_CSRMatrixSplitDevice( hypre_CSRMatrix *B_ext, *B_ext_diag_ptr = B_ext_diag; *B_ext_offd_ptr = B_ext_offd; - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); return ierr; } @@ -2308,7 +2308,7 @@ hypre_CSRMatrixAddPartialDevice( hypre_CSRMatrix *A, hypre_CSRMatrixData(C) = C_data; hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE; - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); return C; } @@ -2354,7 +2354,7 @@ hypre_CSRMatrixColNNzRealDevice( hypre_CSRMatrix *A, hypre_TFree(reduced_col_nnz, HYPRE_MEMORY_DEVICE); hypre_TFree(values, HYPRE_MEMORY_UNIFIED); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); return hypre_error_flag; } @@ -2375,7 +2375,7 @@ hypre_CSRMatrixMoveDiagFirstDevice( hypre_CSRMatrix *A ) HYPRE_GPU_LAUNCH(hypreGPUKernel_CSRMoveDiagFirst, gDim, bDim, nrows, A_i, A_j, A_data); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); return hypre_error_flag; } @@ -2402,7 +2402,7 @@ hypre_CSRMatrixCheckDiagFirstDevice( hypre_CSRMatrix *A ) hypre_TFree(result, HYPRE_MEMORY_DEVICE); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); return ierr; } @@ -2447,7 +2447,7 @@ hypre_CSRMatrixFixZeroDiagDevice( hypre_CSRMatrix *A, hypre_TFree(result, HYPRE_MEMORY_DEVICE); #endif - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); return ierr; } @@ -2487,7 +2487,7 @@ 
hypre_CSRMatrixReplaceDiagDevice( hypre_CSRMatrix *A, hypre_TFree(result, HYPRE_MEMORY_DEVICE); #endif - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); return ierr; } @@ -2598,7 +2598,7 @@ hypre_CSRMatrixComputeRowSumDevice( hypre_CSRMatrix *A, row_sum, scal, set_or_add[0] == 's' ); } - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); } void @@ -2617,7 +2617,7 @@ hypre_CSRMatrixExtractDiagonalDevice( hypre_CSRMatrix *A, HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRExtractDiag, gDim, bDim, nrows, A_i, A_j, A_data, d, type ); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); } /* return C = [A; B] */ @@ -2878,7 +2878,7 @@ hypre_CSRMatrixTransposeDevice(hypre_CSRMatrix *A, *AT_ptr = C; - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); return hypre_error_flag; } diff --git a/src/seq_mv/csr_matvec_device.c b/src/seq_mv/csr_matvec_device.c index 811040a510..8b61018ccd 100644 --- a/src/seq_mv/csr_matvec_device.c +++ b/src/seq_mv/csr_matvec_device.c @@ -117,7 +117,7 @@ hypre_CSRMatrixMatvecDevice( HYPRE_Int trans, hypre_CSRMatrixMatvecDevice2(trans, alpha, A, x, beta, y, offset); } - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) hypre_GpuProfilingPopRange(); @@ -201,7 +201,7 @@ hypre_CSRMatrixMatvecCusparseNewAPI( HYPRE_Int trans, #endif dBuffer) ); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); if (trans) { diff --git a/src/seq_mv/csr_spgemm_device.c b/src/seq_mv/csr_spgemm_device.c index 7d44c2cd05..b4074dadb9 100644 --- a/src/seq_mv/csr_spgemm_device.c +++ b/src/seq_mv/csr_spgemm_device.c @@ -89,7 +89,7 @@ hypreDevice_CSRSpGemm(hypre_CSRMatrix *A, #endif hypreDevice_CSRSpGemmRownnz(m, k, n, d_ia, d_ja, d_ib, d_jb, 0 /* without input rc */, d_rc); 
#ifdef HYPRE_SPGEMM_TIMING - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); t2 = hypre_MPI_Wtime() - t1; hypre_printf("Rownnz time %f\n", t2); #endif @@ -101,7 +101,7 @@ hypreDevice_CSRSpGemm(hypre_CSRMatrix *A, 1 /* exact row nnz */, &d_ic, &d_jc, &d_c, &nnzC); #ifdef HYPRE_SPGEMM_TIMING - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); t2 = hypre_MPI_Wtime() - t1; hypre_printf("SpGemmNumerical time %f\n", t2); #endif @@ -115,7 +115,7 @@ hypreDevice_CSRSpGemm(hypre_CSRMatrix *A, #endif hypreDevice_CSRSpGemmRownnzEstimate(m, k, n, d_ia, d_ja, d_ib, d_jb, d_rc); #ifdef HYPRE_SPGEMM_TIMING - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); t2 = hypre_MPI_Wtime() - t1; hypre_printf("RownnzEst time %f\n", t2); #endif @@ -126,7 +126,7 @@ hypreDevice_CSRSpGemm(hypre_CSRMatrix *A, hypreDevice_CSRSpGemmNumerWithRownnzEstimate(m, k, n, d_ia, d_ja, d_a, d_ib, d_jb, d_b, d_rc, &d_ic, &d_jc, &d_c, &nnzC); #ifdef HYPRE_SPGEMM_TIMING - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); t2 = hypre_MPI_Wtime() - t1; hypre_printf("SpGemmNumerical time %f\n", t2); #endif @@ -140,7 +140,7 @@ hypreDevice_CSRSpGemm(hypre_CSRMatrix *A, #endif hypreDevice_CSRSpGemmRownnzEstimate(m, k, n, d_ia, d_ja, d_ib, d_jb, d_rc); #ifdef HYPRE_SPGEMM_TIMING - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); t2 = hypre_MPI_Wtime() - t1; hypre_printf("RownnzEst time %f\n", t2); #endif @@ -157,7 +157,7 @@ hypreDevice_CSRSpGemm(hypre_CSRMatrix *A, d_rc + 2 * m, thrust::identity() ); #ifdef HYPRE_SPGEMM_TIMING - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); t2 = hypre_MPI_Wtime() - t1; hypre_printf("RownnzBound time %f\n", t2); #endif @@ -169,7 +169,7 @@ hypreDevice_CSRSpGemm(hypre_CSRMatrix *A, rownnz_exact, &d_ic, &d_jc, &d_c, &nnzC); 
#ifdef HYPRE_SPGEMM_TIMING - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); t2 = hypre_MPI_Wtime() - t1; hypre_printf("SpGemmNumerical time %f\n", t2); #endif diff --git a/src/seq_mv/csr_sptrans_device.c b/src/seq_mv/csr_sptrans_device.c index 548665ed2e..bd85778a03 100644 --- a/src/seq_mv/csr_sptrans_device.c +++ b/src/seq_mv/csr_sptrans_device.c @@ -137,7 +137,7 @@ hypreDevice_CSRSpTransRocsparse(HYPRE_Int m, HYPRE_Int n, HYPR *d_ac_out = csc_a; #ifdef HYPRE_PROFILE - hypre_SyncCudaDevice(hypre_handle()) + hypre_SyncDevice(hypre_handle()) hypre_profile_times[HYPRE_TIMER_ID_SPTRANS] += hypre_MPI_Wtime(); #endif diff --git a/src/seq_mv/vector.c b/src/seq_mv/vector.c index 8b024f39c5..bfab868fbb 100644 --- a/src/seq_mv/vector.c +++ b/src/seq_mv/vector.c @@ -300,7 +300,7 @@ hypre_SeqVectorSetConstantValues( hypre_Vector *v, #endif /* defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */ #if defined(HYPRE_USING_GPU) - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); #endif #ifdef HYPRE_PROFILE @@ -488,7 +488,7 @@ hypre_SeqVectorScale( HYPRE_Complex alpha, #endif /* defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */ #if defined(HYPRE_USING_GPU) - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); #endif #ifdef HYPRE_PROFILE @@ -542,7 +542,7 @@ hypre_SeqVectorAxpy( HYPRE_Complex alpha, #endif /* defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */ #if defined(HYPRE_USING_GPU) - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); #endif #ifdef HYPRE_PROFILE @@ -596,7 +596,7 @@ hypre_SeqVectorElmdivpy( hypre_Vector *x, } #if defined(HYPRE_USING_GPU) - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); #endif #ifdef HYPRE_PROFILE @@ -647,7 +647,7 @@ hypre_SeqVectorElmdivpyMarked( hypre_Vector *x, } #if defined(HYPRE_USING_GPU) - 
hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); #endif #ifdef HYPRE_PROFILE @@ -704,7 +704,7 @@ hypre_SeqVectorInnerProd( hypre_Vector *x, #endif /* defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */ #if defined(HYPRE_USING_GPU) - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); #endif #ifdef HYPRE_PROFILE @@ -806,7 +806,7 @@ hypre_SeqVectorMax( HYPRE_Complex alpha, #endif /* defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */ - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); #ifdef HYPRE_PROFILE hypre_profile_times[HYPRE_TIMER_ID_BLAS1] += hypre_MPI_Wtime(); diff --git a/src/sstruct_mv/sstruct_matrix.c b/src/sstruct_mv/sstruct_matrix.c index 1d9ce85366..e51066abcc 100644 --- a/src/sstruct_mv/sstruct_matrix.c +++ b/src/sstruct_mv/sstruct_matrix.c @@ -392,7 +392,7 @@ hypre_SStructPMatrixSetBoxValues( hypre_SStructPMatrix *pmatrix, values, action, -1, 0); /* TODO: Why need DeviceSync? */ #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #endif /* set (AddTo/Get) or clear (Set) values outside the grid in ghost zones */ if (action != 0) diff --git a/src/sstruct_mv/sstruct_vector.c b/src/sstruct_mv/sstruct_vector.c index fdeeae6421..fa8db02a35 100644 --- a/src/sstruct_mv/sstruct_vector.c +++ b/src/sstruct_mv/sstruct_vector.c @@ -247,7 +247,7 @@ hypre_SStructPVectorSetBoxValues( hypre_SStructPVector *pvector, /* TODO: Why need DeviceSync? 
*/ #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #endif /* set (AddTo/Get) or clear (Set) values outside the grid in ghost zones */ if (action != 0) diff --git a/src/test/ij.c b/src/test/ij.c index 26640554c7..a3dcfc76b3 100644 --- a/src/test/ij.c +++ b/src/test/ij.c @@ -3406,7 +3406,7 @@ main( hypre_int argc, } #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #endif hypre_EndTiming(time_index); @@ -3766,7 +3766,7 @@ main( hypre_int argc, #endif #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #endif hypre_EndTiming(time_index); @@ -3804,7 +3804,7 @@ main( hypre_int argc, #endif #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #endif hypre_EndTiming(time_index); @@ -3865,7 +3865,7 @@ main( hypre_int argc, #endif #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #endif tt = hypre_MPI_Wtime() - tt; @@ -3897,7 +3897,7 @@ main( hypre_int argc, #endif #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #endif tt = hypre_MPI_Wtime() - tt; diff --git a/src/test/ij_assembly.c b/src/test/ij_assembly.c index bb17d32803..fb28c9ba55 100644 --- a/src/test/ij_assembly.c +++ b/src/test/ij_assembly.c @@ -678,7 +678,7 @@ test_Set(MPI_Comm comm, chunk_size = nrows / nchunks; #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #if defined(CUDA_PROFILER) cudaProfilerStart(); #endif @@ -707,7 +707,7 @@ test_Set(MPI_Comm comm, HYPRE_IJMatrixAssemble(ij_A); #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #if defined(CUDA_PROFILER) cudaProfilerStop(); #endif @@ -831,7 +831,7 @@ test_SetOffProc(HYPRE_ParCSRMatrix parcsr_A, chunk_size = nrows / nchunks; 
#if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #endif time_index = hypre_InitializeTiming("Test SetValues OffProc"); @@ -862,7 +862,7 @@ test_SetOffProc(HYPRE_ParCSRMatrix parcsr_A, //cudaProfilerStop(); #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #endif hypre_EndTiming(time_index); @@ -945,7 +945,7 @@ test_SetSet(MPI_Comm comm, #endif #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #if defined(CUDA_PROFILER) cudaProfilerStart(); #endif @@ -996,7 +996,7 @@ test_SetSet(MPI_Comm comm, HYPRE_IJMatrixAssemble(ij_A); #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #if defined(CUDA_PROFILER) cudaProfilerStop(); #endif @@ -1072,7 +1072,7 @@ test_AddSet(MPI_Comm comm, #endif #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #if defined(CUDA_PROFILER) cudaProfilerStart(); #endif @@ -1120,7 +1120,7 @@ test_AddSet(MPI_Comm comm, HYPRE_IJMatrixAssemble(ij_A); #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #if defined(CUDA_PROFILER) cudaProfilerStop(); #endif @@ -1178,7 +1178,7 @@ test_SetAddSet(MPI_Comm comm, chunk_size = nrows / nchunks; #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #if defined(CUDA_PROFILER) cudaProfilerStart(); #endif @@ -1244,7 +1244,7 @@ test_SetAddSet(MPI_Comm comm, HYPRE_IJMatrixAssemble(ij_A); #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #if defined(CUDA_PROFILER) cudaProfilerStop(); #endif diff --git a/src/test/ij_mm.c b/src/test/ij_mm.c index 4bbf24fc39..807e9b1630 100644 --- a/src/test/ij_mm.c +++ b/src/test/ij_mm.c @@ -161,7 +161,7 @@ void runjob1( HYPRE_ParCSRMatrix parcsr_A, if (i == rep - 
1) { - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); //cudaProfilerStop(); hypre_EndTiming(time_index); hypre_PrintTiming("Device Parcsr Matrix-by-Matrix, A*A", hypre_MPI_COMM_WORLD); @@ -350,7 +350,7 @@ void runjob2( HYPRE_ParCSRMatrix parcsr_A, if (i == 1) { - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); //cudaProfilerStop(); hypre_EndTiming(time_index); hypre_PrintTiming("Device Parcsr Matrix-by-Matrix, RAP2", hypre_MPI_COMM_WORLD); @@ -452,7 +452,7 @@ main( hypre_int argc, HYPRE_Init(); /* for timing, sync after kernels */ - hypre_SetSyncCudaCompute(1); + hypre_SetSyncDeviceCompute(1); #if defined(HYPRE_USING_CUDA) hypre_HandleDefaultExecPolicy(hypre_handle()) = HYPRE_EXEC_DEVICE; diff --git a/src/utilities/_hypre_utilities.h b/src/utilities/_hypre_utilities.h index 0df44e6bea..d32c9a0c79 100644 --- a/src/utilities/_hypre_utilities.h +++ b/src/utilities/_hypre_utilities.h @@ -1740,8 +1740,8 @@ void hypre_big_sort_and_create_inverse_map(HYPRE_BigInt *in, HYPRE_Int len, HYPR hypre_UnorderedBigIntMap *inverse_map); #if defined(HYPRE_USING_GPU) -HYPRE_Int hypre_SyncCudaComputeStream(hypre_Handle *hypre_handle); -HYPRE_Int hypre_SyncCudaDevice(hypre_Handle *hypre_handle); +HYPRE_Int hypre_SyncDeviceComputeStream(hypre_Handle *hypre_handle); +HYPRE_Int hypre_SyncDevice(hypre_Handle *hypre_handle); HYPRE_Int hypre_ResetCudaDevice(hypre_Handle *hypre_handle); HYPRE_Int hypreDevice_DiagScaleVector(HYPRE_Int n, HYPRE_Int *A_i, HYPRE_Complex *A_data, HYPRE_Complex *x, HYPRE_Complex beta, HYPRE_Complex *y); @@ -1772,10 +1772,10 @@ HYPRE_Int hypre_multmod(HYPRE_Int a, HYPRE_Int b, HYPRE_Int mod); void hypre_partition1D(HYPRE_Int n, HYPRE_Int p, HYPRE_Int j, HYPRE_Int *s, HYPRE_Int *e); char *hypre_strcpy(char *destination, const char *source); -HYPRE_Int hypre_SetSyncCudaCompute(HYPRE_Int action); -HYPRE_Int hypre_RestoreSyncCudaCompute(); -HYPRE_Int hypre_GetSyncCudaCompute(HYPRE_Int *cuda_compute_stream_sync_ptr); 
-HYPRE_Int hypre_SyncCudaComputeStream(hypre_Handle *hypre_handle); +HYPRE_Int hypre_SetSyncDeviceCompute(HYPRE_Int action); +HYPRE_Int hypre_RestoreSyncDeviceCompute(); +HYPRE_Int hypre_GetSyncDeviceCompute(HYPRE_Int *device_compute_stream_sync_ptr); +HYPRE_Int hypre_SyncDeviceComputeStream(hypre_Handle *hypre_handle); /* handle.c */ HYPRE_Int hypre_SetSpGemmUseCusparse( HYPRE_Int use_cusparse ); diff --git a/src/utilities/_hypre_utilities.hpp b/src/utilities/_hypre_utilities.hpp index 38edeac91d..e93a2b55c6 100644 --- a/src/utilities/_hypre_utilities.hpp +++ b/src/utilities/_hypre_utilities.hpp @@ -443,9 +443,9 @@ using namespace thrust::placeholders; #if defined(HYPRE_DEBUG) #if defined(HYPRE_USING_CUDA) -#define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); } +#define GPU_LAUNCH_SYNC { hypre_SyncDeviceComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); } #elif defined(HYPRE_USING_HIP) -#define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_HIP_CALL( hipGetLastError() ); } +#define GPU_LAUNCH_SYNC { hypre_SyncDeviceComputeStream(hypre_handle()); HYPRE_HIP_CALL( hipGetLastError() ); } #endif #else // #if defined(HYPRE_DEBUG) #define GPU_LAUNCH_SYNC @@ -1087,7 +1087,7 @@ void hypre_DeviceDataCubCachingAllocatorDestroy(hypre_DeviceData *data); #if defined(HYPRE_DEBUG) #if defined(HYPRE_USING_CUDA) -#define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); } +#define GPU_LAUNCH_SYNC { hypre_SyncDeviceComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); } #endif #else // #if defined(HYPRE_DEBUG) #define GPU_LAUNCH_SYNC diff --git a/src/utilities/device_utils.c b/src/utilities/device_utils.c index 946a75c4be..d0594c7166 100644 --- a/src/utilities/device_utils.c +++ b/src/utilities/device_utils.c @@ -84,7 +84,7 @@ void hypre_CudaCompileFlagCheck() HYPRE_CUDA_CALL( cudaMalloc(&cuda_arch_compile_d, 
sizeof(hypre_int)) ); hypre_TMemcpy(cuda_arch_compile_d, &cuda_arch_compile, hypre_int, 1, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CompileFlagSafetyCheck, gDim, bDim, cuda_arch_compile_d ); + HYPRE_GPU_LAUNCH( hypreCUDAKernel_CompileFlagSafetyCheck, gDim, bDim, cuda_arch_compile_d ); hypre_TMemcpy(&cuda_arch_compile, cuda_arch_compile_d, hypre_int, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE); //hypre_TFree(cuda_arch_compile_d, HYPRE_MEMORY_DEVICE); @@ -1373,7 +1373,7 @@ hypre_DeviceDataDestroy(hypre_DeviceData *data) } HYPRE_Int -hypre_SyncCudaDevice(hypre_Handle *hypre_handle) +hypre_SyncDevice(hypre_Handle *hypre_handle) { #if defined(HYPRE_USING_DEVICE_OPENMP) HYPRE_CUDA_CALL( cudaDeviceSynchronize() ); @@ -1381,6 +1381,8 @@ hypre_SyncCudaDevice(hypre_Handle *hypre_handle) HYPRE_CUDA_CALL( cudaDeviceSynchronize() ); #elif defined(HYPRE_USING_HIP) HYPRE_HIP_CALL( hipDeviceSynchronize() ); +#elif defined(HYPRE_USING_SYCL) + HYPRE_SYCL_CALL( hypre_HandleComputeStream(hypre_handle)->wait_and_throw() ); #endif return hypre_error_flag; } @@ -1400,55 +1402,57 @@ hypre_ResetCudaDevice(hypre_Handle *hypre_handle) * action: 0: set sync stream to false * 1: set sync stream to true * 2: restore sync stream to default - * 3: return the current value of cuda_compute_stream_sync - * 4: sync stream based on cuda_compute_stream_sync + * 3: return the current value of device_compute_stream_sync + * 4: sync stream based on device_compute_stream_sync */ HYPRE_Int -hypre_SyncCudaComputeStream_core(HYPRE_Int action, - hypre_Handle *hypre_handle, - HYPRE_Int *cuda_compute_stream_sync_ptr) +hypre_SyncDeviceComputeStream_core(HYPRE_Int action, + hypre_Handle *hypre_handle, + HYPRE_Int *device_compute_stream_sync_ptr) { /* with UVM the default is to sync at kernel completions, since host is also able to * touch GPU memory */ #if defined(HYPRE_USING_UNIFIED_MEMORY) - static const HYPRE_Int cuda_compute_stream_sync_default = 1; + static const HYPRE_Int 
device_compute_stream_sync_default = 1; #else - static const HYPRE_Int cuda_compute_stream_sync_default = 0; + static const HYPRE_Int device_compute_stream_sync_default = 0; #endif /* this controls if synchronize the stream after computations */ - static HYPRE_Int cuda_compute_stream_sync = cuda_compute_stream_sync_default; + static HYPRE_Int device_compute_stream_sync = device_compute_stream_sync_default; switch (action) { case 0: - cuda_compute_stream_sync = 0; + device_compute_stream_sync = 0; break; case 1: - cuda_compute_stream_sync = 1; + device_compute_stream_sync = 1; break; case 2: - cuda_compute_stream_sync = cuda_compute_stream_sync_default; + device_compute_stream_sync = device_compute_stream_sync_default; break; case 3: - *cuda_compute_stream_sync_ptr = cuda_compute_stream_sync; + *device_compute_stream_sync_ptr = device_compute_stream_sync; break; case 4: #if defined(HYPRE_USING_DEVICE_OPENMP) HYPRE_CUDA_CALL( cudaDeviceSynchronize() ); #else - if (cuda_compute_stream_sync) + if (device_compute_stream_sync) { #if defined(HYPRE_USING_CUDA) HYPRE_CUDA_CALL( cudaStreamSynchronize(hypre_HandleComputeStream(hypre_handle)) ); #elif defined(HYPRE_USING_HIP) HYPRE_HIP_CALL( hipStreamSynchronize(hypre_HandleComputeStream(hypre_handle)) ); +#elif defined(HYPRE_USING_SYCL) + HYPRE_SYCL_CALL( hypre_HandleComputeStream(hypre_handle)->ext_oneapi_submit_barrier() ); #endif } #endif break; default: - hypre_printf("hypre_SyncCudaComputeStream_core invalid action\n"); + hypre_printf("hypre_SyncDeviceComputeStream_core invalid action\n"); hypre_error_in_arg(1); } @@ -1456,35 +1460,35 @@ hypre_SyncCudaComputeStream_core(HYPRE_Int action, } HYPRE_Int -hypre_SetSyncCudaCompute(HYPRE_Int action) +hypre_SetSyncDeviceCompute(HYPRE_Int action) { /* convert to 1/0 */ action = action != 0; - hypre_SyncCudaComputeStream_core(action, NULL, NULL); + hypre_SyncDeviceComputeStream_core(action, NULL, NULL); return hypre_error_flag; } HYPRE_Int -hypre_RestoreSyncCudaCompute() 
+hypre_RestoreSyncDeviceCompute() { - hypre_SyncCudaComputeStream_core(2, NULL, NULL); + hypre_SyncDeviceComputeStream_core(2, NULL, NULL); return hypre_error_flag; } HYPRE_Int -hypre_GetSyncCudaCompute(HYPRE_Int *cuda_compute_stream_sync_ptr) +hypre_GetSyncDeviceCompute(HYPRE_Int *device_compute_stream_sync_ptr) { - hypre_SyncCudaComputeStream_core(3, NULL, cuda_compute_stream_sync_ptr); + hypre_SyncDeviceComputeStream_core(3, NULL, device_compute_stream_sync_ptr); return hypre_error_flag; } HYPRE_Int -hypre_SyncCudaComputeStream(hypre_Handle *hypre_handle) +hypre_SyncDeviceComputeStream(hypre_Handle *hypre_handle) { - hypre_SyncCudaComputeStream_core(4, hypre_handle, NULL); + hypre_SyncDeviceComputeStream_core(4, hypre_handle, NULL); return hypre_error_flag; } diff --git a/src/utilities/device_utils.h b/src/utilities/device_utils.h index 4bddafa330..59549ca6db 100644 --- a/src/utilities/device_utils.h +++ b/src/utilities/device_utils.h @@ -386,9 +386,9 @@ using namespace thrust::placeholders; #if defined(HYPRE_DEBUG) #if defined(HYPRE_USING_CUDA) -#define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); } +#define GPU_LAUNCH_SYNC { hypre_SyncDeviceComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); } #elif defined(HYPRE_USING_HIP) -#define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_HIP_CALL( hipGetLastError() ); } +#define GPU_LAUNCH_SYNC { hypre_SyncDeviceComputeStream(hypre_handle()); HYPRE_HIP_CALL( hipGetLastError() ); } #endif #else // #if defined(HYPRE_DEBUG) #define GPU_LAUNCH_SYNC @@ -1030,7 +1030,7 @@ void hypre_DeviceDataCubCachingAllocatorDestroy(hypre_DeviceData *data); #if defined(HYPRE_DEBUG) #if defined(HYPRE_USING_CUDA) -#define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); } +#define GPU_LAUNCH_SYNC { hypre_SyncDeviceComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); } #endif 
#else // #if defined(HYPRE_DEBUG) #define GPU_LAUNCH_SYNC diff --git a/src/utilities/int_array.c b/src/utilities/int_array.c index 7a51fbb80d..65ea3f5ef9 100644 --- a/src/utilities/int_array.c +++ b/src/utilities/int_array.c @@ -168,7 +168,7 @@ hypre_IntArraySetConstantValues( hypre_IntArray *v, #endif /* defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */ #if defined(HYPRE_USING_GPU) - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); #endif return ierr; diff --git a/src/utilities/protos.h b/src/utilities/protos.h index eb41f99847..ad3b5ff8a8 100644 --- a/src/utilities/protos.h +++ b/src/utilities/protos.h @@ -269,8 +269,8 @@ void hypre_big_sort_and_create_inverse_map(HYPRE_BigInt *in, HYPRE_Int len, HYPR hypre_UnorderedBigIntMap *inverse_map); #if defined(HYPRE_USING_GPU) -HYPRE_Int hypre_SyncCudaComputeStream(hypre_Handle *hypre_handle); -HYPRE_Int hypre_SyncCudaDevice(hypre_Handle *hypre_handle); +HYPRE_Int hypre_SyncDeviceComputeStream(hypre_Handle *hypre_handle); +HYPRE_Int hypre_SyncDevice(hypre_Handle *hypre_handle); HYPRE_Int hypre_ResetCudaDevice(hypre_Handle *hypre_handle); HYPRE_Int hypreDevice_DiagScaleVector(HYPRE_Int n, HYPRE_Int *A_i, HYPRE_Complex *A_data, HYPRE_Complex *x, HYPRE_Complex beta, HYPRE_Complex *y); @@ -301,10 +301,10 @@ HYPRE_Int hypre_multmod(HYPRE_Int a, HYPRE_Int b, HYPRE_Int mod); void hypre_partition1D(HYPRE_Int n, HYPRE_Int p, HYPRE_Int j, HYPRE_Int *s, HYPRE_Int *e); char *hypre_strcpy(char *destination, const char *source); -HYPRE_Int hypre_SetSyncCudaCompute(HYPRE_Int action); -HYPRE_Int hypre_RestoreSyncCudaCompute(); -HYPRE_Int hypre_GetSyncCudaCompute(HYPRE_Int *cuda_compute_stream_sync_ptr); -HYPRE_Int hypre_SyncCudaComputeStream(hypre_Handle *hypre_handle); +HYPRE_Int hypre_SetSyncDeviceCompute(HYPRE_Int action); +HYPRE_Int hypre_RestoreSyncDeviceCompute(); +HYPRE_Int hypre_GetSyncDeviceCompute(HYPRE_Int *device_compute_stream_sync_ptr); +HYPRE_Int 
hypre_SyncDeviceComputeStream(hypre_Handle *hypre_handle); /* handle.c */ HYPRE_Int hypre_SetSpGemmUseCusparse( HYPRE_Int use_cusparse ); From 243e2b8f8fd3cb59f9088cc2359c268dd0490efa Mon Sep 17 00:00:00 2001 From: Abhishek Bagusetty Date: Thu, 9 Dec 2021 21:15:57 +0000 Subject: [PATCH 36/44] [SYCL] update, unify new functions for CUDA and SYCL in csr_matop_device --- src/seq_mv/csr_matop_device.c | 758 +++++++++++------------------ src/utilities/HYPRE_utilities.h | 6 +- src/utilities/_hypre_utilities.h | 5 + src/utilities/_hypre_utilities.hpp | 77 +-- src/utilities/device_utils.c | 71 ++- src/utilities/device_utils.h | 45 +- src/utilities/memory.h | 5 + 7 files changed, 421 insertions(+), 546 deletions(-) diff --git a/src/seq_mv/csr_matop_device.c b/src/seq_mv/csr_matop_device.c index ff79ec97f4..cd6e819515 100644 --- a/src/seq_mv/csr_matop_device.c +++ b/src/seq_mv/csr_matop_device.c @@ -1378,139 +1378,6 @@ hypre_CSRMatrixColNNzRealDevice( hypre_CSRMatrix *A, return hypre_error_flag; } - -HYPRE_Int -hypre_CSRMatrixMoveDiagFirstDevice( hypre_CSRMatrix *A ) -{ - HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - dim3 bDim, gDim; - - bDim = hypre_GetDefaultDeviceBlockDimension(); - gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); - - HYPRE_GPU_LAUNCH(hypreGPUKernel_CSRMoveDiagFirst, gDim, bDim, - nrows, A_i, A_j, A_data); - - hypre_SyncDeviceComputeStream(hypre_handle()); - - return hypre_error_flag; -} - -HYPRE_Int -hypre_CSRMatrixCheckDiagFirstDevice( hypre_CSRMatrix *A ) -{ - if (hypre_CSRMatrixNumRows(A) != hypre_CSRMatrixNumCols(A)) - { - return 0; - } - - dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); - dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "thread", bDim); - - HYPRE_Int *result = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); - HYPRE_GPU_LAUNCH( 
hypreGPUKernel_CSRCheckDiagFirst, gDim, bDim, - hypre_CSRMatrixNumRows(A), - hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), result ); - - HYPRE_Int ierr = HYPRE_THRUST_CALL( reduce, - result, - result + hypre_CSRMatrixNumRows(A) ); - - hypre_TFree(result, HYPRE_MEMORY_DEVICE); - - hypre_SyncDeviceComputeStream(hypre_handle()); - - return ierr; -} - -/* For square A, find numerical zeros (absolute values <= tol) on its diagonal and replace with v - * Does NOT assume diagonal is the first entry of each row of A - * In debug mode: - * Returns the number of rows that do not have diag in the pattern - * (i.e., structural zeroes on the diagonal) - */ -HYPRE_Int -hypre_CSRMatrixFixZeroDiagDevice( hypre_CSRMatrix *A, - HYPRE_Complex v, - HYPRE_Real tol ) -{ - HYPRE_Int ierr = 0; - - if (hypre_CSRMatrixNumRows(A) != hypre_CSRMatrixNumCols(A)) - { - return ierr; - } - - dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); - dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "warp", bDim); - -#if HYPRE_DEBUG - HYPRE_Int *result = hypre_CTAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); -#else - HYPRE_Int *result = NULL; -#endif - - HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRMatrixFixZeroDiagDevice, gDim, bDim, - v, hypre_CSRMatrixNumRows(A), - hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), hypre_CSRMatrixData(A), - tol, result ); - -#if HYPRE_DEBUG - ierr = HYPRE_THRUST_CALL( reduce, - result, - result + hypre_CSRMatrixNumRows(A) ); - - hypre_TFree(result, HYPRE_MEMORY_DEVICE); -#endif - - hypre_SyncDeviceComputeStream(hypre_handle()); - - return ierr; -} - -HYPRE_Int -hypre_CSRMatrixReplaceDiagDevice( hypre_CSRMatrix *A, - HYPRE_Complex *new_diag, - HYPRE_Complex v, - HYPRE_Real tol ) -{ - HYPRE_Int ierr = 0; - - if (hypre_CSRMatrixNumRows(A) != hypre_CSRMatrixNumCols(A)) - { - return ierr; - } - - dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); - dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "warp", bDim); - -#if 
HYPRE_DEBUG - HYPRE_Int *result = hypre_CTAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); -#else - HYPRE_Int *result = NULL; -#endif - - HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRMatrixReplaceDiagDevice, gDim, bDim, - new_diag, v, hypre_CSRMatrixNumRows(A), - hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), hypre_CSRMatrixData(A), - tol, result ); - -#if HYPRE_DEBUG - ierr = HYPRE_THRUST_CALL( reduce, - result, - result + hypre_CSRMatrixNumRows(A) ); - - hypre_TFree(result, HYPRE_MEMORY_DEVICE); -#endif - - hypre_SyncDeviceComputeStream(hypre_handle()); - - return ierr; -} - typedef thrust::tuple Int2; struct Int2Unequal : public thrust::unary_function { @@ -1595,62 +1462,6 @@ hypre_CSRMatrixRemoveDiagonalDevice(hypre_CSRMatrix *A) return hypre_error_flag; } -void -hypre_CSRMatrixComputeRowSumDevice( hypre_CSRMatrix *A, - HYPRE_Int *CF_i, - HYPRE_Int *CF_j, - HYPRE_Complex *row_sum, - HYPRE_Int type, - HYPRE_Complex scal, - const char *set_or_add) -{ - HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - dim3 bDim, gDim; - - bDim = hypre_GetDefaultDeviceBlockDimension(); - gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); - - if (type == 0) - { - HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRRowSum<0>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j, - row_sum, scal, set_or_add[0] == 's' ); - } - else if (type == 1) - { - HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRRowSum<1>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j, - row_sum, scal, set_or_add[0] == 's' ); - } - else if (type == 2) - { - HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRRowSum<2>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j, - row_sum, scal, set_or_add[0] == 's' ); - } - - hypre_SyncDeviceComputeStream(hypre_handle()); -} - -void -hypre_CSRMatrixExtractDiagonalDevice( hypre_CSRMatrix *A, - HYPRE_Complex *d, - HYPRE_Int type) -{ - HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); - 
HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - dim3 bDim, gDim; - - bDim = hypre_GetDefaultDeviceBlockDimension(); - gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); - - HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRExtractDiag, gDim, bDim, nrows, A_i, A_j, A_data, d, type ); - - hypre_SyncDeviceComputeStream(hypre_handle()); -} - /* return C = [A; B] */ hypre_CSRMatrix* hypre_CSRMatrixStack2Device(hypre_CSRMatrix *A, hypre_CSRMatrix *B) @@ -1819,49 +1630,6 @@ hypre_CSRMatrixDropSmallEntriesDevice( hypre_CSRMatrix *A, return hypre_error_flag; } -/* markA: array of size nnz(A), for pattern of (A and B), markA is the column indices as in A_J - * Otherwise, mark pattern not in A-B as -1 in markA - * Note the special treatment for diagonal entries of A (marked as -2) */ -HYPRE_Int -hypre_CSRMatrixIntersectPattern(hypre_CSRMatrix *A, - hypre_CSRMatrix *B, - HYPRE_Int *markA, - HYPRE_Int diag_opt) -{ - HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); - HYPRE_Int nnzA = hypre_CSRMatrixNumNonzeros(A); - HYPRE_Int nnzB = hypre_CSRMatrixNumNonzeros(B); - - HYPRE_Int *Cii = hypre_TAlloc(HYPRE_Int, nnzA + nnzB, HYPRE_MEMORY_DEVICE); - HYPRE_Int *Cjj = hypre_TAlloc(HYPRE_Int, nnzA + nnzB, HYPRE_MEMORY_DEVICE); - HYPRE_Int *idx = hypre_TAlloc(HYPRE_Int, nnzA + nnzB, HYPRE_MEMORY_DEVICE); - - hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnzA, hypre_CSRMatrixI(A), Cii); - hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnzB, hypre_CSRMatrixI(B), Cii + nnzA); - hypre_TMemcpy(Cjj, hypre_CSRMatrixJ(A), HYPRE_Int, nnzA, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(Cjj + nnzA, hypre_CSRMatrixJ(B), HYPRE_Int, nnzB, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - HYPRE_THRUST_CALL( sequence, idx, idx + nnzA + nnzB ); - - HYPRE_THRUST_CALL( stable_sort_by_key, - thrust::make_zip_iterator(thrust::make_tuple(Cii, Cjj)), - thrust::make_zip_iterator(thrust::make_tuple(Cii, Cjj)) + nnzA + nnzB, - idx ); - - 
hypre_TMemcpy(markA, hypre_CSRMatrixJ(A), HYPRE_Int, nnzA, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - - dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); - dim3 gDim = hypre_GetDefaultDeviceGridDimension(nnzA + nnzB, "thread", bDim); - - HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRMatrixIntersectPattern, gDim, bDim, - nnzA + nnzB, nnzA, Cii, Cjj, idx, markA, diag_opt ); - - hypre_TFree(Cii, HYPRE_MEMORY_DEVICE); - hypre_TFree(Cjj, HYPRE_MEMORY_DEVICE); - hypre_TFree(idx, HYPRE_MEMORY_DEVICE); - - return hypre_error_flag; -} - #endif /* HYPRE_USING_CUDA || defined(HYPRE_USING_HIP) */ #if defined(HYPRE_USING_SYCL) @@ -2167,7 +1935,7 @@ hypre_CSRMatrixSplitDevice_core( HYPRE_Int job, /* 0: qu } HYPRE_BigInt *const_iterator = hypre_TAlloc(HYPRE_BigInt, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); - hypre_DeviceDataComputeStream(hypre_handle())->fill(const_iterator, first_col_diag_B, B_ext_diag_nnz*sizeof(HYPRE_BigInt)).wait(); + hypre_HandleComputeStream(hypre_handle())->fill(const_iterator, first_col_diag_B, B_ext_diag_nnz*sizeof(HYPRE_BigInt)).wait(); HYPRE_ONEDPL_CALL( std::transform, B_ext_diag_bigj, B_ext_diag_bigj + B_ext_diag_nnz, @@ -2333,8 +2101,8 @@ hypre_CSRMatrixColNNzRealDevice( hypre_CSRMatrix *A, reduced_col_nnz = hypre_TAlloc(HYPRE_Int, ncols_A, HYPRE_MEMORY_DEVICE); // ABB: Replace values in-place with dpct::make_constant_iterator(1) - HYPRE_Int* values = hypre_TAlloc(HYPRE_Int, nnz_A, hypre_MEMORY_UNIFIED); - hypre_DeviceDataComputeStream(hypre_handle())->fill(values, 1, nnz_A*sizeof(HYPRE_Int)).wait(); + HYPRE_Int* values = hypre_TAlloc(HYPRE_Int, nnz_A, HYPRE_MEMORY_UNIFIED); + hypre_HandleComputeStream(hypre_handle())->fill(values, 1, nnz_A*sizeof(HYPRE_Int)).wait(); std::pair new_end = HYPRE_ONEDPL_CALL( oneapi::dpl::reduce_by_segment, A_j_sorted, A_j_sorted + nnz_A, values, @@ -2359,163 +2127,30 @@ hypre_CSRMatrixColNNzRealDevice( hypre_CSRMatrix *A, return hypre_error_flag; } - HYPRE_Int -hypre_CSRMatrixMoveDiagFirstDevice( hypre_CSRMatrix *A ) 
+hypre_CSRMatrixRemoveDiagonalDevice(hypre_CSRMatrix *A) { HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int nnz = hypre_CSRMatrixNumNonzeros(A); HYPRE_Int *A_i = hypre_CSRMatrixI(A); HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - sycl::range<1> bDim, gDim; - - bDim = hypre_GetDefaultDeviceBlockDimension(); - gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); - - HYPRE_GPU_LAUNCH(hypreGPUKernel_CSRMoveDiagFirst, gDim, bDim, - nrows, A_i, A_j, A_data); - - hypre_SyncDeviceComputeStream(hypre_handle()); + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_ii = hypreDevice_CsrRowPtrsToIndices(nrows, nnz, A_i); + HYPRE_Int new_nnz; + HYPRE_Int *new_ii; + HYPRE_Int *new_j; + HYPRE_Complex *new_data; - return hypre_error_flag; -} + auto zipped_begin = oneapi::dpl::make_zip_iterator(A_ii, A_j); + new_nnz = HYPRE_ONEDPL_CALL( std::count_if, + zipped_begin, zipped_begin + nnz, + [](auto t) { return std::get<0>(t) != std::get<1>(t); } ); -HYPRE_Int -hypre_CSRMatrixCheckDiagFirstDevice( hypre_CSRMatrix *A ) -{ - if (hypre_CSRMatrixNumRows(A) != hypre_CSRMatrixNumCols(A)) + if (new_nnz == nnz) { - return 0; - } - - sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension(); - sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "thread", bDim); - - HYPRE_Int *result = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); - HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRCheckDiagFirst, gDim, bDim, - hypre_CSRMatrixNumRows(A), - hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), result ); - - HYPRE_Int ierr = HYPRE_ONEDPL_CALL( oneapi::dpl::reduce, - result, - result + hypre_CSRMatrixNumRows(A) ); - - hypre_TFree(result, HYPRE_MEMORY_DEVICE); - - hypre_SyncDeviceComputeStream(hypre_handle()); - - return ierr; -} - -/* For square A, find numerical zeros (absolute values <= tol) on its diagonal and replace with v - * Does NOT assume diagonal is the first entry of each row 
of A - * In debug mode: - * Returns the number of rows that do not have diag in the pattern - * (i.e., structural zeroes on the diagonal) - */ -HYPRE_Int -hypre_CSRMatrixFixZeroDiagDevice( hypre_CSRMatrix *A, - HYPRE_Complex v, - HYPRE_Real tol ) -{ - HYPRE_Int ierr = 0; - - if (hypre_CSRMatrixNumRows(A) != hypre_CSRMatrixNumCols(A)) - { - return ierr; - } - - sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension(); - sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "warp", bDim); - -#if HYPRE_DEBUG - HYPRE_Int *result = hypre_CTAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); -#else - HYPRE_Int *result = NULL; -#endif - - HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRMatrixFixZeroDiagDevice, gDim, bDim, - v, hypre_CSRMatrixNumRows(A), - hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), hypre_CSRMatrixData(A), - tol, result ); - -#if HYPRE_DEBUG - ierr = HYPRE_ONEDPL_CALL( oneapi::dpl::reduce, - result, - result + hypre_CSRMatrixNumRows(A) ); - - hypre_TFree(result, HYPRE_MEMORY_DEVICE); -#endif - - hypre_SyncDeviceComputeStream(hypre_handle()); - - return ierr; -} - -HYPRE_Int -hypre_CSRMatrixReplaceDiagDevice( hypre_CSRMatrix *A, - HYPRE_Complex *new_diag, - HYPRE_Complex v, - HYPRE_Real tol ) -{ - HYPRE_Int ierr = 0; - - if (hypre_CSRMatrixNumRows(A) != hypre_CSRMatrixNumCols(A)) - { - return ierr; - } - - sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension(); - sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "warp", bDim); - -#if HYPRE_DEBUG - HYPRE_Int *result = hypre_CTAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); -#else - HYPRE_Int *result = NULL; -#endif - - HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRMatrixReplaceDiagDevice, gDim, bDim, - new_diag, v, hypre_CSRMatrixNumRows(A), - hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), hypre_CSRMatrixData(A), - tol, result ); - -#if HYPRE_DEBUG - ierr = HYPRE_ONEDPL_CALL( oneapi::dpl::reduce, - result, - result + 
hypre_CSRMatrixNumRows(A) ); - - hypre_TFree(result, HYPRE_MEMORY_DEVICE); -#endif - - hypre_SyncDeviceComputeStream(hypre_handle()); - - return ierr; -} - -HYPRE_Int -hypre_CSRMatrixRemoveDiagonalDevice(hypre_CSRMatrix *A) -{ - HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); - HYPRE_Int nnz = hypre_CSRMatrixNumNonzeros(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_ii = hypreDevice_CsrRowPtrsToIndices(nrows, nnz, A_i); - HYPRE_Int new_nnz; - HYPRE_Int *new_ii; - HYPRE_Int *new_j; - HYPRE_Complex *new_data; - - auto zipped_begin = oneapi::dpl::make_zip_iterator(A_ii, A_j); - new_nnz = HYPRE_ONEDPL_CALL( std::count_if, - zipped_begin, zipped_begin + nnz, - [](auto t) { return std::get<0>(t) != std::get<1>(t); } ); - - if (new_nnz == nnz) - { - /* no diagonal entries found */ - hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); - return hypre_error_flag; + /* no diagonal entries found */ + hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); + return hypre_error_flag; } new_ii = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); @@ -2564,62 +2199,6 @@ hypre_CSRMatrixRemoveDiagonalDevice(hypre_CSRMatrix *A) return hypre_error_flag; } -void -hypre_CSRMatrixComputeRowSumDevice( hypre_CSRMatrix *A, - HYPRE_Int *CF_i, - HYPRE_Int *CF_j, - HYPRE_Complex *row_sum, - HYPRE_Int type, - HYPRE_Complex scal, - const char *set_or_add) -{ - HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - sycl::range<1> bDim, gDim; - - bDim = hypre_GetDefaultDeviceBlockDimension(); - gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); - - if (type == 0) - { - HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRRowSum<0>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j, - row_sum, scal, set_or_add[0] == 's' ); - } - else if (type == 1) - { - HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRRowSum<1>, gDim, 
bDim, nrows, A_i, A_j, A_data, CF_i, CF_j, - row_sum, scal, set_or_add[0] == 's' ); - } - else if (type == 2) - { - HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRRowSum<2>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j, - row_sum, scal, set_or_add[0] == 's' ); - } - - hypre_SyncDeviceComputeStream(hypre_handle()); -} - -void -hypre_CSRMatrixExtractDiagonalDevice( hypre_CSRMatrix *A, - HYPRE_Complex *d, - HYPRE_Int type) -{ - HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - sycl::range<1> bDim, gDim; - - bDim = hypre_GetDefaultDeviceBlockDimension(); - gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); - - HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRExtractDiag, gDim, bDim, nrows, A_i, A_j, A_data, d, type ); - - hypre_SyncDeviceComputeStream(hypre_handle()); -} - /* return C = [A; B] */ hypre_CSRMatrix* hypre_CSRMatrixStack2Device(hypre_CSRMatrix *A, hypre_CSRMatrix *B) @@ -2640,7 +2219,7 @@ hypre_CSRMatrixStack2Device(hypre_CSRMatrix *A, hypre_CSRMatrix *B) HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); HYPRE_Int *const_iterator = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(C) + 1, HYPRE_MEMORY_DEVICE); - hypre_DeviceDataComputeStream(hypre_handle())->fill(const_iterator, hypre_CSRMatrixNumNonzeros(A), (hypre_CSRMatrixNumRows(C) + 1)*sizeof(HYPRE_Int)).wait(); + hypre_HandleComputeStream(hypre_handle())->fill(const_iterator, hypre_CSRMatrixNumNonzeros(A), (hypre_CSRMatrixNumRows(C) + 1)*sizeof(HYPRE_Int)).wait(); HYPRE_ONEDPL_CALL( std::transform, C_i + hypre_CSRMatrixNumRows(A) + 1, C_i + hypre_CSRMatrixNumRows(C) + 1, @@ -2782,6 +2361,158 @@ hypre_CSRMatrixDropSmallEntriesDevice( hypre_CSRMatrix *A, return hypre_error_flag; } +#endif /* HYPRE_USING_SYCL */ + + +#if defined(HYPRE_USING_GPU) + +/* For square A, find numerical zeros (absolute values <= tol) on its diagonal and replace with v + * Does NOT assume diagonal is the first entry of 
each row of A + * In debug mode: + * Returns the number of rows that do not have diag in the pattern + * (i.e., structural zeroes on the diagonal) + */ +HYPRE_Int +hypre_CSRMatrixFixZeroDiagDevice( hypre_CSRMatrix *A, + HYPRE_Complex v, + HYPRE_Real tol ) +{ + HYPRE_Int ierr = 0; + + if (hypre_CSRMatrixNumRows(A) != hypre_CSRMatrixNumCols(A)) + { + return ierr; + } + + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "warp", bDim); + +#if HYPRE_DEBUG + HYPRE_Int *result = hypre_CTAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); +#else + HYPRE_Int *result = NULL; +#endif + + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRMatrixFixZeroDiagDevice, gDim, bDim, + v, hypre_CSRMatrixNumRows(A), + hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), hypre_CSRMatrixData(A), + tol, result ); + +#if HYPRE_DEBUG +#if defined(HYPRE_USING_CUDA) + ierr = HYPRE_THRUST_CALL( reduce, + result, + result + hypre_CSRMatrixNumRows(A) ); +#elif defined(HYPRE_USING_SYCL) + ierr = HYPRE_ONEDPL_CALL( oneapi::dpl::reduce, + result, + result + hypre_CSRMatrixNumRows(A) ); +#endif + hypre_TFree(result, HYPRE_MEMORY_DEVICE); +#endif // HYPRE_DEBUG + + hypre_SyncDeviceComputeStream(hypre_handle()); + + return ierr; +} + +HYPRE_Int +hypre_CSRMatrixReplaceDiagDevice( hypre_CSRMatrix *A, + HYPRE_Complex *new_diag, + HYPRE_Complex v, + HYPRE_Real tol ) +{ + HYPRE_Int ierr = 0; + + if (hypre_CSRMatrixNumRows(A) != hypre_CSRMatrixNumCols(A)) + { + return ierr; + } + + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "warp", bDim); + +#if HYPRE_DEBUG + HYPRE_Int *result = hypre_CTAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); +#else + HYPRE_Int *result = NULL; +#endif + + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRMatrixReplaceDiagDevice, gDim, bDim, + new_diag, v, hypre_CSRMatrixNumRows(A), + hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), 
hypre_CSRMatrixData(A), + tol, result ); + +#if HYPRE_DEBUG +#if defined(HYPRE_USING_CUDA) + ierr = HYPRE_THRUST_CALL( reduce, + result, + result + hypre_CSRMatrixNumRows(A) ); +#elif defined(HYPRE_USING_SYCL) + ierr = HYPRE_ONEDPL_CALL( oneapi::dpl::reduce, + result, + result + hypre_CSRMatrixNumRows(A) ); +#endif + hypre_TFree(result, HYPRE_MEMORY_DEVICE); +#endif // HYPRE_DEBUG + + hypre_SyncDeviceComputeStream(hypre_handle()); + + return ierr; +} + +HYPRE_Int +hypre_CSRMatrixCheckDiagFirstDevice( hypre_CSRMatrix *A ) +{ + if (hypre_CSRMatrixNumRows(A) != hypre_CSRMatrixNumCols(A)) + { + return 0; + } + + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "thread", bDim); + + HYPRE_Int *result = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRCheckDiagFirst, gDim, bDim, + hypre_CSRMatrixNumRows(A), + hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), result ); + +#if defined(HYPRE_USING_CUDA) + HYPRE_Int ierr = HYPRE_THRUST_CALL( reduce, + result, + result + hypre_CSRMatrixNumRows(A) ); +#elif defined(HYPRE_USING_SYCL) + HYPRE_Int ierr = HYPRE_ONEDPL_CALL( oneapi::dpl::reduce, + result, + result + hypre_CSRMatrixNumRows(A) ); +#endif + + hypre_TFree(result, HYPRE_MEMORY_DEVICE); + + hypre_SyncDeviceComputeStream(hypre_handle()); + + return ierr; +} + +HYPRE_Int +hypre_CSRMatrixMoveDiagFirstDevice( hypre_CSRMatrix *A ) +{ + HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); + + HYPRE_GPU_LAUNCH(hypreGPUKernel_CSRMoveDiagFirst, gDim, bDim, + nrows, A_i, A_j, A_data); + + hypre_SyncDeviceComputeStream(hypre_handle()); + + return hypre_error_flag; +} + /* markA: array of size nnz(A), for 
pattern of (A and B), markA is the column indices as in A_J * Otherwise, mark pattern not in A-B as -1 in markA * Note the special treatment for diagonal entries of A (marked as -2) */ @@ -2803,17 +2534,26 @@ hypre_CSRMatrixIntersectPattern(hypre_CSRMatrix *A, hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnzB, hypre_CSRMatrixI(B), Cii + nnzA); hypre_TMemcpy(Cjj, hypre_CSRMatrixJ(A), HYPRE_Int, nnzA, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); hypre_TMemcpy(Cjj + nnzA, hypre_CSRMatrixJ(B), HYPRE_Int, nnzB, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + +#if defined(HYPRE_USING_CUDA) + HYPRE_THRUST_CALL( sequence, idx, idx + nnzA + nnzB ); + + HYPRE_THRUST_CALL( stable_sort_by_key, + thrust::make_zip_iterator(thrust::make_tuple(Cii, Cjj)), + thrust::make_zip_iterator(thrust::make_tuple(Cii, Cjj)) + nnzA + nnzB, + idx ); +#elif defined(HYPRE_USING_SYCL) HYPRE_ONEDPL_CALL( dpct::iota, idx, idx + nnzA + nnzB, 0 ); - auto keys_begin = oneapi::dpl::make_zip_iterator(Cii, Cjj); - auto zipped_begin = oneapi::dpl::make_zip_iterator(keys_begin, idx); + auto zipped_begin = oneapi::dpl::make_zip_iterator(Cii, Cjj, idx); HYPRE_ONEDPL_CALL( std::stable_sort, zipped_begin, zipped_begin + nnzA + nnzB, [](auto lhs, auto rhs) { return std::get<0>(lhs) < std::get<0>(rhs); } ); +#endif hypre_TMemcpy(markA, hypre_CSRMatrixJ(A), HYPRE_Int, nnzA, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension(); - sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(nnzA + nnzB, "thread", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(nnzA + nnzB, "thread", bDim); HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRMatrixIntersectPattern, gDim, bDim, nnzA + nnzB, nnzA, Cii, Cjj, idx, markA, diag_opt ); @@ -2825,62 +2565,112 @@ hypre_CSRMatrixIntersectPattern(hypre_CSRMatrix *A, return hypre_error_flag; } -#endif /* HYPRE_USING_SYCL */ +void +hypre_CSRMatrixExtractDiagonalDevice( hypre_CSRMatrix *A, + 
HYPRE_Complex *d, + HYPRE_Int type) +{ + HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); + + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRExtractDiag, gDim, bDim, nrows, A_i, A_j, A_data, d, type ); + + hypre_SyncDeviceComputeStream(hypre_handle()); +} + +void +hypre_CSRMatrixComputeRowSumDevice( hypre_CSRMatrix *A, + HYPRE_Int *CF_i, + HYPRE_Int *CF_j, + HYPRE_Complex *row_sum, + HYPRE_Int type, + HYPRE_Complex scal, + const char *set_or_add) +{ + HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); + + if (type == 0) + { + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRRowSum<0>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j, + row_sum, scal, set_or_add[0] == 's' ); + } + else if (type == 1) + { + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRRowSum<1>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j, + row_sum, scal, set_or_add[0] == 's' ); + } + else if (type == 2) + { + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRRowSum<2>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j, + row_sum, scal, set_or_add[0] == 's' ); + } + + hypre_SyncDeviceComputeStream(hypre_handle()); +} -#if defined(HYPRE_USING_GPU) HYPRE_Int hypre_CSRMatrixTransposeDevice(hypre_CSRMatrix *A, hypre_CSRMatrix **AT_ptr, HYPRE_Int data) { - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - HYPRE_Int nrows_A = hypre_CSRMatrixNumRows(A); - HYPRE_Int ncols_A = hypre_CSRMatrixNumCols(A); - HYPRE_Int nnz_A = hypre_CSRMatrixNumNonzeros(A); - HYPRE_Complex *C_data; - 
HYPRE_Int *C_i; - HYPRE_Int *C_j; - hypre_CSRMatrix *C; - - - /* trivial case */ - if (nnz_A == 0) - { - C_i = hypre_CTAlloc(HYPRE_Int, ncols_A + 1, HYPRE_MEMORY_DEVICE); - C_j = hypre_CTAlloc(HYPRE_Int, 0, HYPRE_MEMORY_DEVICE); - C_data = hypre_CTAlloc(HYPRE_Complex, 0, HYPRE_MEMORY_DEVICE); - } - else - { + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + HYPRE_Int nrows_A = hypre_CSRMatrixNumRows(A); + HYPRE_Int ncols_A = hypre_CSRMatrixNumCols(A); + HYPRE_Int nnz_A = hypre_CSRMatrixNumNonzeros(A); + HYPRE_Complex *C_data; + HYPRE_Int *C_i; + HYPRE_Int *C_j; + hypre_CSRMatrix *C; + + + /* trivial case */ + if (nnz_A == 0) + { + C_i = hypre_CTAlloc(HYPRE_Int, ncols_A + 1, HYPRE_MEMORY_DEVICE); + C_j = hypre_CTAlloc(HYPRE_Int, 0, HYPRE_MEMORY_DEVICE); + C_data = hypre_CTAlloc(HYPRE_Complex, 0, HYPRE_MEMORY_DEVICE); + } + else + { #if defined(HYPRE_USING_CUSPARSE) - hypreDevice_CSRSpTransCusparse(nrows_A, ncols_A, nnz_A, A_i, A_j, A_data, &C_i, &C_j, &C_data, - data); + hypreDevice_CSRSpTransCusparse(nrows_A, ncols_A, nnz_A, A_i, A_j, A_data, &C_i, &C_j, &C_data, + data); #elif defined(HYPRE_USING_ROCSPARSE) - hypreDevice_CSRSpTransRocsparse(nrows_A, ncols_A, nnz_A, A_i, A_j, A_data, &C_i, &C_j, &C_data, - data); + hypreDevice_CSRSpTransRocsparse(nrows_A, ncols_A, nnz_A, A_i, A_j, A_data, &C_i, &C_j, &C_data, + data); #elif defined(HYPRE_USING_ONEMKLSPARSE) - hypreDevice_CSRSpTransOnemklsparse(nrows_A, ncols_A, nnz_A, A_i, A_j, A_data, &C_i, &C_j, &C_data, - data); + hypreDevice_CSRSpTransOnemklsparse(nrows_A, ncols_A, nnz_A, A_i, A_j, A_data, &C_i, &C_j, &C_data, + data); #else - hypreDevice_CSRSpTrans(nrows_A, ncols_A, nnz_A, A_i, A_j, A_data, &C_i, &C_j, &C_data, data); + hypreDevice_CSRSpTrans(nrows_A, ncols_A, nnz_A, A_i, A_j, A_data, &C_i, &C_j, &C_data, data); #endif - } + } - C = hypre_CSRMatrixCreate(ncols_A, nrows_A, nnz_A); - hypre_CSRMatrixI(C) = C_i; - hypre_CSRMatrixJ(C) 
= C_j; - hypre_CSRMatrixData(C) = C_data; - hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE; + C = hypre_CSRMatrixCreate(ncols_A, nrows_A, nnz_A); + hypre_CSRMatrixI(C) = C_i; + hypre_CSRMatrixJ(C) = C_j; + hypre_CSRMatrixData(C) = C_data; + hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE; - *AT_ptr = C; + *AT_ptr = C; - hypre_SyncDeviceComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); - return hypre_error_flag; + return hypre_error_flag; } #endif /* #if defined(HYPRE_USING_GPU) */ diff --git a/src/utilities/HYPRE_utilities.h b/src/utilities/HYPRE_utilities.h index 14fe32b136..f8bbb154f8 100644 --- a/src/utilities/HYPRE_utilities.h +++ b/src/utilities/HYPRE_utilities.h @@ -185,11 +185,15 @@ HYPRE_Int HYPRE_AssumedPartitionCheck(); * HYPRE memory location *--------------------------------------------------------------------------*/ +// ABB: HYPRE_MEMORY_UNIFIED for the case of allocating SHARED memory +// specifically at some locations and everywhere as can be enabled +// with HYPRE_USING_UNIFIED_MEMORY macro typedef enum _HYPRE_MemoryLocation { HYPRE_MEMORY_UNDEFINED = -1, HYPRE_MEMORY_HOST, - HYPRE_MEMORY_DEVICE + HYPRE_MEMORY_DEVICE, + HYPRE_MEMORY_UNIFIED } HYPRE_MemoryLocation; HYPRE_Int HYPRE_SetMemoryLocation(HYPRE_MemoryLocation memory_location); diff --git a/src/utilities/_hypre_utilities.h b/src/utilities/_hypre_utilities.h index d32c9a0c79..d26bf1927b 100644 --- a/src/utilities/_hypre_utilities.h +++ b/src/utilities/_hypre_utilities.h @@ -635,6 +635,11 @@ hypre_GetActualMemLocation(HYPRE_MemoryLocation location) #endif } + if (location == HYPRE_MEMORY_UNIFIED) + { + return hypre_MEMORY_UNIFIED; + } + return hypre_MEMORY_UNDEFINED; } diff --git a/src/utilities/_hypre_utilities.hpp b/src/utilities/_hypre_utilities.hpp index e93a2b55c6..ee4ece0d96 100644 --- a/src/utilities/_hypre_utilities.hpp +++ b/src/utilities/_hypre_utilities.hpp @@ -110,6 +110,8 @@ struct hypre_device_allocator #elif defined(HYPRE_USING_SYCL) 
+typedef sycl::range<1> dim3; + /* WM: problems with this being inside extern C++ {} */ /* #include */ @@ -392,17 +394,22 @@ struct hypre_GpuMatData #define hypre_GpuMatDataMatInfo(data) ((data) -> mat_info) #define hypre_GpuMatDataSpMVBuffer(data) ((data) -> spmv_buffer) +/* device_utils.c, some common functions for CUDA, SYCL, HIP */ + +dim3 hypre_GetDefaultDeviceBlockDimension(); + +dim3 hypre_GetDefaultDeviceGridDimension( HYPRE_Int n, const char *granularity, + dim3 bDim ); + +HYPRE_Int hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, + HYPRE_Int *d_row_ind); + #endif //#if defined(HYPRE_USING_GPU) #if defined(HYPRE_USING_SYCL) /* device_utils.c */ HYPRE_Int HYPRE_SetSYCLDevice(sycl::device user_device); -sycl::range<1> hypre_GetDefaultDeviceBlockDimension(); - -sycl::range<1> hypre_GetDefaultDeviceGridDimension( HYPRE_Int n, const char *granularity, - sycl::range<1> bDim ); - #endif // #if defined(HYPRE_USING_SYCL) #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) @@ -1025,9 +1032,6 @@ HYPRE_Int hypreDevice_IntegerExclusiveScan(HYPRE_Int n, HYPRE_Int *d_i); HYPRE_Int* hypreDevice_CsrRowPtrsToIndices(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr); -HYPRE_Int hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, - HYPRE_Int *d_row_ind); - HYPRE_Int* hypreDevice_CsrRowIndicesToPtrs(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind); HYPRE_Int hypreDevice_CsrRowIndicesToPtrs_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind, @@ -1062,20 +1066,19 @@ void hypre_DeviceDataCubCachingAllocatorDestroy(hypre_DeviceData *data); #if defined(HYPRE_USING_SYCL) -#define PSTL_USE_PARALLEL_POLICIES 0 // for libstdc++ 9 -#define _GLIBCXX_USE_TBB_PAR_BACKEND 0 // for libstdc++ 10 +#pragma once -// #include -// #include -// #include -// #include +#include +#include +#include +#include -//#include // dpct::remove_if, remove_copy_if, copy_if +#include // dpct::remove_if, 
remove_copy_if, copy_if, scatter_if -// #include -// #include -// #include -// #include +#include +#include +#include +#include #define __forceinline__ __inline__ __attribute__((always_inline)) @@ -1104,7 +1107,7 @@ void hypre_DeviceDataCubCachingAllocatorDestroy(hypre_DeviceData *data); } \ else \ { \ - hypre_DeviceDataComputeStream(hypre_handle())->parallel_for(sycl::nd_range<1>(gridsize*blocksize, blocksize), \ + hypre_HandleComputeStream(hypre_handle())->parallel_for(sycl::nd_range<1>(gridsize*blocksize, blocksize), \ [=] (sycl::nd_item<1> item) [[intel::reqd_sub_group_size(HYPRE_WARP_SIZE)]] { \ (kernel_name)(item, __VA_ARGS__); \ }); \ @@ -1115,7 +1118,7 @@ void hypre_DeviceDataCubCachingAllocatorDestroy(hypre_DeviceData *data); * The following one works OK for now */ #define HYPRE_ONEDPL_CALL(func_name, ...) \ - func_name(oneapi::dpl::execution::make_device_policy(*hypre_DeviceDataComputeStream(hypre_handle()), __VA_ARGS__); + func_name(oneapi::dpl::execution::make_device_policy(*hypre_HandleComputeStream(hypre_handle())), __VA_ARGS__); // /* return the number of threads in block */ // template @@ -1431,16 +1434,12 @@ T warp_reduce_sum(T in, sycl::nd_item<1>& item) // }; template -struct in_range : public std::unary_function +struct in_range { T low, up; - in_range(T low_, T up_) { low = low_; up = up_; } - bool operator()(const T &x) const - { - return (x >= low && x <= up); - } + bool operator()(const T &x) const { return (x >= low && x <= up); } }; // template @@ -1456,15 +1455,25 @@ struct in_range : public std::unary_function // } // }; -template -struct less_than : std::unary_function +#ifdef HYPRE_COMPLEX +template::value>::type> +struct less_than { - T val; - less_than(T val_) { val = val_; } - - bool operator()(const T &x) const { return (x < val); } + T val; + less_than(T val_) { val = val_; } + bool operator()(const T &x) const { return (hypre_abs(x) < hypre_abs(val)); } }; - +#else +template::value>::type> +struct less_than +{ + T val; + 
less_than(T val_) { val = val_; } + bool operator()(const T &x) const { return (x < val); } +}; +#endif // template // struct modulo : public thrust::unary_function // { diff --git a/src/utilities/device_utils.c b/src/utilities/device_utils.c index d0594c7166..d66c9a500d 100644 --- a/src/utilities/device_utils.c +++ b/src/utilities/device_utils.c @@ -9,15 +9,15 @@ #include "_hypre_utilities.hpp" #if defined(HYPRE_USING_SYCL) -sycl::range<1> hypre_GetDefaultDeviceBlockDimension() +dim3 hypre_GetDefaultDeviceBlockDimension() { sycl::range<1> wgDim(hypre_HandleDeviceMaxWorkGroupSize(hypre_handle())); return wgDim; } -sycl::range<1> hypre_GetDefaultDeviceGridDimension(HYPRE_Int n, - const char *granularity, - sycl::range<1> wgDim) +dim3 hypre_GetDefaultDeviceGridDimension(HYPRE_Int n, + const char *granularity, + sycl::range<1> wgDim) { HYPRE_Int num_WGs = 0; HYPRE_Int num_workitems_per_WG = wgDim[0]; @@ -42,7 +42,67 @@ sycl::range<1> hypre_GetDefaultDeviceGridDimension(HYPRE_Int n, return gDim; } -#endif + + +// /** +// * Get NNZ of each row in d_row_indices and stored the results in d_rownnz +// * All pointers are device pointers. 
+// * d_rownnz can be the same as d_row_indices +// */ +// void +// hypreCUDAKernel_GetRowNnz(sycl::nd_item<1>& item, +// HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int *d_diag_ia, +// HYPRE_Int *d_offd_ia, +// HYPRE_Int *d_rownnz) +// { +// const HYPRE_Int global_thread_id = hypre_sycl_get_grid_thread_id<1, 1>(item); + +// if (global_thread_id < nrows) +// { +// HYPRE_Int i; + +// if (d_row_indices) +// { +// i = read_only_load(&d_row_indices[global_thread_id]); +// } +// else +// { +// i = global_thread_id; +// } + +// d_rownnz[global_thread_id] = read_only_load(&d_diag_ia[i + 1]) - read_only_load(&d_diag_ia[i]) + +// read_only_load(&d_offd_ia[i + 1]) - read_only_load(&d_offd_ia[i]); +// } +// } + +HYPRE_Int +hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, + HYPRE_Int *d_row_ind) +{ + /* trivial case */ + if (nrows <= 0 || nnz <= 0) + { + return hypre_error_flag; + } + + HYPRE_ONEDPL_CALL( std::fill, d_row_ind, d_row_ind + nnz, 0 ); + + // TODO: need to fix this by passing a "predicate" as last argument + HYPRE_ONEDPL_CALL( dpct::scatter_if, + oneapi::dpl::counting_iterator(0), + oneapi::dpl::counting_iterator(nrows), + d_row_ptr, + oneapi::dpl::make_transform_iterator( oneapi::dpl::make_zip_iterator(d_row_ptr, d_row_ptr + 1), + [](auto t) { return std::get<0>(t) != std::get<1>(t); } ), + d_row_ind ); + + HYPRE_ONEDPL_CALL( std::inclusive_scan, d_row_ind, d_row_ind + nnz, d_row_ind, + sycl::maximum() ); + + return hypre_error_flag; +} + +#endif // HYPRE_USING_SYCL #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) @@ -1570,4 +1630,3 @@ hypre_bind_device( HYPRE_Int myid, return hypre_error_flag; } - diff --git a/src/utilities/device_utils.h b/src/utilities/device_utils.h index 59549ca6db..e8fa14cca2 100644 --- a/src/utilities/device_utils.h +++ b/src/utilities/device_utils.h @@ -53,6 +53,8 @@ #elif defined(HYPRE_USING_SYCL) +typedef sycl::range<1> dim3; + /* WM: problems with this being inside extern C++ {} */ 
/* #include */ @@ -335,17 +337,22 @@ struct hypre_GpuMatData #define hypre_GpuMatDataMatInfo(data) ((data) -> mat_info) #define hypre_GpuMatDataSpMVBuffer(data) ((data) -> spmv_buffer) +/* device_utils.c, some common functions for CUDA, SYCL, HIP */ + +dim3 hypre_GetDefaultDeviceBlockDimension(); + +dim3 hypre_GetDefaultDeviceGridDimension( HYPRE_Int n, const char *granularity, + dim3 bDim ); + +HYPRE_Int hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, + HYPRE_Int *d_row_ind); + #endif //#if defined(HYPRE_USING_GPU) #if defined(HYPRE_USING_SYCL) /* device_utils.c */ HYPRE_Int HYPRE_SetSYCLDevice(sycl::device user_device); -sycl::range<1> hypre_GetDefaultDeviceBlockDimension(); - -sycl::range<1> hypre_GetDefaultDeviceGridDimension( HYPRE_Int n, const char *granularity, - sycl::range<1> bDim ); - #endif // #if defined(HYPRE_USING_SYCL) #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) @@ -968,9 +975,6 @@ HYPRE_Int hypreDevice_IntegerExclusiveScan(HYPRE_Int n, HYPRE_Int *d_i); HYPRE_Int* hypreDevice_CsrRowPtrsToIndices(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr); -HYPRE_Int hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, - HYPRE_Int *d_row_ind); - HYPRE_Int* hypreDevice_CsrRowIndicesToPtrs(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind); HYPRE_Int hypreDevice_CsrRowIndicesToPtrs_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind, @@ -1005,20 +1009,19 @@ void hypre_DeviceDataCubCachingAllocatorDestroy(hypre_DeviceData *data); #if defined(HYPRE_USING_SYCL) -#define PSTL_USE_PARALLEL_POLICIES 0 // for libstdc++ 9 -#define _GLIBCXX_USE_TBB_PAR_BACKEND 0 // for libstdc++ 10 +#pragma once -// #include -// #include -// #include -// #include +#include +#include +#include +#include -//#include // dpct::remove_if, remove_copy_if, copy_if +#include // dpct::remove_if, remove_copy_if, copy_if, scatter_if -// #include -// #include -// #include -// #include +#include 
+#include +#include +#include #define __forceinline__ __inline__ __attribute__((always_inline)) @@ -1047,7 +1050,7 @@ void hypre_DeviceDataCubCachingAllocatorDestroy(hypre_DeviceData *data); } \ else \ { \ - hypre_DeviceDataComputeStream(hypre_handle())->parallel_for(sycl::nd_range<1>(gridsize*blocksize, blocksize), \ + hypre_HandleComputeStream(hypre_handle())->parallel_for(sycl::nd_range<1>(gridsize*blocksize, blocksize), \ [=] (sycl::nd_item<1> item) [[intel::reqd_sub_group_size(HYPRE_WARP_SIZE)]] { \ (kernel_name)(item, __VA_ARGS__); \ }); \ @@ -1058,7 +1061,7 @@ void hypre_DeviceDataCubCachingAllocatorDestroy(hypre_DeviceData *data); * The following one works OK for now */ #define HYPRE_ONEDPL_CALL(func_name, ...) \ - func_name(oneapi::dpl::execution::make_device_policy(*hypre_DeviceDataComputeStream(hypre_handle()), __VA_ARGS__); + func_name(oneapi::dpl::execution::make_device_policy(*hypre_HandleComputeStream(hypre_handle())), __VA_ARGS__); // /* return the number of threads in block */ // template diff --git a/src/utilities/memory.h b/src/utilities/memory.h index bd815020c1..6fcaa29a01 100644 --- a/src/utilities/memory.h +++ b/src/utilities/memory.h @@ -122,6 +122,11 @@ hypre_GetActualMemLocation(HYPRE_MemoryLocation location) #endif } + if (location == HYPRE_MEMORY_UNIFIED) + { + return hypre_MEMORY_UNIFIED; + } + return hypre_MEMORY_UNDEFINED; } From 9eb1f7f38d268d5400faede532f8f03cfaa0e735 Mon Sep 17 00:00:00 2001 From: Abhishek Bagusetty Date: Fri, 10 Dec 2021 20:23:25 +0000 Subject: [PATCH 37/44] [SYCL] enable oneDPL and some more updates --- src/config/configure.in | 2 +- src/configure | 2 +- src/struct_ls/pfmg_setup.c | 5 - src/struct_mv/struct_innerprod.c | 4 - src/utilities/_hypre_utilities.hpp | 15 +- src/utilities/device_utils.c | 226 ++++++++++++++--------------- src/utilities/device_utils.h | 15 +- 7 files changed, 130 insertions(+), 139 deletions(-) diff --git a/src/config/configure.in b/src/config/configure.in index 06e6a22796..8edcabc68c 
100644 --- a/src/config/configure.in +++ b/src/config/configure.in @@ -2316,7 +2316,7 @@ AS_IF([test x"$hypre_using_sycl" == x"yes"], if test "$hypre_user_chose_cuflags" = "no" then - CUFLAGS="-fsycl -fsycl-unnamed-lambda -fsycl-device-code-split=per_kernel" + CUFLAGS="-D_GLIBCXX_USE_TBB_PAR_BACKEND=0 -fsycl -fsycl-unnamed-lambda -fsycl-device-code-split=per_kernel" if test "$hypre_using_debug" = "yes" then CUFLAGS="-O0 -Wall -g ${CUFLAGS}" diff --git a/src/configure b/src/configure index 7993465afb..66d6707f63 100755 --- a/src/configure +++ b/src/configure @@ -9143,7 +9143,7 @@ $as_echo "#define HYPRE_USING_SYCL 1" >>confdefs.h if test "$hypre_user_chose_cuflags" = "no" then - CUFLAGS="-fsycl -fsycl-unnamed-lambda -fsycl-device-code-split=per_kernel" + CUFLAGS="-D_GLIBCXX_USE_TBB_PAR_BACKEND=0 -fsycl -fsycl-unnamed-lambda -fsycl-device-code-split=per_kernel" if test "$hypre_using_debug" = "yes" then CUFLAGS="-O0 -Wall -g ${CUFLAGS}" diff --git a/src/struct_ls/pfmg_setup.c b/src/struct_ls/pfmg_setup.c index ad8afa5e1b..c30ba6d8e0 100644 --- a/src/struct_ls/pfmg_setup.c +++ b/src/struct_ls/pfmg_setup.c @@ -1695,7 +1695,6 @@ hypre_PFMGComputeDxyz_SS19( HYPRE_Int bi, HYPRE_Real tcy = -diag * (a_cs[Ai] + a_cn[Ai] + a_an[Ai] + a_as[Ai] + a_bn[Ai] + a_bs[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); cyb += tcy; -#endif } hypre_BoxLoop1ReductionEnd(Ai, cyb) @@ -1707,7 +1706,6 @@ hypre_PFMGComputeDxyz_SS19( HYPRE_Int bi, HYPRE_Real tcz = -diag * (a_ac[Ai] + a_bc[Ai] + a_aw[Ai] + a_ae[Ai] + a_an[Ai] + a_as[Ai] + a_bw[Ai] + a_be[Ai] + a_bn[Ai] + a_bs[Ai]); czb += tcz; -#endif } hypre_BoxLoop1ReductionEnd(Ai, czb) @@ -1995,7 +1993,6 @@ hypre_PFMGComputeDxyz_SS27( HYPRE_Int bi, tcx -= diag * (a_asw[Ai] + a_ase[Ai] + a_anw[Ai] + a_ane[Ai] + a_bsw[Ai] + a_bse[Ai] + a_bnw[Ai] + a_bne[Ai]); cxb += tcx; -#endif } hypre_BoxLoop1ReductionEnd(Ai, cxb) @@ -2010,7 +2007,6 @@ hypre_PFMGComputeDxyz_SS27( HYPRE_Int bi, tcy -= diag * (a_asw[Ai] + a_ase[Ai] + a_anw[Ai] + a_ane[Ai] + 
a_bsw[Ai] + a_bse[Ai] + a_bnw[Ai] + a_bne[Ai]); cyb += tcy; -#endif } hypre_BoxLoop1ReductionEnd(Ai, cyb) @@ -2025,7 +2021,6 @@ hypre_PFMGComputeDxyz_SS27( HYPRE_Int bi, tcz -= diag * (a_asw[Ai] + a_ase[Ai] + a_anw[Ai] + a_ane[Ai] + a_bsw[Ai] + a_bse[Ai] + a_bnw[Ai] + a_bne[Ai]); czb += tcz; -#endif } hypre_BoxLoop1ReductionEnd(Ai, czb) diff --git a/src/struct_mv/struct_innerprod.c b/src/struct_mv/struct_innerprod.c index a32c06e0e4..81d8d27f70 100644 --- a/src/struct_mv/struct_innerprod.c +++ b/src/struct_mv/struct_innerprod.c @@ -89,11 +89,7 @@ hypre_StructInnerProd( hypre_StructVector *x, box_sum) { HYPRE_Real tmp = xp[xi] * hypre_conj(yp[yi]); -#if defined(HYPRE_USING_SYCL) - hypre_sycl_sum += tmp; -#else box_sum += tmp; -#endif } hypre_BoxLoop2ReductionEnd(xi, yi, box_sum); diff --git a/src/utilities/_hypre_utilities.hpp b/src/utilities/_hypre_utilities.hpp index ee4ece0d96..277d4b9176 100644 --- a/src/utilities/_hypre_utilities.hpp +++ b/src/utilities/_hypre_utilities.hpp @@ -111,6 +111,7 @@ struct hypre_device_allocator #elif defined(HYPRE_USING_SYCL) typedef sycl::range<1> dim3; +#define __global__ /* WM: problems with this being inside extern C++ {} */ /* #include */ @@ -404,6 +405,13 @@ dim3 hypre_GetDefaultDeviceGridDimension( HYPRE_Int n, const char *granularity, HYPRE_Int hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, HYPRE_Int *d_row_ind); +HYPRE_Int* hypreDevice_CsrRowPtrsToIndices(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr); + +HYPRE_Int* hypreDevice_CsrRowIndicesToPtrs(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind); + +HYPRE_Int hypreDevice_CsrRowIndicesToPtrs_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind, + HYPRE_Int *d_row_ptr); + #endif //#if defined(HYPRE_USING_GPU) #if defined(HYPRE_USING_SYCL) @@ -1030,13 +1038,6 @@ HYPRE_Int hypreDevice_IntegerInclusiveScan(HYPRE_Int n, HYPRE_Int *d_i); HYPRE_Int hypreDevice_IntegerExclusiveScan(HYPRE_Int n, HYPRE_Int *d_i); -HYPRE_Int* 
hypreDevice_CsrRowPtrsToIndices(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr); - -HYPRE_Int* hypreDevice_CsrRowIndicesToPtrs(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind); - -HYPRE_Int hypreDevice_CsrRowIndicesToPtrs_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind, - HYPRE_Int *d_row_ptr); - HYPRE_Int hypreDevice_GenScatterAdd(HYPRE_Real *x, HYPRE_Int ny, HYPRE_Int *map, HYPRE_Real *y, char *work); diff --git a/src/utilities/device_utils.c b/src/utilities/device_utils.c index d66c9a500d..85721a8145 100644 --- a/src/utilities/device_utils.c +++ b/src/utilities/device_utils.c @@ -8,6 +8,109 @@ #include "_hypre_utilities.h" #include "_hypre_utilities.hpp" +// some common kernels for CUDA, HIP and SYCL +#ifdef HYPRE_USING_GPU + +/** + * Get NNZ of each row in d_row_indices and stored the results in d_rownnz + * All pointers are device pointers. + * d_rownnz can be the same as d_row_indices + */ +__global__ void +hypreGPUKernel_GetRowNnz( + #ifdef HYPRE_USING_SYCL + sycl::nd_item<1> item, + #endif + HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int *d_diag_ia, + HYPRE_Int *d_offd_ia, + HYPRE_Int *d_rownnz) +{ + +#ifdef HYPRE_USING_CUDA + const HYPRE_Int global_thread_id = hypre_cuda_get_grid_thread_id<1, 1>(); +#elif defined(HYPRE_USING_SYCL) + const HYPRE_Int global_thread_id = hypre_gpu_get_grid_thread_id<1,1>(item); +#endif + + if (global_thread_id < nrows) + { + HYPRE_Int i; + + if (d_row_indices) + { + i = read_only_load(&d_row_indices[global_thread_id]); + } + else + { + i = global_thread_id; + } + + d_rownnz[global_thread_id] = read_only_load(&d_diag_ia[i + 1]) - read_only_load(&d_diag_ia[i]) + + read_only_load(&d_offd_ia[i + 1]) - read_only_load(&d_offd_ia[i]); + } +} + +HYPRE_Int* +hypreDevice_CsrRowIndicesToPtrs(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind) +{ + HYPRE_Int *d_row_ptr = hypre_TAlloc(HYPRE_Int, nrows + 1, HYPRE_MEMORY_DEVICE); + +#ifdef HYPRE_USING_CUDA + HYPRE_THRUST_CALL( lower_bound, + d_row_ind, d_row_ind + 
nnz, + thrust::counting_iterator(0), + thrust::counting_iterator(nrows + 1), + d_row_ptr); +#elif defined(HYPRE_USING_SYCL) + HYPRE_ONEDPL_CALL( oneapi::dpl::lower_bound, + d_row_ind, d_row_ind + nnz, + oneapi::dpl::counting_iterator(0), + oneapi::dpl::counting_iterator(nrows + 1), + d_row_ptr); +#endif + + return d_row_ptr; +} + +HYPRE_Int +hypreDevice_CsrRowIndicesToPtrs_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind, + HYPRE_Int *d_row_ptr) +{ +#ifdef HYPRE_USING_CUDA + HYPRE_THRUST_CALL( lower_bound, + d_row_ind, d_row_ind + nnz, + thrust::counting_iterator(0), + thrust::counting_iterator(nrows + 1), + d_row_ptr); +#elif defined(HYPRE_USING_SYCL) + HYPRE_ONEDPL_CALL( oneapi::dpl::lower_bound, + d_row_ind, d_row_ind + nnz, + oneapi::dpl::counting_iterator(0), + oneapi::dpl::counting_iterator(nrows + 1), + d_row_ptr); +#endif + + return hypre_error_flag; +} + +HYPRE_Int* +hypreDevice_CsrRowPtrsToIndices(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr) +{ + /* trivial case */ + if (nrows <= 0 || nnz <= 0) + { + return NULL; + } + + HYPRE_Int *d_row_ind = hypre_TAlloc(HYPRE_Int, nnz, HYPRE_MEMORY_DEVICE); + + hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnz, d_row_ptr, d_row_ind); + + return d_row_ind; +} + +#endif // HYPRE_USING_GPU + #if defined(HYPRE_USING_SYCL) dim3 hypre_GetDefaultDeviceBlockDimension() { @@ -43,38 +146,6 @@ dim3 hypre_GetDefaultDeviceGridDimension(HYPRE_Int n, return gDim; } - -// /** -// * Get NNZ of each row in d_row_indices and stored the results in d_rownnz -// * All pointers are device pointers. 
-// * d_rownnz can be the same as d_row_indices -// */ -// void -// hypreCUDAKernel_GetRowNnz(sycl::nd_item<1>& item, -// HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int *d_diag_ia, -// HYPRE_Int *d_offd_ia, -// HYPRE_Int *d_rownnz) -// { -// const HYPRE_Int global_thread_id = hypre_sycl_get_grid_thread_id<1, 1>(item); - -// if (global_thread_id < nrows) -// { -// HYPRE_Int i; - -// if (d_row_indices) -// { -// i = read_only_load(&d_row_indices[global_thread_id]); -// } -// else -// { -// i = global_thread_id; -// } - -// d_rownnz[global_thread_id] = read_only_load(&d_diag_ia[i + 1]) - read_only_load(&d_diag_ia[i]) + -// read_only_load(&d_offd_ia[i + 1]) - read_only_load(&d_offd_ia[i]); -// } -// } - HYPRE_Int hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, HYPRE_Int *d_row_ind) @@ -87,14 +158,14 @@ hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_ HYPRE_ONEDPL_CALL( std::fill, d_row_ind, d_row_ind + nnz, 0 ); - // TODO: need to fix this by passing a "predicate" as last argument - HYPRE_ONEDPL_CALL( dpct::scatter_if, - oneapi::dpl::counting_iterator(0), - oneapi::dpl::counting_iterator(nrows), - d_row_ptr, - oneapi::dpl::make_transform_iterator( oneapi::dpl::make_zip_iterator(d_row_ptr, d_row_ptr + 1), - [](auto t) { return std::get<0>(t) != std::get<1>(t); } ), - d_row_ind ); + /* // TODO: need to fix this by passing a "predicate" as last argument */ + /* HYPRE_ONEDPL_CALL( dpct::scatter_if, */ + /* oneapi::dpl::counting_iterator(0), */ + /* oneapi::dpl::counting_iterator(nrows), */ + /* d_row_ptr, */ + /* oneapi::dpl::make_transform_iterator( oneapi::dpl::make_zip_iterator(d_row_ptr, d_row_ptr + 1), */ + /* [](auto t) { return std::get<0>(t) != std::get<1>(t); } ), */ + /* d_row_ind ); */ HYPRE_ONEDPL_CALL( std::inclusive_scan, d_row_ind, d_row_ind + nnz, d_row_ind, sycl::maximum() ); @@ -210,36 +281,6 @@ hypre_GetDefaultDeviceGridDimension( HYPRE_Int n, return gDim; } -/** - * Get NNZ 
of each row in d_row_indices and stored the results in d_rownnz - * All pointers are device pointers. - * d_rownnz can be the same as d_row_indices - */ -__global__ void -hypreCUDAKernel_GetRowNnz(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int *d_diag_ia, - HYPRE_Int *d_offd_ia, - HYPRE_Int *d_rownnz) -{ - const HYPRE_Int global_thread_id = hypre_cuda_get_grid_thread_id<1, 1>(); - - if (global_thread_id < nrows) - { - HYPRE_Int i; - - if (d_row_indices) - { - i = read_only_load(&d_row_indices[global_thread_id]); - } - else - { - i = global_thread_id; - } - - d_rownnz[global_thread_id] = read_only_load(&d_diag_ia[i + 1]) - read_only_load(&d_diag_ia[i]) + - read_only_load(&d_offd_ia[i + 1]) - read_only_load(&d_offd_ia[i]); - } -} - /* special case: if d_row_indices == NULL, it means d_row_indices=[0,1,...,nrows-1] */ HYPRE_Int hypreDevice_GetRowNnz(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int *d_diag_ia, @@ -255,7 +296,7 @@ hypreDevice_GetRowNnz(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int *d_di return hypre_error_flag; } - HYPRE_GPU_LAUNCH( hypreCUDAKernel_GetRowNnz, gDim, bDim, nrows, d_row_indices, d_diag_ia, + HYPRE_GPU_LAUNCH( hypreGPUKernel_GetRowNnz, gDim, bDim, nrows, d_row_indices, d_diag_ia, d_offd_ia, d_rownnz ); return hypre_error_flag; @@ -465,22 +506,6 @@ struct hypre_empty_row_functor } }; -HYPRE_Int* -hypreDevice_CsrRowPtrsToIndices(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr) -{ - /* trivial case */ - if (nrows <= 0 || nnz <= 0) - { - return NULL; - } - - HYPRE_Int *d_row_ind = hypre_TAlloc(HYPRE_Int, nnz, HYPRE_MEMORY_DEVICE); - - hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnz, d_row_ptr, d_row_ind); - - return d_row_ind; -} - HYPRE_Int hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, HYPRE_Int *d_row_ind) @@ -539,33 +564,6 @@ template HYPRE_Int hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HY HYPRE_Int *d_row_ptr, HYPRE_BigInt *d_row_num, HYPRE_BigInt *d_row_ind); 
#endif -HYPRE_Int* -hypreDevice_CsrRowIndicesToPtrs(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind) -{ - HYPRE_Int *d_row_ptr = hypre_TAlloc(HYPRE_Int, nrows + 1, HYPRE_MEMORY_DEVICE); - - HYPRE_THRUST_CALL( lower_bound, - d_row_ind, d_row_ind + nnz, - thrust::counting_iterator(0), - thrust::counting_iterator(nrows + 1), - d_row_ptr); - - return d_row_ptr; -} - -HYPRE_Int -hypreDevice_CsrRowIndicesToPtrs_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind, - HYPRE_Int *d_row_ptr) -{ - HYPRE_THRUST_CALL( lower_bound, - d_row_ind, d_row_ind + nnz, - thrust::counting_iterator(0), - thrust::counting_iterator(nrows + 1), - d_row_ptr); - - return hypre_error_flag; -} - __global__ void hypreCUDAKernel_ScatterAddTrivial(HYPRE_Int n, HYPRE_Real *x, HYPRE_Int *map, HYPRE_Real *y) { diff --git a/src/utilities/device_utils.h b/src/utilities/device_utils.h index e8fa14cca2..54d9c8a620 100644 --- a/src/utilities/device_utils.h +++ b/src/utilities/device_utils.h @@ -54,6 +54,7 @@ #elif defined(HYPRE_USING_SYCL) typedef sycl::range<1> dim3; +#define __global__ /* WM: problems with this being inside extern C++ {} */ /* #include */ @@ -347,6 +348,13 @@ dim3 hypre_GetDefaultDeviceGridDimension( HYPRE_Int n, const char *granularity, HYPRE_Int hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, HYPRE_Int *d_row_ind); +HYPRE_Int* hypreDevice_CsrRowPtrsToIndices(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr); + +HYPRE_Int* hypreDevice_CsrRowIndicesToPtrs(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind); + +HYPRE_Int hypreDevice_CsrRowIndicesToPtrs_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind, + HYPRE_Int *d_row_ptr); + #endif //#if defined(HYPRE_USING_GPU) #if defined(HYPRE_USING_SYCL) @@ -973,13 +981,6 @@ HYPRE_Int hypreDevice_IntegerInclusiveScan(HYPRE_Int n, HYPRE_Int *d_i); HYPRE_Int hypreDevice_IntegerExclusiveScan(HYPRE_Int n, HYPRE_Int *d_i); -HYPRE_Int* hypreDevice_CsrRowPtrsToIndices(HYPRE_Int nrows, HYPRE_Int 
nnz, HYPRE_Int *d_row_ptr); - -HYPRE_Int* hypreDevice_CsrRowIndicesToPtrs(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind); - -HYPRE_Int hypreDevice_CsrRowIndicesToPtrs_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind, - HYPRE_Int *d_row_ptr); - HYPRE_Int hypreDevice_GenScatterAdd(HYPRE_Real *x, HYPRE_Int ny, HYPRE_Int *map, HYPRE_Real *y, char *work); From f11b593119a7104899ca5c7fa989c5bb4752ea70 Mon Sep 17 00:00:00 2001 From: Abhishek Bagusetty Date: Tue, 14 Dec 2021 09:58:53 +0000 Subject: [PATCH 38/44] [SYCL] adding sycl::gather and few more common GPU functions --- src/utilities/_hypre_utilities.hpp | 42 ++++++-- src/utilities/device_utils.c | 157 ++++++++++++++++------------- src/utilities/device_utils.h | 42 ++++++-- 3 files changed, 149 insertions(+), 92 deletions(-) diff --git a/src/utilities/_hypre_utilities.hpp b/src/utilities/_hypre_utilities.hpp index 277d4b9176..24530a4d38 100644 --- a/src/utilities/_hypre_utilities.hpp +++ b/src/utilities/_hypre_utilities.hpp @@ -412,6 +412,16 @@ HYPRE_Int* hypreDevice_CsrRowIndicesToPtrs(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE HYPRE_Int hypreDevice_CsrRowIndicesToPtrs_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind, HYPRE_Int *d_row_ptr); +HYPRE_Int hypreDevice_IntegerReduceSum(HYPRE_Int m, HYPRE_Int *d_i); + +HYPRE_Int hypreDevice_IntegerInclusiveScan(HYPRE_Int n, HYPRE_Int *d_i); + +HYPRE_Int hypreDevice_IntegerExclusiveScan(HYPRE_Int n, HYPRE_Int *d_i); + +template +HYPRE_Int hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, + HYPRE_Int *d_row_ptr, T *d_row_num, T *d_row_ind); + #endif //#if defined(HYPRE_USING_GPU) #if defined(HYPRE_USING_SYCL) @@ -1017,10 +1027,6 @@ hypreDevice_StableSortTupleByTupleKey(HYPRE_Int N, T1 *keys1, T2 *keys2, T3 *val template HYPRE_Int hypreDevice_ReduceByTupleKey(HYPRE_Int N, T1 *keys1_in, T2 *keys2_in, T3 *vals_in, T1 *keys1_out, T2 *keys2_out, T3 *vals_out); -template -HYPRE_Int hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, 
HYPRE_Int nnz, - HYPRE_Int *d_row_ptr, T *d_row_num, T *d_row_ind); - template HYPRE_Int hypreDevice_ScatterConstant(T *x, HYPRE_Int n, HYPRE_Int *map, T v); @@ -1032,12 +1038,6 @@ HYPRE_Int hypreDevice_CopyParCSRRows(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int *d_diag_j, HYPRE_Complex *d_diag_a, HYPRE_Int *d_offd_i, HYPRE_Int *d_offd_j, HYPRE_Complex *d_offd_a, HYPRE_Int *d_ib, HYPRE_BigInt *d_jb, HYPRE_Complex *d_ab); -HYPRE_Int hypreDevice_IntegerReduceSum(HYPRE_Int m, HYPRE_Int *d_i); - -HYPRE_Int hypreDevice_IntegerInclusiveScan(HYPRE_Int n, HYPRE_Int *d_i); - -HYPRE_Int hypreDevice_IntegerExclusiveScan(HYPRE_Int n, HYPRE_Int *d_i); - HYPRE_Int hypreDevice_GenScatterAdd(HYPRE_Real *x, HYPRE_Int ny, HYPRE_Int *map, HYPRE_Real *y, char *work); @@ -1089,6 +1089,28 @@ void hypre_DeviceDataCubCachingAllocatorDestroy(hypre_DeviceData *data); * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ +template +OutputIter hypreSycl_gather(InputIter1 map_first, InputIter1 map_last, + InputIter2 input_first, OutputIter result) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same< + typename std::iterator_traits::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same< + typename std::iterator_traits::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + auto perm_begin = + oneapi::dpl::make_permutation_iterator(input_first, map_first); + const int n = ::std::distance(map_first, map_last); + + return oneapi::dpl::copy(oneapi::dpl::execution::make_device_policy(*hypre_HandleComputeStream(hypre_handle())), + perm_begin, perm_begin + n, result); +} + #if defined(HYPRE_DEBUG) #if defined(HYPRE_USING_CUDA) #define GPU_LAUNCH_SYNC { hypre_SyncDeviceComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); } diff --git a/src/utilities/device_utils.c 
b/src/utilities/device_utils.c index 85721a8145..bf961145fd 100644 --- a/src/utilities/device_utils.c +++ b/src/utilities/device_utils.c @@ -25,11 +25,10 @@ hypreGPUKernel_GetRowNnz( HYPRE_Int *d_offd_ia, HYPRE_Int *d_rownnz) { - -#ifdef HYPRE_USING_CUDA - const HYPRE_Int global_thread_id = hypre_cuda_get_grid_thread_id<1, 1>(); -#elif defined(HYPRE_USING_SYCL) +#if defined(HYPRE_USING_SYCL) const HYPRE_Int global_thread_id = hypre_gpu_get_grid_thread_id<1,1>(item); +#else + const HYPRE_Int global_thread_id = hypre_cuda_get_grid_thread_id<1, 1>(); #endif if (global_thread_id < nrows) @@ -55,18 +54,18 @@ hypreDevice_CsrRowIndicesToPtrs(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row { HYPRE_Int *d_row_ptr = hypre_TAlloc(HYPRE_Int, nrows + 1, HYPRE_MEMORY_DEVICE); -#ifdef HYPRE_USING_CUDA - HYPRE_THRUST_CALL( lower_bound, - d_row_ind, d_row_ind + nnz, - thrust::counting_iterator(0), - thrust::counting_iterator(nrows + 1), - d_row_ptr); -#elif defined(HYPRE_USING_SYCL) +#if defined(HYPRE_USING_SYCL) HYPRE_ONEDPL_CALL( oneapi::dpl::lower_bound, d_row_ind, d_row_ind + nnz, oneapi::dpl::counting_iterator(0), oneapi::dpl::counting_iterator(nrows + 1), d_row_ptr); +#else + HYPRE_THRUST_CALL( lower_bound, + d_row_ind, d_row_ind + nnz, + thrust::counting_iterator(0), + thrust::counting_iterator(nrows + 1), + d_row_ptr); #endif return d_row_ptr; @@ -76,18 +75,18 @@ HYPRE_Int hypreDevice_CsrRowIndicesToPtrs_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind, HYPRE_Int *d_row_ptr) { -#ifdef HYPRE_USING_CUDA - HYPRE_THRUST_CALL( lower_bound, - d_row_ind, d_row_ind + nnz, - thrust::counting_iterator(0), - thrust::counting_iterator(nrows + 1), - d_row_ptr); -#elif defined(HYPRE_USING_SYCL) +#if defined(HYPRE_USING_SYCL) HYPRE_ONEDPL_CALL( oneapi::dpl::lower_bound, d_row_ind, d_row_ind + nnz, oneapi::dpl::counting_iterator(0), oneapi::dpl::counting_iterator(nrows + 1), d_row_ptr); +#else + HYPRE_THRUST_CALL( lower_bound, + d_row_ind, d_row_ind + nnz, + 
thrust::counting_iterator(0), + thrust::counting_iterator(nrows + 1), + d_row_ptr); #endif return hypre_error_flag; @@ -109,6 +108,73 @@ hypreDevice_CsrRowPtrsToIndices(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row return d_row_ind; } +HYPRE_Int +hypreDevice_IntegerReduceSum(HYPRE_Int n, HYPRE_Int *d_i) +{ +#ifdef HYPRE_USING_SYCL + return HYPRE_ONEDPL_CALL(oneapi::dpl::reduce, d_i, d_i + n); +#else + return HYPRE_THRUST_CALL(reduce, d_i, d_i + n); +#endif +} + +HYPRE_Int +hypreDevice_IntegerInclusiveScan(HYPRE_Int n, HYPRE_Int *d_i) +{ +#if defined(HYPRE_USING_SYCL) + HYPRE_ONEDPL_CALL(oneapi::dpl::inclusive_scan, d_i, d_i + n, d_i); +#else + HYPRE_THRUST_CALL(inclusive_scan, d_i, d_i + n, d_i); +#endif + return hypre_error_flag; +} + +HYPRE_Int +hypreDevice_IntegerExclusiveScan(HYPRE_Int n, HYPRE_Int *d_i) +{ +#if defined(HYPRE_USING_SYCL) + HYPRE_ONEDPL_CALL(oneapi::dpl::exclusive_scan, d_i, d_i + n, d_i); +#else + HYPRE_THRUST_CALL(exclusive_scan, d_i, d_i + n, d_i); +#endif + return hypre_error_flag; +} + +/* Input: d_row_num, of size nrows, contains the rows indices that can be BigInt or Int + * Output: d_row_ind */ +template +HYPRE_Int +hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, + T *d_row_num, T *d_row_ind) +{ + /* trivial case */ + if (nrows <= 0) + { + return hypre_error_flag; + } + + HYPRE_Int *map = hypre_TAlloc(HYPRE_Int, nnz, HYPRE_MEMORY_DEVICE); + + hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnz, d_row_ptr, map); + +#ifdef HYPRE_USING_SYCL + hypreSycl_gather(map, map + nnz, d_row_num, d_row_ind); +#else + HYPRE_THRUST_CALL(gather, map, map + nnz, d_row_num, d_row_ind); +#endif + + hypre_TFree(map, HYPRE_MEMORY_DEVICE); + + return hypre_error_flag; +} + +template HYPRE_Int hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, + HYPRE_Int *d_row_ptr, HYPRE_Int *d_row_num, HYPRE_Int *d_row_ind); +#if defined(HYPRE_MIXEDINT) +template HYPRE_Int 
hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, + HYPRE_Int *d_row_ptr, HYPRE_BigInt *d_row_num, HYPRE_BigInt *d_row_ind); +#endif + #endif // HYPRE_USING_GPU #if defined(HYPRE_USING_SYCL) @@ -167,7 +233,7 @@ hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_ /* [](auto t) { return std::get<0>(t) != std::get<1>(t); } ), */ /* d_row_ind ); */ - HYPRE_ONEDPL_CALL( std::inclusive_scan, d_row_ind, d_row_ind + nnz, d_row_ind, + HYPRE_ONEDPL_CALL( oneapi::dpl::inclusive_scan, d_row_ind, d_row_ind + nnz, d_row_ind, sycl::maximum() ); return hypre_error_flag; @@ -445,28 +511,6 @@ hypreDevice_CopyParCSRRows(HYPRE_Int nrows, return hypre_error_flag; } -HYPRE_Int -hypreDevice_IntegerReduceSum(HYPRE_Int n, HYPRE_Int *d_i) -{ - return HYPRE_THRUST_CALL(reduce, d_i, d_i + n); -} - -HYPRE_Int -hypreDevice_IntegerInclusiveScan(HYPRE_Int n, HYPRE_Int *d_i) -{ - HYPRE_THRUST_CALL(inclusive_scan, d_i, d_i + n, d_i); - - return hypre_error_flag; -} - -HYPRE_Int -hypreDevice_IntegerExclusiveScan(HYPRE_Int n, HYPRE_Int *d_i) -{ - HYPRE_THRUST_CALL(exclusive_scan, d_i, d_i + n, d_i); - - return hypre_error_flag; -} - HYPRE_Int hypreDevice_Scalen(HYPRE_Complex *d_x, size_t n, HYPRE_Complex v) { @@ -533,37 +577,6 @@ hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_ return hypre_error_flag; } -/* Input: d_row_num, of size nrows, contains the rows indices that can be BigInt or Int - * Output: d_row_ind */ -template -HYPRE_Int -hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, - T *d_row_num, T *d_row_ind) -{ - /* trivial case */ - if (nrows <= 0) - { - return hypre_error_flag; - } - - HYPRE_Int *map = hypre_TAlloc(HYPRE_Int, nnz, HYPRE_MEMORY_DEVICE); - - hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnz, d_row_ptr, map); - - HYPRE_THRUST_CALL(gather, map, map + nnz, d_row_num, d_row_ind); - - hypre_TFree(map, HYPRE_MEMORY_DEVICE); - - return hypre_error_flag; -} - 
-template HYPRE_Int hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, - HYPRE_Int *d_row_ptr, HYPRE_Int *d_row_num, HYPRE_Int *d_row_ind); -#if defined(HYPRE_MIXEDINT) -template HYPRE_Int hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, - HYPRE_Int *d_row_ptr, HYPRE_BigInt *d_row_num, HYPRE_BigInt *d_row_ind); -#endif - __global__ void hypreCUDAKernel_ScatterAddTrivial(HYPRE_Int n, HYPRE_Real *x, HYPRE_Int *map, HYPRE_Real *y) { diff --git a/src/utilities/device_utils.h b/src/utilities/device_utils.h index 54d9c8a620..54cc6e192e 100644 --- a/src/utilities/device_utils.h +++ b/src/utilities/device_utils.h @@ -355,6 +355,16 @@ HYPRE_Int* hypreDevice_CsrRowIndicesToPtrs(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE HYPRE_Int hypreDevice_CsrRowIndicesToPtrs_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind, HYPRE_Int *d_row_ptr); +HYPRE_Int hypreDevice_IntegerReduceSum(HYPRE_Int m, HYPRE_Int *d_i); + +HYPRE_Int hypreDevice_IntegerInclusiveScan(HYPRE_Int n, HYPRE_Int *d_i); + +HYPRE_Int hypreDevice_IntegerExclusiveScan(HYPRE_Int n, HYPRE_Int *d_i); + +template +HYPRE_Int hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, + HYPRE_Int *d_row_ptr, T *d_row_num, T *d_row_ind); + #endif //#if defined(HYPRE_USING_GPU) #if defined(HYPRE_USING_SYCL) @@ -960,10 +970,6 @@ hypreDevice_StableSortTupleByTupleKey(HYPRE_Int N, T1 *keys1, T2 *keys2, T3 *val template HYPRE_Int hypreDevice_ReduceByTupleKey(HYPRE_Int N, T1 *keys1_in, T2 *keys2_in, T3 *vals_in, T1 *keys1_out, T2 *keys2_out, T3 *vals_out); -template -HYPRE_Int hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, - HYPRE_Int *d_row_ptr, T *d_row_num, T *d_row_ind); - template HYPRE_Int hypreDevice_ScatterConstant(T *x, HYPRE_Int n, HYPRE_Int *map, T v); @@ -975,12 +981,6 @@ HYPRE_Int hypreDevice_CopyParCSRRows(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int *d_diag_j, HYPRE_Complex *d_diag_a, HYPRE_Int *d_offd_i, HYPRE_Int 
*d_offd_j, HYPRE_Complex *d_offd_a, HYPRE_Int *d_ib, HYPRE_BigInt *d_jb, HYPRE_Complex *d_ab); -HYPRE_Int hypreDevice_IntegerReduceSum(HYPRE_Int m, HYPRE_Int *d_i); - -HYPRE_Int hypreDevice_IntegerInclusiveScan(HYPRE_Int n, HYPRE_Int *d_i); - -HYPRE_Int hypreDevice_IntegerExclusiveScan(HYPRE_Int n, HYPRE_Int *d_i); - HYPRE_Int hypreDevice_GenScatterAdd(HYPRE_Real *x, HYPRE_Int ny, HYPRE_Int *map, HYPRE_Real *y, char *work); @@ -1032,6 +1032,28 @@ void hypre_DeviceDataCubCachingAllocatorDestroy(hypre_DeviceData *data); * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ +template +OutputIter hypreSycl_gather(InputIter1 map_first, InputIter1 map_last, + InputIter2 input_first, OutputIter result) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same< + typename std::iterator_traits::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same< + typename std::iterator_traits::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + auto perm_begin = + oneapi::dpl::make_permutation_iterator(input_first, map_first); + const int n = ::std::distance(map_first, map_last); + + return oneapi::dpl::copy(oneapi::dpl::execution::make_device_policy(*hypre_HandleComputeStream(hypre_handle())), + perm_begin, perm_begin + n, result); +} + #if defined(HYPRE_DEBUG) #if defined(HYPRE_USING_CUDA) #define GPU_LAUNCH_SYNC { hypre_SyncDeviceComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); } From ae30f749c62cfac4234876775a61bcc094cdb6fd Mon Sep 17 00:00:00 2001 From: Abhishek Bagusetty Date: Tue, 14 Dec 2021 16:22:25 +0000 Subject: [PATCH 39/44] [SYCL] fix the sycl scatter_if --- src/utilities/device_utils.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/utilities/device_utils.c b/src/utilities/device_utils.c index bf961145fd..c0dad8de28 100644 --- 
a/src/utilities/device_utils.c +++ b/src/utilities/device_utils.c @@ -224,14 +224,14 @@ hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_ HYPRE_ONEDPL_CALL( std::fill, d_row_ind, d_row_ind + nnz, 0 ); - /* // TODO: need to fix this by passing a "predicate" as last argument */ - /* HYPRE_ONEDPL_CALL( dpct::scatter_if, */ - /* oneapi::dpl::counting_iterator(0), */ - /* oneapi::dpl::counting_iterator(nrows), */ - /* d_row_ptr, */ - /* oneapi::dpl::make_transform_iterator( oneapi::dpl::make_zip_iterator(d_row_ptr, d_row_ptr + 1), */ - /* [](auto t) { return std::get<0>(t) != std::get<1>(t); } ), */ - /* d_row_ind ); */ + HYPRE_ONEDPL_CALL( dpct::scatter_if, + oneapi::dpl::counting_iterator(0), + oneapi::dpl::counting_iterator(nrows), + d_row_ptr, + oneapi::dpl::make_transform_iterator( oneapi::dpl::make_zip_iterator(d_row_ptr, d_row_ptr + 1), + [](auto t) { return std::get<0>(t) != std::get<1>(t); } ), + d_row_ind, + oneapi::dpl::identity() ); HYPRE_ONEDPL_CALL( oneapi::dpl::inclusive_scan, d_row_ind, d_row_ind + nnz, d_row_ind, sycl::maximum() ); From c73ef06e14389cfa841369edb93f727ac2209710 Mon Sep 17 00:00:00 2001 From: Abhishek Bagusetty Date: Tue, 14 Dec 2021 17:29:37 +0000 Subject: [PATCH 40/44] [SYCL] fix the build issues from std::exclusive_scan, lambda for scatter_if --- src/utilities/device_utils.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/utilities/device_utils.c b/src/utilities/device_utils.c index c0dad8de28..a75f9be6d2 100644 --- a/src/utilities/device_utils.c +++ b/src/utilities/device_utils.c @@ -133,7 +133,7 @@ HYPRE_Int hypreDevice_IntegerExclusiveScan(HYPRE_Int n, HYPRE_Int *d_i) { #if defined(HYPRE_USING_SYCL) - HYPRE_ONEDPL_CALL(oneapi::dpl::exclusive_scan, d_i, d_i + n, d_i); + HYPRE_ONEDPL_CALL(std::exclusive_scan, d_i, d_i + n, d_i, 0, std::plus<>()); #else HYPRE_THRUST_CALL(exclusive_scan, d_i, d_i + n, d_i); #endif @@ -212,6 +212,16 @@ dim3 
hypre_GetDefaultDeviceGridDimension(HYPRE_Int n, return gDim; } +struct hypre_empty_row_functor +{ + bool operator()(const std::tuple& t) const + { + const HYPRE_Int a = std::get<0>(t); + const HYPRE_Int b = std::get<1>(t); + return a != b; + } +}; + HYPRE_Int hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, HYPRE_Int *d_row_ind) @@ -229,7 +239,7 @@ hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_ oneapi::dpl::counting_iterator(nrows), d_row_ptr, oneapi::dpl::make_transform_iterator( oneapi::dpl::make_zip_iterator(d_row_ptr, d_row_ptr + 1), - [](auto t) { return std::get<0>(t) != std::get<1>(t); } ), + hypre_empty_row_functor() ), d_row_ind, oneapi::dpl::identity() ); From d3e3bf028664434c61f48c1285d12c9ede049db2 Mon Sep 17 00:00:00 2001 From: Abhishek Bagusetty Date: Thu, 16 Dec 2021 19:33:29 +0000 Subject: [PATCH 41/44] [SYCL] cleanup a for SYCL kernel query helper functions --- src/utilities/_hypre_utilities.hpp | 16 +++++++++++----- src/utilities/device_utils.h | 16 +++++++++++----- 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/src/utilities/_hypre_utilities.hpp b/src/utilities/_hypre_utilities.hpp index 24530a4d38..23f05e1ad2 100644 --- a/src/utilities/_hypre_utilities.hpp +++ b/src/utilities/_hypre_utilities.hpp @@ -112,6 +112,8 @@ struct hypre_device_allocator typedef sycl::range<1> dim3; #define __global__ +#define __host__ +#define __device__ /* WM: problems with this being inside extern C++ {} */ /* #include */ @@ -1164,7 +1166,7 @@ OutputIter hypreSycl_gather(InputIter1 map_first, InputIter1 map_last, /* return the number of (sub_groups) warps in (work-group) block */ template static __forceinline__ -hypre_int hypre_gpu_get_num_warps(sycl::nd_item<1>& item) +hypre_int hypre_gpu_get_num_warps(sycl::nd_item& item) { return item.get_sub_group().get_group_range().get(0); } @@ -1172,9 +1174,9 @@ hypre_int hypre_gpu_get_num_warps(sycl::nd_item<1>& item) /* return the 
thread lane id in warp */ template static __forceinline__ -hypre_int hypre_gpu_get_lane_id(sycl::nd_item<1>& item) +hypre_int hypre_gpu_get_lane_id(sycl::nd_item& item) { - return item.get_local_linear_id() & (HYPRE_WARP_SIZE-1); + return item.get_sub_group().get_local_linear_id(); } // /* return the number of threads in grid */ @@ -1185,11 +1187,15 @@ hypre_int hypre_gpu_get_lane_id(sycl::nd_item<1>& item) // return hypre_gpu_get_num_blocks() * hypre_gpu_get_num_threads(); // } -/* return the flattened work-item/thread id in global work space */ -template +/* return the flattened work-item/thread id in global work space, + * Note: Since the use-cases always involved bdim = gdim = 1, the + * sycl:;nd_item<1> is only being used. SFINAE is used to prevent + * other dimensions (i.e., bdim != gdim != 1) */ +template < hypre_int bdim, hypre_int gdim > static __forceinline__ hypre_int hypre_gpu_get_grid_thread_id(sycl::nd_item<1>& item) { + static_assert(bdim == 1 && gdim == 1); return item.get_global_id(0); } diff --git a/src/utilities/device_utils.h b/src/utilities/device_utils.h index 54cc6e192e..dc85ff3f44 100644 --- a/src/utilities/device_utils.h +++ b/src/utilities/device_utils.h @@ -55,6 +55,8 @@ typedef sycl::range<1> dim3; #define __global__ +#define __host__ +#define __device__ /* WM: problems with this being inside extern C++ {} */ /* #include */ @@ -1107,7 +1109,7 @@ OutputIter hypreSycl_gather(InputIter1 map_first, InputIter1 map_last, /* return the number of (sub_groups) warps in (work-group) block */ template static __forceinline__ -hypre_int hypre_gpu_get_num_warps(sycl::nd_item<1>& item) +hypre_int hypre_gpu_get_num_warps(sycl::nd_item& item) { return item.get_sub_group().get_group_range().get(0); } @@ -1115,9 +1117,9 @@ hypre_int hypre_gpu_get_num_warps(sycl::nd_item<1>& item) /* return the thread lane id in warp */ template static __forceinline__ -hypre_int hypre_gpu_get_lane_id(sycl::nd_item<1>& item) +hypre_int hypre_gpu_get_lane_id(sycl::nd_item& 
item) { - return item.get_local_linear_id() & (HYPRE_WARP_SIZE-1); + return item.get_sub_group().get_local_linear_id(); } // /* return the number of threads in grid */ @@ -1128,11 +1130,15 @@ hypre_int hypre_gpu_get_lane_id(sycl::nd_item<1>& item) // return hypre_gpu_get_num_blocks() * hypre_gpu_get_num_threads(); // } -/* return the flattened work-item/thread id in global work space */ -template +/* return the flattened work-item/thread id in global work space, + * Note: Since the use-cases always involved bdim = gdim = 1, the + * sycl:;nd_item<1> is only being used. SFINAE is used to prevent + * other dimensions (i.e., bdim != gdim != 1) */ +template < hypre_int bdim, hypre_int gdim > static __forceinline__ hypre_int hypre_gpu_get_grid_thread_id(sycl::nd_item<1>& item) { + static_assert(bdim == 1 && gdim == 1); return item.get_global_id(0); } From ad32b6f15c775e8e9c7966edaa4f80683e4c94ee Mon Sep 17 00:00:00 2001 From: Abhishek Bagusetty Date: Tue, 21 Dec 2021 15:53:27 +0000 Subject: [PATCH 42/44] [SYCL] simplify namespace for sycl::ext::oneapi::sub_group to sycl::sub_group --- src/seq_mv/csr_matop_device.c | 8 ++++---- src/utilities/_hypre_utilities.hpp | 6 +++--- src/utilities/device_utils.h | 6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/seq_mv/csr_matop_device.c b/src/seq_mv/csr_matop_device.c index cd6e819515..55fde144e3 100644 --- a/src/seq_mv/csr_matop_device.c +++ b/src/seq_mv/csr_matop_device.c @@ -579,7 +579,7 @@ hypreGPUKernel_CSRMatrixFixZeroDiagDevice( sycl::nd_item<1>& item, { p = read_only_load(ia + row + lane); } - sycl::ext::oneapi::sub_group SG = item.get_sub_group(); + sycl::sub_group SG = item.get_sub_group(); q = SG.shuffle(p, 1); p = SG.shuffle(p, 0); @@ -634,7 +634,7 @@ hypreGPUKernel_CSRMatrixReplaceDiagDevice( sycl::nd_item<1>& item, { p = read_only_load(ia + row + lane); } - sycl::ext::oneapi::sub_group SG = item.get_sub_group(); + sycl::sub_group SG = item.get_sub_group(); q = SG.shuffle(p, 1); p = 
SG.shuffle(p, 0); @@ -697,7 +697,7 @@ hypreGPUKernel_CSRRowSum( sycl::nd_item<1>& item, p = read_only_load(ia + row_i + lane); } - sycl::ext::oneapi::sub_group SG = item.get_sub_group(); + sycl::sub_group SG = item.get_sub_group(); q = SG.shuffle(p, 1); p = SG.shuffle(p, 0); @@ -769,7 +769,7 @@ hypreGPUKernel_CSRExtractDiag( sycl::nd_item<1>& item, { p = read_only_load(ia + row + lane); } - sycl::ext::oneapi::sub_group SG = item.get_sub_group(); + sycl::sub_group SG = item.get_sub_group(); q = SG.shuffle(p, 1); p = SG.shuffle(p, 0); diff --git a/src/utilities/_hypre_utilities.hpp b/src/utilities/_hypre_utilities.hpp index 23f05e1ad2..1a6f392e84 100644 --- a/src/utilities/_hypre_utilities.hpp +++ b/src/utilities/_hypre_utilities.hpp @@ -1285,11 +1285,11 @@ T read_only_load( const T *ptr ) // return in; // } -template +template static __forceinline__ -T warp_reduce_sum(T in, sycl::nd_item<1>& item) +T warp_reduce_sum(T in, sycl::nd_item& item) { - sycl::ext::oneapi::sub_group SG = item.get_sub_group(); + sycl::sub_group SG = item.get_sub_group(); //sycl::ext::oneapi::reduce(SG, in, std::plus()); #pragma unroll for (hypre_int d = SG.get_local_range().get(0)/2; d > 0; d >>= 1) diff --git a/src/utilities/device_utils.h b/src/utilities/device_utils.h index dc85ff3f44..45006f9097 100644 --- a/src/utilities/device_utils.h +++ b/src/utilities/device_utils.h @@ -1228,11 +1228,11 @@ T read_only_load( const T *ptr ) // return in; // } -template +template static __forceinline__ -T warp_reduce_sum(T in, sycl::nd_item<1>& item) +T warp_reduce_sum(T in, sycl::nd_item& item) { - sycl::ext::oneapi::sub_group SG = item.get_sub_group(); + sycl::sub_group SG = item.get_sub_group(); //sycl::ext::oneapi::reduce(SG, in, std::plus()); #pragma unroll for (hypre_int d = SG.get_local_range().get(0)/2; d > 0; d >>= 1) From 9c6b6bc9988286869023c5514483720a2ded0ddd Mon Sep 17 00:00:00 2001 From: Abhishek Bagusetty Date: Tue, 21 Dec 2021 20:27:43 +0000 Subject: [PATCH 43/44] [SYCL] unify code for 
CUDA, HIP and SYCL for easier maintanence --- src/seq_mv/csr_matop_device.c | 3113 +++++++++++++-------------------- 1 file changed, 1204 insertions(+), 1909 deletions(-) diff --git a/src/seq_mv/csr_matop_device.c b/src/seq_mv/csr_matop_device.c index 55fde144e3..bacc0b28fe 100644 --- a/src/seq_mv/csr_matop_device.c +++ b/src/seq_mv/csr_matop_device.c @@ -108,1531 +108,404 @@ hypre_GpuMatDataDestroy(hypre_GpuMatData *data) #endif /* #if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) */ -/* ABB: All the compute kernel implementations are grouped here */ #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) -__global__ void -hypreGPUKernel_CSRMoveDiagFirst( HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - HYPRE_Complex *aa ) +HYPRE_Int +hypre_CSRMatrixSplitDevice_core( HYPRE_Int job, /* 0: query B_ext_diag_nnz and B_ext_offd_nnz; 1: the real computation */ + HYPRE_Int num_rows, + HYPRE_Int B_ext_nnz, + HYPRE_Int *B_ext_ii, /* Note: this is NOT row pointers as in CSR but row indices as in COO */ + HYPRE_BigInt *B_ext_bigj, /* Note: [BigInt] global column indices */ + HYPRE_Complex *B_ext_data, + char *B_ext_xata, /* companion data with B_ext_data; NULL if none */ + HYPRE_BigInt first_col_diag_B, + HYPRE_BigInt last_col_diag_B, + HYPRE_Int num_cols_offd_B, + HYPRE_BigInt *col_map_offd_B, + HYPRE_Int **map_B_to_C_ptr, + HYPRE_Int *num_cols_offd_C_ptr, + HYPRE_BigInt **col_map_offd_C_ptr, + HYPRE_Int *B_ext_diag_nnz_ptr, + HYPRE_Int *B_ext_diag_ii, /* memory allocated outside */ + HYPRE_Int *B_ext_diag_j, + HYPRE_Complex *B_ext_diag_data, + char *B_ext_diag_xata, /* companion with B_ext_diag_data_ptr; NULL if none */ + HYPRE_Int *B_ext_offd_nnz_ptr, + HYPRE_Int *B_ext_offd_ii, /* memory allocated outside */ + HYPRE_Int *B_ext_offd_j, + HYPRE_Complex *B_ext_offd_data, + char *B_ext_offd_xata /* companion with B_ext_offd_data_ptr; NULL if none */ ) { - HYPRE_Int row = hypre_cuda_get_grid_warp_id<1,1>(); + HYPRE_Int B_ext_diag_nnz; + HYPRE_Int 
B_ext_offd_nnz; + HYPRE_BigInt *B_ext_diag_bigj = NULL; + HYPRE_BigInt *B_ext_offd_bigj = NULL; + HYPRE_BigInt *col_map_offd_C; + HYPRE_Int *map_B_to_C = NULL; + HYPRE_Int num_cols_offd_C; - if (row >= nrows) + in_range pred1(first_col_diag_B, last_col_diag_B); + + /* get diag and offd nnz */ + if (job == 0) { - return; - } + /* query the nnz's */ + B_ext_diag_nnz = HYPRE_THRUST_CALL( count_if, + B_ext_bigj, + B_ext_bigj + B_ext_nnz, + pred1 ); + B_ext_offd_nnz = B_ext_nnz - B_ext_diag_nnz; - HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); - HYPRE_Int p = 0, q = 0; + *B_ext_diag_nnz_ptr = B_ext_diag_nnz; + *B_ext_offd_nnz_ptr = B_ext_offd_nnz; - if (lane < 2) + return hypre_error_flag; + } + else { - p = read_only_load(ia + row + lane); + B_ext_diag_nnz = *B_ext_diag_nnz_ptr; + B_ext_offd_nnz = *B_ext_offd_nnz_ptr; } - q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); - p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); - for (HYPRE_Int j = p + lane + 1; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) - { - hypre_int find_diag = j < q && ja[j] == row; + /* copy to diag */ + B_ext_diag_bigj = hypre_TAlloc(HYPRE_BigInt, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); - if (find_diag) - { - ja[j] = ja[p]; - ja[p] = row; - HYPRE_Complex tmp = aa[p]; - aa[p] = aa[j]; - aa[j] = tmp; - } + if (B_ext_diag_xata) + { + auto new_end = HYPRE_THRUST_CALL( + copy_if, + thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data, B_ext_xata)), /* first */ + thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data, B_ext_xata)) + B_ext_nnz, /* last */ + B_ext_bigj, /* stencil */ + thrust::make_zip_iterator(thrust::make_tuple(B_ext_diag_ii, B_ext_diag_bigj, B_ext_diag_data, B_ext_diag_xata)), /* result */ + pred1 ); - if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) - { - break; - } + hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == B_ext_diag_ii + B_ext_diag_nnz ); } -} - -/* check if diagonal entry is the first one at each row - * 
Return: the number of rows that do not have the first entry as diagonal - * RL: only check if it's a non-empty row - */ -__global__ void -hypreGPUKernel_CSRCheckDiagFirst( HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - HYPRE_Int *result ) -{ - const HYPRE_Int row = hypre_cuda_get_grid_thread_id<1,1>(); - if (row < nrows) + else { - result[row] = (ia[row+1] > ia[row]) && (ja[ia[row]] != row); + auto new_end = HYPRE_THRUST_CALL( + copy_if, + thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data)), /* first */ + thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data)) + B_ext_nnz, /* last */ + B_ext_bigj, /* stencil */ + thrust::make_zip_iterator(thrust::make_tuple(B_ext_diag_ii, B_ext_diag_bigj, B_ext_diag_data)), /* result */ + pred1 ); + + hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == B_ext_diag_ii + B_ext_diag_nnz ); } -} -__global__ void -hypreGPUKernel_CSRMatrixFixZeroDiagDevice( HYPRE_Complex v, - HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - HYPRE_Complex *data, - HYPRE_Real tol, - HYPRE_Int *result ) -{ - const HYPRE_Int row = hypre_cuda_get_grid_warp_id<1,1>(); + HYPRE_THRUST_CALL( transform, + B_ext_diag_bigj, + B_ext_diag_bigj + B_ext_diag_nnz, + thrust::make_constant_iterator(first_col_diag_B), + B_ext_diag_j, + thrust::minus()); - if (row >= nrows) - { - return; - } + hypre_TFree(B_ext_diag_bigj, HYPRE_MEMORY_DEVICE); - HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); - HYPRE_Int p = 0, q = 0; - bool has_diag = false; + /* copy to offd */ + B_ext_offd_bigj = hypre_TAlloc(HYPRE_BigInt, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); - if (lane < 2) + if (B_ext_offd_xata) { - p = read_only_load(ia + row + lane); - } - q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); - p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); + auto new_end = HYPRE_THRUST_CALL( + copy_if, + thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data, B_ext_xata)), /* first */ + 
thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data, B_ext_xata)) + B_ext_nnz, /* last */ + B_ext_bigj, /* stencil */ + thrust::make_zip_iterator(thrust::make_tuple(B_ext_offd_ii, B_ext_offd_bigj, B_ext_offd_data, B_ext_offd_xata)), /* result */ + thrust::not1(pred1) ); - for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) + hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == B_ext_offd_ii + B_ext_offd_nnz ); + } + else { - hypre_int find_diag = j < q && read_only_load(&ja[j]) == row; - - if (find_diag) - { - if (fabs(data[j]) <= tol) - { - data[j] = v; - } - } + auto new_end = HYPRE_THRUST_CALL( + copy_if, + thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data)), /* first */ + thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data)) + B_ext_nnz, /* last */ + B_ext_bigj, /* stencil */ + thrust::make_zip_iterator(thrust::make_tuple(B_ext_offd_ii, B_ext_offd_bigj, B_ext_offd_data)), /* result */ + thrust::not1(pred1) ); - if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) - { - has_diag = true; - break; - } + hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == B_ext_offd_ii + B_ext_offd_nnz ); } - if (result && !has_diag && lane == 0) - { - result[row] = 1; - } -} + /* offd map of B_ext_offd Union col_map_offd_B */ + col_map_offd_C = hypre_TAlloc(HYPRE_BigInt, B_ext_offd_nnz + num_cols_offd_B, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(col_map_offd_C, B_ext_offd_bigj, HYPRE_BigInt, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(col_map_offd_C + B_ext_offd_nnz, col_map_offd_B, HYPRE_BigInt, num_cols_offd_B, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); -__global__ void -hypreGPUKernel_CSRMatrixReplaceDiagDevice( HYPRE_Complex *new_diag, - HYPRE_Complex v, - HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - HYPRE_Complex *data, - HYPRE_Real tol, - HYPRE_Int *result ) -{ - const HYPRE_Int row = 
hypre_cuda_get_grid_warp_id<1,1>(); + HYPRE_THRUST_CALL( sort, + col_map_offd_C, + col_map_offd_C + B_ext_offd_nnz + num_cols_offd_B ); - if (row >= nrows) - { - return; - } + HYPRE_BigInt *new_end = HYPRE_THRUST_CALL( unique, + col_map_offd_C, + col_map_offd_C + B_ext_offd_nnz + num_cols_offd_B ); - HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); - HYPRE_Int p = 0, q = 0; - bool has_diag = false; + num_cols_offd_C = new_end - col_map_offd_C; - if (lane < 2) - { - p = read_only_load(ia + row + lane); - } - q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); - p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); +#if 1 + HYPRE_BigInt *tmp = hypre_TAlloc(HYPRE_BigInt, num_cols_offd_C, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(tmp, col_map_offd_C, HYPRE_BigInt, num_cols_offd_C, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + hypre_TFree(col_map_offd_C, HYPRE_MEMORY_DEVICE); + col_map_offd_C = tmp; +#else + col_map_offd_C = hypre_TReAlloc_v2(col_map_offd_C, HYPRE_BigInt, B_ext_offd_nnz + num_cols_offd_B, HYPRE_Int, num_cols_offd_C, HYPRE_MEMORY_DEVICE); +#endif - for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) + /* create map from col_map_offd_B */ + if (num_cols_offd_B) { - hypre_int find_diag = j < q && read_only_load(&ja[j]) == row; + map_B_to_C = hypre_TAlloc(HYPRE_Int, num_cols_offd_B, HYPRE_MEMORY_DEVICE); + HYPRE_THRUST_CALL( lower_bound, + col_map_offd_C, + col_map_offd_C + num_cols_offd_C, + col_map_offd_B, + col_map_offd_B + num_cols_offd_B, + map_B_to_C ); + } - if (find_diag) - { - HYPRE_Complex d = read_only_load(&new_diag[row]); - if (fabs(d) <= tol) - { - d = v; - } - data[j] = d; - } + HYPRE_THRUST_CALL( lower_bound, + col_map_offd_C, + col_map_offd_C + num_cols_offd_C, + B_ext_offd_bigj, + B_ext_offd_bigj + B_ext_offd_nnz, + B_ext_offd_j ); - if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) - { - has_diag = true; - break; - } - } + hypre_TFree(B_ext_offd_bigj, HYPRE_MEMORY_DEVICE); - if (result && !has_diag && lane == 0) + if 
(map_B_to_C_ptr) { - result[row] = 1; + *map_B_to_C_ptr = map_B_to_C; } + *num_cols_offd_C_ptr = num_cols_offd_C; + *col_map_offd_C_ptr = col_map_offd_C; + + return hypre_error_flag; } -/* type == 0, sum, - * 1, abs sum (l-1) - * 2, square sum (l-2) - */ -template -__global__ void -hypreGPUKernel_CSRRowSum( HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - HYPRE_Complex *aa, - HYPRE_Int *CF_i, - HYPRE_Int *CF_j, - HYPRE_Complex *row_sum, - HYPRE_Complex scal, - HYPRE_Int set) +typedef thrust::tuple Int2; +struct Int2Unequal : public thrust::unary_function { - HYPRE_Int row_i = hypre_cuda_get_grid_warp_id<1,1>(); - - if (row_i >= nrows) + __host__ __device__ + bool operator()(const Int2& t) const { - return; + return (thrust::get<0>(t) != thrust::get<1>(t)); } +}; - HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); - HYPRE_Int p = 0, q = 0; - - if (lane < 2) +/* this predicate compares first and second element in a tuple in absolute value */ +/* first is assumed to be complex, second to be real > 0 */ +struct cabsfirst_greaterthan_second_pred : public thrust::unary_function,bool> +{ + __host__ __device__ + bool operator()(const thrust::tuple& t) const { - p = read_only_load(ia + row_i + lane); + const HYPRE_Complex i = thrust::get<0>(t); + const HYPRE_Real j = thrust::get<1>(t); + + return hypre_cabs(i) > j; } +}; - q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); - p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); +#endif /* HYPRE_USING_CUDA || defined(HYPRE_USING_HIP) */ - HYPRE_Complex row_sum_i = 0.0; +#if defined(HYPRE_USING_SYCL) - for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) { - if ( j >= q || (CF_i && CF_j && read_only_load(&CF_i[row_i]) != read_only_load(&CF_j[ja[j]])) ) - { - continue; - } +HYPRE_Int +hypre_CSRMatrixSplitDevice_core( HYPRE_Int job, /* 0: query B_ext_diag_nnz and B_ext_offd_nnz; 1: the real computation */ + HYPRE_Int num_rows, + HYPRE_Int B_ext_nnz, + HYPRE_Int *B_ext_ii, /* Note: this is NOT row 
pointers as in CSR but row indices as in COO */ + HYPRE_BigInt *B_ext_bigj, /* Note: [BigInt] global column indices */ + HYPRE_Complex *B_ext_data, + char *B_ext_xata, /* companion data with B_ext_data; NULL if none */ + HYPRE_BigInt first_col_diag_B, + HYPRE_BigInt last_col_diag_B, + HYPRE_Int num_cols_offd_B, + HYPRE_BigInt *col_map_offd_B, + HYPRE_Int **map_B_to_C_ptr, + HYPRE_Int *num_cols_offd_C_ptr, + HYPRE_BigInt **col_map_offd_C_ptr, + HYPRE_Int *B_ext_diag_nnz_ptr, + HYPRE_Int *B_ext_diag_ii, /* memory allocated outside */ + HYPRE_Int *B_ext_diag_j, + HYPRE_Complex *B_ext_diag_data, + char *B_ext_diag_xata, /* companion with B_ext_diag_data_ptr; NULL if none */ + HYPRE_Int *B_ext_offd_nnz_ptr, + HYPRE_Int *B_ext_offd_ii, /* memory allocated outside */ + HYPRE_Int *B_ext_offd_j, + HYPRE_Complex *B_ext_offd_data, + char *B_ext_offd_xata /* companion with B_ext_offd_data_ptr; NULL if none */ ) +{ + HYPRE_Int B_ext_diag_nnz; + HYPRE_Int B_ext_offd_nnz; + HYPRE_BigInt *B_ext_diag_bigj = NULL; + HYPRE_BigInt *B_ext_offd_bigj = NULL; + HYPRE_BigInt *col_map_offd_C; + HYPRE_Int *map_B_to_C = NULL; + HYPRE_Int num_cols_offd_C; - HYPRE_Complex aii = aa[j]; + in_range pred1(first_col_diag_B, last_col_diag_B); - if (type == 0) - { - row_sum_i += aii; - } - else if (type == 1) - { - row_sum_i += fabs(aii); - } - else if (type == 2) - { - row_sum_i += aii * aii; - } - } + /* get diag and offd nnz */ + if (job == 0) { + /* query the nnz's */ + B_ext_diag_nnz = HYPRE_ONEDPL_CALL( std::count_if, + B_ext_bigj, + B_ext_bigj + B_ext_nnz, + pred1 ); + B_ext_offd_nnz = B_ext_nnz - B_ext_diag_nnz; - row_sum_i = warp_reduce_sum(row_sum_i); + *B_ext_diag_nnz_ptr = B_ext_diag_nnz; + *B_ext_offd_nnz_ptr = B_ext_offd_nnz; - if (lane == 0) - { - if (set) - { - row_sum[row_i] = scal * row_sum_i; - } - else - { - row_sum[row_i] += scal * row_sum_i; - } + return hypre_error_flag; + } + else { + B_ext_diag_nnz = *B_ext_diag_nnz_ptr; + B_ext_offd_nnz = *B_ext_offd_nnz_ptr; } -} -/* type 0: 
diag - * 1: abs diag - * 2: diag inverse - * 3: diag inverse sqrt - * 4: abs diag inverse sqrt - */ -__global__ void -hypreGPUKernel_CSRExtractDiag( HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - HYPRE_Complex *aa, - HYPRE_Complex *d, - HYPRE_Int type) -{ - HYPRE_Int row = hypre_cuda_get_grid_warp_id<1,1>(); + /* copy to diag */ + B_ext_diag_bigj = hypre_TAlloc(HYPRE_BigInt, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); - if (row >= nrows) - { - return; - } + if (B_ext_diag_xata) { + auto first = oneapi::dpl::make_zip_iterator(B_ext_ii, B_ext_bigj, B_ext_data, B_ext_xata); + auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, + first, /* first */ + first + B_ext_nnz, /* last */ + B_ext_bigj, /* stencil */ + oneapi::dpl::make_zip_iterator(B_ext_diag_ii, B_ext_diag_bigj, B_ext_diag_data, B_ext_diag_xata),/* result */ + pred1 ); - HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); - HYPRE_Int p = 0, q = 0; + //hypre_assert( std::get<0>(new_end.get_iterator_tuple() == B_ext_diag_ii + B_ext_diag_nnz ); + } + else { + auto first = oneapi::dpl::make_zip_iterator(B_ext_ii, B_ext_bigj, B_ext_data); + auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, + first, /* first */ + first + B_ext_nnz, /* last */ + B_ext_bigj, /* stencil */ + oneapi::dpl::make_zip_iterator(B_ext_diag_ii, B_ext_diag_bigj, B_ext_diag_data), /* result */ + pred1 ); - if (lane < 2) - { - p = read_only_load(ia + row + lane); + //hypre_assert( std::get<0>(new_end.get_iterator_tuple()) == B_ext_diag_ii + B_ext_diag_nnz ); } - q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); - p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); - HYPRE_Int has_diag = 0; + HYPRE_BigInt *const_iterator = hypre_TAlloc(HYPRE_BigInt, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); + hypre_HandleComputeStream(hypre_handle())->fill(const_iterator, first_col_diag_B, B_ext_diag_nnz*sizeof(HYPRE_BigInt)).wait(); + HYPRE_ONEDPL_CALL( std::transform, + B_ext_diag_bigj, + B_ext_diag_bigj + B_ext_diag_nnz, + const_iterator, 
//dpct::make_constant_iterator(first_col_diag_B), + B_ext_diag_j, + std::minus() ); + hypre_TFree(const_iterator, HYPRE_MEMORY_DEVICE); - for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) - { - hypre_int find_diag = j < q && ja[j] == row; + hypre_TFree(B_ext_diag_bigj, HYPRE_MEMORY_DEVICE); - if (find_diag) - { - if (type == 0) - { - d[row] = aa[j]; - } - else if (type == 1) - { - d[row] = fabs(aa[j]); - } - else if (type == 2) - { - d[row] = 1.0 / aa[j]; - } - else if (type == 3) - { - d[row] = 1.0 / sqrt(aa[j]); - } - else if (type == 4) - { - d[row] = 1.0 / sqrt(fabs(aa[j])); - } - } + /* copy to offd */ + B_ext_offd_bigj = hypre_TAlloc(HYPRE_BigInt, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); - if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) - { - has_diag = 1; - break; - } + if (B_ext_offd_xata) { + auto first = oneapi::dpl::make_zip_iterator(B_ext_ii, B_ext_bigj, B_ext_data, B_ext_xata); + auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, + first, /* first */ + first + B_ext_nnz, /* last */ + B_ext_bigj, /* stencil */ + oneapi::dpl::make_zip_iterator(B_ext_offd_ii, B_ext_offd_bigj, B_ext_offd_data, B_ext_offd_xata), /* result */ + std::not_fn(pred1) ); + + // hypre_assert( std::get<0>(new_end.get_iterator_tuple()) == B_ext_offd_ii + B_ext_offd_nnz ); } + else { + auto first = oneapi::dpl::make_zip_iterator(B_ext_ii, B_ext_bigj, B_ext_data); + auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, + first, /* first */ + first + B_ext_nnz, /* last */ + B_ext_bigj, /* stencil */ + oneapi::dpl::make_zip_iterator(B_ext_offd_ii, B_ext_offd_bigj, B_ext_offd_data), /* result */ + std::not_fn(pred1) ); - if (!has_diag && lane == 0) - { - d[row] = 0.0; + // hypre_assert( std::get<0>(new_end.get_iterator_tuple()) == B_ext_offd_ii + B_ext_offd_nnz ); } -} -/* mark is of size nA - * diag_option: 1: special treatment for diag entries, mark as -2 - */ -__global__ void -hypreGPUKernel_CSRMatrixIntersectPattern(HYPRE_Int n, - HYPRE_Int nA, - 
HYPRE_Int *rowid, - HYPRE_Int *colid, - HYPRE_Int *idx, - HYPRE_Int *mark, - HYPRE_Int diag_option) -{ - HYPRE_Int i = hypre_cuda_get_grid_thread_id<1,1>(); + /* offd map of B_ext_offd Union col_map_offd_B */ + col_map_offd_C = hypre_TAlloc(HYPRE_BigInt, B_ext_offd_nnz + num_cols_offd_B, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(col_map_offd_C, B_ext_offd_bigj, HYPRE_BigInt, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(col_map_offd_C + B_ext_offd_nnz, col_map_offd_B, HYPRE_BigInt, num_cols_offd_B, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - if (i >= n) - { - return; - } + HYPRE_ONEDPL_CALL( std::sort, + col_map_offd_C, + col_map_offd_C + B_ext_offd_nnz + num_cols_offd_B ); - HYPRE_Int r1 = read_only_load(&rowid[i]); - HYPRE_Int c1 = read_only_load(&colid[i]); - HYPRE_Int j = read_only_load(&idx[i]); + HYPRE_BigInt *new_end = HYPRE_ONEDPL_CALL( std::unique, + col_map_offd_C, + col_map_offd_C + B_ext_offd_nnz + num_cols_offd_B ); - if (0 == diag_option) - { - if (j < nA) - { - HYPRE_Int r2 = i < n - 1 ? read_only_load(&rowid[i + 1]) : -1; - HYPRE_Int c2 = i < n - 1 ? read_only_load(&colid[i + 1]) : -1; - if (r1 == r2 && c1 == c2) - { - mark[j] = c1; - } - else - { - mark[j] = -1; - } - } - } - else if (1 == diag_option) - { - if (j < nA) - { - if (r1 == c1) - { - mark[j] = -2; - } - else - { - HYPRE_Int r2 = i < n - 1 ? read_only_load(&rowid[i + 1]) : -1; - HYPRE_Int c2 = i < n - 1 ? 
read_only_load(&colid[i + 1]) : -1; - if (r1 == r2 && c1 == c2) - { - mark[j] = c1; - } - else - { - mark[j] = -1; - } - } - } - } -} - -#elif defined(HYPRE_USING_SYCL) - -void -hypreGPUKernel_CSRMoveDiagFirst( sycl::nd_item<1>& item, - HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - HYPRE_Complex *aa ) -{ - HYPRE_Int row = hypre_gpu_get_grid_warp_id<1,1>(item); - - if (row >= nrows) - { - return; - } - - HYPRE_Int lane = hypre_gpu_get_lane_id<1>(item); - HYPRE_Int p = 0, q = 0; - - if (lane < 2) - { - p = read_only_load(ia + row + lane); - } - sycl::sub_group SG = item.get_sub_group(); - q = SG.shuffle(p, 1); - p = SG.shuffle(p, 0); - - for (HYPRE_Int j = p + lane + 1; sycl::any_of_group(SG, j < q); j += HYPRE_WARP_SIZE) - { - hypre_int find_diag = j < q && ja[j] == row; - - if (find_diag) - { - ja[j] = ja[p]; - ja[p] = row; - HYPRE_Complex tmp = aa[p]; - aa[p] = aa[j]; - aa[j] = tmp; - } - - if ( sycl::any_of_group(SG, find_diag) ) - { - break; - } - } -} - -/* check if diagonal entry is the first one at each row - * Return: the number of rows that do not have the first entry as diagonal - * RL: only check if it's a non-empty row - */ -void -hypreGPUKernel_CSRCheckDiagFirst( sycl::nd_item<1>& item, - HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - HYPRE_Int *result ) -{ - const HYPRE_Int row = hypre_gpu_get_grid_thread_id<1,1>(item); - if (row < nrows) - { - result[row] = (ia[row+1] > ia[row]) && (ja[ia[row]] != row); - } -} - -void -hypreGPUKernel_CSRMatrixFixZeroDiagDevice( sycl::nd_item<1>& item, - HYPRE_Complex v, - HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - HYPRE_Complex *data, - HYPRE_Real tol, - HYPRE_Int *result ) -{ - const HYPRE_Int row = hypre_gpu_get_grid_warp_id<1,1>(item); - - if (row >= nrows) - { - return; - } - - HYPRE_Int lane = hypre_gpu_get_lane_id<1>(item); - HYPRE_Int p = 0, q = 0; - bool has_diag = false; - - if (lane < 2) - { - p = read_only_load(ia + row + lane); - } - sycl::sub_group SG = item.get_sub_group(); - q = 
SG.shuffle(p, 1); - p = SG.shuffle(p, 0); - - for (HYPRE_Int j = p + lane; sycl::any_of_group(SG, j < q); j += HYPRE_WARP_SIZE) - { - hypre_int find_diag = j < q && read_only_load(&ja[j]) == row; - - if (find_diag) - { - if (fabs(data[j]) <= tol) - { - data[j] = v; - } - } - - if ( sycl::any_of_group(SG, find_diag) ) - { - has_diag = true; - break; - } - } - - if (result && !has_diag && lane == 0) - { - result[row] = 1; - } -} - -void -hypreGPUKernel_CSRMatrixReplaceDiagDevice( sycl::nd_item<1>& item, - HYPRE_Complex *new_diag, - HYPRE_Complex v, - HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - HYPRE_Complex *data, - HYPRE_Real tol, - HYPRE_Int *result ) -{ - const HYPRE_Int row = hypre_gpu_get_grid_warp_id<1,1>(item); - - if (row >= nrows) - { - return; - } - - HYPRE_Int lane = hypre_gpu_get_lane_id<1>(item); - HYPRE_Int p = 0, q = 0; - bool has_diag = false; - - if (lane < 2) - { - p = read_only_load(ia + row + lane); - } - sycl::sub_group SG = item.get_sub_group(); - q = SG.shuffle(p, 1); - p = SG.shuffle(p, 0); - - for (HYPRE_Int j = p + lane; sycl::any_of_group(SG, j < q); j += HYPRE_WARP_SIZE) - { - hypre_int find_diag = j < q && read_only_load(&ja[j]) == row; - - if (find_diag) - { - HYPRE_Complex d = read_only_load(&new_diag[row]); - if (fabs(d) <= tol) - { - d = v; - } - data[j] = d; - } - - if ( sycl::any_of_group(SG, find_diag) ) - { - has_diag = true; - break; - } - } - - if (result && !has_diag && lane == 0) - { - result[row] = 1; - } -} - -/* type == 0, sum, - * 1, abs sum (l-1) - * 2, square sum (l-2) - */ -template -void -hypreGPUKernel_CSRRowSum( sycl::nd_item<1>& item, - HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - HYPRE_Complex *aa, - HYPRE_Int *CF_i, - HYPRE_Int *CF_j, - HYPRE_Complex *row_sum, - HYPRE_Complex scal, - HYPRE_Int set) -{ - HYPRE_Int row_i = hypre_gpu_get_grid_warp_id<1,1>(item); - - if (row_i >= nrows) - { - return; - } - - HYPRE_Int lane = hypre_gpu_get_lane_id<1>(item); - HYPRE_Int p = 0, q = 0; - - if (lane < 2) - 
{ - p = read_only_load(ia + row_i + lane); - } - - sycl::sub_group SG = item.get_sub_group(); - q = SG.shuffle(p, 1); - p = SG.shuffle(p, 0); - - HYPRE_Complex row_sum_i = 0.0; - - for (HYPRE_Int j = p + lane; sycl::any_of_group(SG, j < q); j += HYPRE_WARP_SIZE) { - if ( j >= q || (CF_i && CF_j && read_only_load(&CF_i[row_i]) != read_only_load(&CF_j[ja[j]])) ) - { - continue; - } - - HYPRE_Complex aii = aa[j]; - - if (type == 0) - { - row_sum_i += aii; - } - else if (type == 1) - { - row_sum_i += fabs(aii); - } - else if (type == 2) - { - row_sum_i += aii * aii; - } - } - - row_sum_i = warp_reduce_sum(row_sum_i, item); - - if (lane == 0) - { - if (set) - { - row_sum[row_i] = scal * row_sum_i; - } - else - { - row_sum[row_i] += scal * row_sum_i; - } - } -} - -/* type 0: diag - * 1: abs diag - * 2: diag inverse - * 3: diag inverse sqrt - * 4: abs diag inverse sqrt - */ -void -hypreGPUKernel_CSRExtractDiag( sycl::nd_item<1>& item, - HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - HYPRE_Complex *aa, - HYPRE_Complex *d, - HYPRE_Int type) -{ - HYPRE_Int row = hypre_gpu_get_grid_warp_id<1,1>(item); - - if (row >= nrows) - { - return; - } - - HYPRE_Int lane = hypre_gpu_get_lane_id<1>(item); - HYPRE_Int p = 0, q = 0; - - if (lane < 2) - { - p = read_only_load(ia + row + lane); - } - sycl::sub_group SG = item.get_sub_group(); - q = SG.shuffle(p, 1); - p = SG.shuffle(p, 0); - - HYPRE_Int has_diag = 0; - - for (HYPRE_Int j = p + lane; sycl::any_of_group(SG, j < q); j += HYPRE_WARP_SIZE) - { - hypre_int find_diag = j < q && ja[j] == row; - - if (find_diag) - { - if (type == 0) - { - d[row] = aa[j]; - } - else if (type == 1) - { - d[row] = fabs(aa[j]); - } - else if (type == 2) - { - d[row] = 1.0 / aa[j]; - } - else if (type == 3) - { - d[row] = 1.0 / sqrt(aa[j]); - } - else if (type == 4) - { - d[row] = 1.0 / sqrt(fabs(aa[j])); - } - } - - if ( sycl::any_of_group(SG, find_diag) ) - { - has_diag = 1; - break; - } - } - - if (!has_diag && lane == 0) - { - d[row] = 0.0; - } 
-} - -/* mark is of size nA - * diag_option: 1: special treatment for diag entries, mark as -2 - */ -void -hypreGPUKernel_CSRMatrixIntersectPattern( sycl::nd_item<1>& item, - HYPRE_Int n, - HYPRE_Int nA, - HYPRE_Int *rowid, - HYPRE_Int *colid, - HYPRE_Int *idx, - HYPRE_Int *mark, - HYPRE_Int diag_option) -{ - HYPRE_Int i = hypre_gpu_get_grid_thread_id<1,1>(item); - - if (i >= n) - { - return; - } - - HYPRE_Int r1 = read_only_load(&rowid[i]); - HYPRE_Int c1 = read_only_load(&colid[i]); - HYPRE_Int j = read_only_load(&idx[i]); - - if (0 == diag_option) - { - if (j < nA) - { - HYPRE_Int r2 = i < n - 1 ? read_only_load(&rowid[i + 1]) : -1; - HYPRE_Int c2 = i < n - 1 ? read_only_load(&colid[i + 1]) : -1; - if (r1 == r2 && c1 == c2) - { - mark[j] = c1; - } - else - { - mark[j] = -1; - } - } - } - else if (1 == diag_option) - { - if (j < nA) - { - if (r1 == c1) - { - mark[j] = -2; - } - else - { - HYPRE_Int r2 = i < n - 1 ? read_only_load(&rowid[i + 1]) : -1; - HYPRE_Int c2 = i < n - 1 ? read_only_load(&colid[i + 1]) : -1; - if (r1 == r2 && c1 == c2) - { - mark[j] = c1; - } - else - { - mark[j] = -1; - } - } - } - } -} - -#endif // HYPRE_USING_SYCL - - -#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) - -hypre_CSRMatrix* -hypre_CSRMatrixAddDevice ( HYPRE_Complex alpha, - hypre_CSRMatrix *A, - HYPRE_Complex beta, - hypre_CSRMatrix *B ) -{ - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - HYPRE_Int nrows_A = hypre_CSRMatrixNumRows(A); - HYPRE_Int ncols_A = hypre_CSRMatrixNumCols(A); - HYPRE_Int nnz_A = hypre_CSRMatrixNumNonzeros(A); - HYPRE_Complex *B_data = hypre_CSRMatrixData(B); - HYPRE_Int *B_i = hypre_CSRMatrixI(B); - HYPRE_Int *B_j = hypre_CSRMatrixJ(B); - HYPRE_Int nrows_B = hypre_CSRMatrixNumRows(B); - HYPRE_Int ncols_B = hypre_CSRMatrixNumCols(B); - HYPRE_Int nnz_B = hypre_CSRMatrixNumNonzeros(B); - HYPRE_Complex *C_data; - HYPRE_Int *C_i; - HYPRE_Int *C_j; - HYPRE_Int nnzC; 
- hypre_CSRMatrix *C; - - if (nrows_A != nrows_B || ncols_A != ncols_B) - { - hypre_error_w_msg(HYPRE_ERROR_GENERIC,"Warning! Incompatible matrix dimensions!\n"); - - return NULL; - } - - hypreDevice_CSRSpAdd(nrows_A, nrows_B, ncols_A, nnz_A, nnz_B, - A_i, A_j, alpha, A_data, NULL, B_i, B_j, beta, B_data, NULL, NULL, - &nnzC, &C_i, &C_j, &C_data); - - C = hypre_CSRMatrixCreate(nrows_A, ncols_B, nnzC); - hypre_CSRMatrixI(C) = C_i; - hypre_CSRMatrixJ(C) = C_j; - hypre_CSRMatrixData(C) = C_data; - hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE; - - hypre_SyncDeviceComputeStream(hypre_handle()); - - return C; -} - -hypre_CSRMatrix* -hypre_CSRMatrixMultiplyDevice( hypre_CSRMatrix *A, - hypre_CSRMatrix *B) -{ - HYPRE_Int ncols_A = hypre_CSRMatrixNumCols(A); - HYPRE_Int nrows_B = hypre_CSRMatrixNumRows(B); - hypre_CSRMatrix *C; - - if (ncols_A != nrows_B) - { - hypre_printf("Warning! incompatible matrix dimensions!\n"); - hypre_error_w_msg(HYPRE_ERROR_GENERIC,"Warning! incompatible matrix dimensions!\n"); - - return NULL; - } - - hypreDevice_CSRSpGemm(A, B, &C); - - hypre_SyncDeviceComputeStream(hypre_handle()); - - return C; -} - -hypre_CSRMatrix* -hypre_CSRMatrixTripleMultiplyDevice ( hypre_CSRMatrix *A, - hypre_CSRMatrix *B, - hypre_CSRMatrix *C ) -{ - hypre_CSRMatrix *BC = hypre_CSRMatrixMultiplyDevice(B, C); - hypre_CSRMatrix *ABC = hypre_CSRMatrixMultiplyDevice(A, BC); - - hypre_CSRMatrixDestroy(BC); - - return ABC; -} - -HYPRE_Int -hypre_CSRMatrixTriLowerUpperSolveDevice(char uplo, - hypre_CSRMatrix *A, - HYPRE_Real *l1_norms, - hypre_Vector *f, - hypre_Vector *u ) -{ -#if defined(HYPRE_USING_CUSPARSE) - hypre_CSRMatrixTriLowerUpperSolveCusparse(uplo, A, l1_norms, f, u); -#elif defined(HYPRE_USING_ROCSPARSE) - hypre_CSRMatrixTriLowerUpperSolveRocsparse(uplo, A, l1_norms, f, u); -#else - hypre_error_w_msg(HYPRE_ERROR_GENERIC, "hypre_CSRMatrixTriLowerUpperSolveDevice requires configuration with either cusparse or rocsparse\n"); -#endif - return 
hypre_error_flag; -} - -/* split CSR matrix B_ext (extended rows of parcsr B) into diag part and offd part - * corresponding to B. - * Input col_map_offd_B: - * Output col_map_offd_C: union of col_map_offd_B and offd-indices of Bext_offd - * map_B_to_C: mapping from col_map_offd_B to col_map_offd_C - */ - -HYPRE_Int -hypre_CSRMatrixSplitDevice( hypre_CSRMatrix *B_ext, - HYPRE_BigInt first_col_diag_B, - HYPRE_BigInt last_col_diag_B, - HYPRE_Int num_cols_offd_B, - HYPRE_BigInt *col_map_offd_B, - HYPRE_Int **map_B_to_C_ptr, - HYPRE_Int *num_cols_offd_C_ptr, - HYPRE_BigInt **col_map_offd_C_ptr, - hypre_CSRMatrix **B_ext_diag_ptr, - hypre_CSRMatrix **B_ext_offd_ptr ) -{ - HYPRE_Int num_rows = hypre_CSRMatrixNumRows(B_ext); - HYPRE_Int B_ext_nnz = hypre_CSRMatrixNumNonzeros(B_ext); - - HYPRE_Int *B_ext_ii = hypre_TAlloc(HYPRE_Int, B_ext_nnz, HYPRE_MEMORY_DEVICE); - hypreDevice_CsrRowPtrsToIndices_v2(num_rows, B_ext_nnz, hypre_CSRMatrixI(B_ext), B_ext_ii); - - HYPRE_Int B_ext_diag_nnz; - HYPRE_Int B_ext_offd_nnz; - HYPRE_Int ierr; - - ierr = hypre_CSRMatrixSplitDevice_core( 0, - num_rows, - B_ext_nnz, - NULL, - hypre_CSRMatrixBigJ(B_ext), - NULL, - NULL, - first_col_diag_B, - last_col_diag_B, - num_cols_offd_B, - NULL, - NULL, - NULL, - NULL, - &B_ext_diag_nnz, - NULL, - NULL, - NULL, - NULL, - &B_ext_offd_nnz, - NULL, - NULL, - NULL, - NULL ); - - HYPRE_Int *B_ext_diag_ii = hypre_TAlloc(HYPRE_Int, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); - HYPRE_Int *B_ext_diag_j = hypre_TAlloc(HYPRE_Int, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); - HYPRE_Complex *B_ext_diag_a = hypre_TAlloc(HYPRE_Complex, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); - - HYPRE_Int *B_ext_offd_ii = hypre_TAlloc(HYPRE_Int, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); - HYPRE_Int *B_ext_offd_j = hypre_TAlloc(HYPRE_Int, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); - HYPRE_Complex *B_ext_offd_a = hypre_TAlloc(HYPRE_Complex, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); - - ierr = hypre_CSRMatrixSplitDevice_core( 1, - num_rows, - B_ext_nnz, - 
B_ext_ii, - hypre_CSRMatrixBigJ(B_ext), - hypre_CSRMatrixData(B_ext), - NULL, - first_col_diag_B, - last_col_diag_B, - num_cols_offd_B, - col_map_offd_B, - map_B_to_C_ptr, - num_cols_offd_C_ptr, - col_map_offd_C_ptr, - &B_ext_diag_nnz, - B_ext_diag_ii, - B_ext_diag_j, - B_ext_diag_a, - NULL, - &B_ext_offd_nnz, - B_ext_offd_ii, - B_ext_offd_j, - B_ext_offd_a, - NULL ); - - hypre_TFree(B_ext_ii, HYPRE_MEMORY_DEVICE); - - /* convert to row ptrs */ - HYPRE_Int *B_ext_diag_i = hypreDevice_CsrRowIndicesToPtrs(num_rows, B_ext_diag_nnz, B_ext_diag_ii); - HYPRE_Int *B_ext_offd_i = hypreDevice_CsrRowIndicesToPtrs(num_rows, B_ext_offd_nnz, B_ext_offd_ii); - - hypre_TFree(B_ext_diag_ii, HYPRE_MEMORY_DEVICE); - hypre_TFree(B_ext_offd_ii, HYPRE_MEMORY_DEVICE); - - /* create diag and offd CSR */ - hypre_CSRMatrix *B_ext_diag = hypre_CSRMatrixCreate(num_rows, last_col_diag_B - first_col_diag_B + 1, B_ext_diag_nnz); - hypre_CSRMatrix *B_ext_offd = hypre_CSRMatrixCreate(num_rows, *num_cols_offd_C_ptr, B_ext_offd_nnz); - - hypre_CSRMatrixI(B_ext_diag) = B_ext_diag_i; - hypre_CSRMatrixJ(B_ext_diag) = B_ext_diag_j; - hypre_CSRMatrixData(B_ext_diag) = B_ext_diag_a; - hypre_CSRMatrixNumNonzeros(B_ext_diag) = B_ext_diag_nnz; - hypre_CSRMatrixMemoryLocation(B_ext_diag) = HYPRE_MEMORY_DEVICE; - - hypre_CSRMatrixI(B_ext_offd) = B_ext_offd_i; - hypre_CSRMatrixJ(B_ext_offd) = B_ext_offd_j; - hypre_CSRMatrixData(B_ext_offd) = B_ext_offd_a; - hypre_CSRMatrixNumNonzeros(B_ext_offd) = B_ext_offd_nnz; - hypre_CSRMatrixMemoryLocation(B_ext_offd) = HYPRE_MEMORY_DEVICE; - - *B_ext_diag_ptr = B_ext_diag; - *B_ext_offd_ptr = B_ext_offd; - - hypre_SyncDeviceComputeStream(hypre_handle()); - - return ierr; -} - -HYPRE_Int -hypre_CSRMatrixSplitDevice_core( HYPRE_Int job, /* 0: query B_ext_diag_nnz and B_ext_offd_nnz; 1: the real computation */ - HYPRE_Int num_rows, - HYPRE_Int B_ext_nnz, - HYPRE_Int *B_ext_ii, /* Note: this is NOT row pointers as in CSR but row indices as in COO */ - HYPRE_BigInt 
*B_ext_bigj, /* Note: [BigInt] global column indices */ - HYPRE_Complex *B_ext_data, - char *B_ext_xata, /* companion data with B_ext_data; NULL if none */ - HYPRE_BigInt first_col_diag_B, - HYPRE_BigInt last_col_diag_B, - HYPRE_Int num_cols_offd_B, - HYPRE_BigInt *col_map_offd_B, - HYPRE_Int **map_B_to_C_ptr, - HYPRE_Int *num_cols_offd_C_ptr, - HYPRE_BigInt **col_map_offd_C_ptr, - HYPRE_Int *B_ext_diag_nnz_ptr, - HYPRE_Int *B_ext_diag_ii, /* memory allocated outside */ - HYPRE_Int *B_ext_diag_j, - HYPRE_Complex *B_ext_diag_data, - char *B_ext_diag_xata, /* companion with B_ext_diag_data_ptr; NULL if none */ - HYPRE_Int *B_ext_offd_nnz_ptr, - HYPRE_Int *B_ext_offd_ii, /* memory allocated outside */ - HYPRE_Int *B_ext_offd_j, - HYPRE_Complex *B_ext_offd_data, - char *B_ext_offd_xata /* companion with B_ext_offd_data_ptr; NULL if none */ ) -{ - HYPRE_Int B_ext_diag_nnz; - HYPRE_Int B_ext_offd_nnz; - HYPRE_BigInt *B_ext_diag_bigj = NULL; - HYPRE_BigInt *B_ext_offd_bigj = NULL; - HYPRE_BigInt *col_map_offd_C; - HYPRE_Int *map_B_to_C = NULL; - HYPRE_Int num_cols_offd_C; - - in_range pred1(first_col_diag_B, last_col_diag_B); - - /* get diag and offd nnz */ - if (job == 0) - { - /* query the nnz's */ - B_ext_diag_nnz = HYPRE_THRUST_CALL( count_if, - B_ext_bigj, - B_ext_bigj + B_ext_nnz, - pred1 ); - B_ext_offd_nnz = B_ext_nnz - B_ext_diag_nnz; - - *B_ext_diag_nnz_ptr = B_ext_diag_nnz; - *B_ext_offd_nnz_ptr = B_ext_offd_nnz; - - return hypre_error_flag; - } - else - { - B_ext_diag_nnz = *B_ext_diag_nnz_ptr; - B_ext_offd_nnz = *B_ext_offd_nnz_ptr; - } - - /* copy to diag */ - B_ext_diag_bigj = hypre_TAlloc(HYPRE_BigInt, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); - - if (B_ext_diag_xata) - { - auto new_end = HYPRE_THRUST_CALL( - copy_if, - thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data, B_ext_xata)), /* first */ - thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data, B_ext_xata)) + B_ext_nnz, /* last */ - B_ext_bigj, /* 
stencil */ - thrust::make_zip_iterator(thrust::make_tuple(B_ext_diag_ii, B_ext_diag_bigj, B_ext_diag_data, B_ext_diag_xata)), /* result */ - pred1 ); - - hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == B_ext_diag_ii + B_ext_diag_nnz ); - } - else - { - auto new_end = HYPRE_THRUST_CALL( - copy_if, - thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data)), /* first */ - thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data)) + B_ext_nnz, /* last */ - B_ext_bigj, /* stencil */ - thrust::make_zip_iterator(thrust::make_tuple(B_ext_diag_ii, B_ext_diag_bigj, B_ext_diag_data)), /* result */ - pred1 ); - - hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == B_ext_diag_ii + B_ext_diag_nnz ); - } - - HYPRE_THRUST_CALL( transform, - B_ext_diag_bigj, - B_ext_diag_bigj + B_ext_diag_nnz, - thrust::make_constant_iterator(first_col_diag_B), - B_ext_diag_j, - thrust::minus()); - - hypre_TFree(B_ext_diag_bigj, HYPRE_MEMORY_DEVICE); - - /* copy to offd */ - B_ext_offd_bigj = hypre_TAlloc(HYPRE_BigInt, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); - - if (B_ext_offd_xata) - { - auto new_end = HYPRE_THRUST_CALL( - copy_if, - thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data, B_ext_xata)), /* first */ - thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data, B_ext_xata)) + B_ext_nnz, /* last */ - B_ext_bigj, /* stencil */ - thrust::make_zip_iterator(thrust::make_tuple(B_ext_offd_ii, B_ext_offd_bigj, B_ext_offd_data, B_ext_offd_xata)), /* result */ - thrust::not1(pred1) ); - - hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == B_ext_offd_ii + B_ext_offd_nnz ); - } - else - { - auto new_end = HYPRE_THRUST_CALL( - copy_if, - thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data)), /* first */ - thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data)) + B_ext_nnz, /* last */ - B_ext_bigj, /* stencil */ - 
thrust::make_zip_iterator(thrust::make_tuple(B_ext_offd_ii, B_ext_offd_bigj, B_ext_offd_data)), /* result */ - thrust::not1(pred1) ); - - hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == B_ext_offd_ii + B_ext_offd_nnz ); - } - - /* offd map of B_ext_offd Union col_map_offd_B */ - col_map_offd_C = hypre_TAlloc(HYPRE_BigInt, B_ext_offd_nnz + num_cols_offd_B, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(col_map_offd_C, B_ext_offd_bigj, HYPRE_BigInt, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(col_map_offd_C + B_ext_offd_nnz, col_map_offd_B, HYPRE_BigInt, num_cols_offd_B, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - - HYPRE_THRUST_CALL( sort, - col_map_offd_C, - col_map_offd_C + B_ext_offd_nnz + num_cols_offd_B ); - - HYPRE_BigInt *new_end = HYPRE_THRUST_CALL( unique, - col_map_offd_C, - col_map_offd_C + B_ext_offd_nnz + num_cols_offd_B ); - - num_cols_offd_C = new_end - col_map_offd_C; - -#if 1 - HYPRE_BigInt *tmp = hypre_TAlloc(HYPRE_BigInt, num_cols_offd_C, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(tmp, col_map_offd_C, HYPRE_BigInt, num_cols_offd_C, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - hypre_TFree(col_map_offd_C, HYPRE_MEMORY_DEVICE); - col_map_offd_C = tmp; -#else - col_map_offd_C = hypre_TReAlloc_v2(col_map_offd_C, HYPRE_BigInt, B_ext_offd_nnz + num_cols_offd_B, HYPRE_Int, num_cols_offd_C, HYPRE_MEMORY_DEVICE); -#endif - - /* create map from col_map_offd_B */ - if (num_cols_offd_B) - { - map_B_to_C = hypre_TAlloc(HYPRE_Int, num_cols_offd_B, HYPRE_MEMORY_DEVICE); - HYPRE_THRUST_CALL( lower_bound, - col_map_offd_C, - col_map_offd_C + num_cols_offd_C, - col_map_offd_B, - col_map_offd_B + num_cols_offd_B, - map_B_to_C ); - } - - HYPRE_THRUST_CALL( lower_bound, - col_map_offd_C, - col_map_offd_C + num_cols_offd_C, - B_ext_offd_bigj, - B_ext_offd_bigj + B_ext_offd_nnz, - B_ext_offd_j ); - - hypre_TFree(B_ext_offd_bigj, HYPRE_MEMORY_DEVICE); - - if (map_B_to_C_ptr) - { - *map_B_to_C_ptr = map_B_to_C; - } - *num_cols_offd_C_ptr = 
num_cols_offd_C; - *col_map_offd_C_ptr = col_map_offd_C; - - return hypre_error_flag; -} - -/*-------------------------------------------------------------------------- - * hypre_CSRMatrixAddPartial: - * adds matrix rows in the CSR matrix B to the CSR Matrix A, where row_nums[i] - * defines to which row of A the i-th row of B is added, and returns a CSR Matrix C; - * Repeated row indices are allowed in row_nums - * Note: The routine does not check for 0-elements which might be generated - * through cancellation of elements in A and B or already contained - * in A and B. To remove those, use hypre_CSRMatrixDeleteZeros - *--------------------------------------------------------------------------*/ - -hypre_CSRMatrix* -hypre_CSRMatrixAddPartialDevice( hypre_CSRMatrix *A, - hypre_CSRMatrix *B, - HYPRE_Int *row_nums) -{ - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - HYPRE_Int nrows_A = hypre_CSRMatrixNumRows(A); - HYPRE_Int ncols_A = hypre_CSRMatrixNumCols(A); - HYPRE_Int nnz_A = hypre_CSRMatrixNumNonzeros(A); - HYPRE_Complex *B_data = hypre_CSRMatrixData(B); - HYPRE_Int *B_i = hypre_CSRMatrixI(B); - HYPRE_Int *B_j = hypre_CSRMatrixJ(B); - HYPRE_Int nrows_B = hypre_CSRMatrixNumRows(B); - HYPRE_Int ncols_B = hypre_CSRMatrixNumCols(B); - HYPRE_Int nnz_B = hypre_CSRMatrixNumNonzeros(B); - HYPRE_Complex *C_data; - HYPRE_Int *C_i; - HYPRE_Int *C_j; - HYPRE_Int nnzC; - hypre_CSRMatrix *C; - - if (ncols_A != ncols_B) - { - hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Warning! 
Incompatible matrix dimensions!\n"); - - return NULL; - } - - hypreDevice_CSRSpAdd(nrows_A, nrows_B, ncols_A, nnz_A, nnz_B, A_i, A_j, 1.0, A_data, NULL, B_i, B_j, 1.0, B_data, NULL, row_nums, - &nnzC, &C_i, &C_j, &C_data); - - C = hypre_CSRMatrixCreate(nrows_A, ncols_B, nnzC); - hypre_CSRMatrixI(C) = C_i; - hypre_CSRMatrixJ(C) = C_j; - hypre_CSRMatrixData(C) = C_data; - hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE; - - hypre_SyncDeviceComputeStream(hypre_handle()); - - return C; -} - -HYPRE_Int -hypre_CSRMatrixColNNzRealDevice( hypre_CSRMatrix *A, - HYPRE_Real *colnnz) -{ - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - HYPRE_Int ncols_A = hypre_CSRMatrixNumCols(A); - HYPRE_Int nnz_A = hypre_CSRMatrixNumNonzeros(A); - HYPRE_Int *A_j_sorted; - HYPRE_Int num_reduced_col_indices; - HYPRE_Int *reduced_col_indices; - HYPRE_Int *reduced_col_nnz; - - A_j_sorted = hypre_TAlloc(HYPRE_Int, nnz_A, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(A_j_sorted, A_j, HYPRE_Int, nnz_A, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - HYPRE_THRUST_CALL(sort, A_j_sorted, A_j_sorted + nnz_A); - - reduced_col_indices = hypre_TAlloc(HYPRE_Int, ncols_A, HYPRE_MEMORY_DEVICE); - reduced_col_nnz = hypre_TAlloc(HYPRE_Int, ncols_A, HYPRE_MEMORY_DEVICE); - - thrust::pair new_end = - HYPRE_THRUST_CALL(reduce_by_key, A_j_sorted, A_j_sorted + nnz_A, - thrust::make_constant_iterator(1), - reduced_col_indices, - reduced_col_nnz); - - hypre_assert(new_end.first - reduced_col_indices == new_end.second - reduced_col_nnz); - - num_reduced_col_indices = new_end.first - reduced_col_indices; - - hypre_Memset(colnnz, 0, ncols_A * sizeof(HYPRE_Real), HYPRE_MEMORY_DEVICE); - HYPRE_THRUST_CALL(scatter, reduced_col_nnz, reduced_col_nnz + num_reduced_col_indices, - reduced_col_indices, colnnz); - - hypre_TFree(A_j_sorted, HYPRE_MEMORY_DEVICE); - hypre_TFree(reduced_col_indices, HYPRE_MEMORY_DEVICE); - hypre_TFree(reduced_col_nnz, HYPRE_MEMORY_DEVICE); - - hypre_SyncDeviceComputeStream(hypre_handle()); - - return 
hypre_error_flag; -} - -typedef thrust::tuple Int2; -struct Int2Unequal : public thrust::unary_function -{ - __host__ __device__ - bool operator()(const Int2& t) const - { - return (thrust::get<0>(t) != thrust::get<1>(t)); - } -}; - -HYPRE_Int -hypre_CSRMatrixRemoveDiagonalDevice(hypre_CSRMatrix *A) -{ - HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); - HYPRE_Int nnz = hypre_CSRMatrixNumNonzeros(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_ii = hypreDevice_CsrRowPtrsToIndices(nrows, nnz, A_i); - HYPRE_Int new_nnz; - HYPRE_Int *new_ii; - HYPRE_Int *new_j; - HYPRE_Complex *new_data; - - new_nnz = HYPRE_THRUST_CALL( count_if, - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)) + nnz, - Int2Unequal() ); - - if (new_nnz == nnz) - { - /* no diagonal entries found */ - hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); - return hypre_error_flag; - } - - new_ii = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); - new_j = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); - - if (A_data) - { - new_data = hypre_TAlloc(HYPRE_Complex, new_nnz, HYPRE_MEMORY_DEVICE); - - thrust::zip_iterator< thrust::tuple > new_end; - - new_end = HYPRE_THRUST_CALL( copy_if, - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)), - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)) + nnz, - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), - thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j, new_data)), - Int2Unequal() ); - - hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == new_ii + new_nnz ); - } - else - { - new_data = NULL; - - thrust::zip_iterator< thrust::tuple > new_end; - - new_end = HYPRE_THRUST_CALL( copy_if, - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)) + nnz, - 
thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), - thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j)), - Int2Unequal() ); - - hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == new_ii + new_nnz ); - } - - hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); - hypre_TFree(A_i, HYPRE_MEMORY_DEVICE); - hypre_TFree(A_j, HYPRE_MEMORY_DEVICE); - hypre_TFree(A_data, HYPRE_MEMORY_DEVICE); - - hypre_CSRMatrixNumNonzeros(A) = new_nnz; - hypre_CSRMatrixI(A) = hypreDevice_CsrRowIndicesToPtrs(nrows, new_nnz, new_ii); - hypre_CSRMatrixJ(A) = new_j; - hypre_CSRMatrixData(A) = new_data; - hypre_TFree(new_ii, HYPRE_MEMORY_DEVICE); - - return hypre_error_flag; -} - -/* return C = [A; B] */ -hypre_CSRMatrix* -hypre_CSRMatrixStack2Device(hypre_CSRMatrix *A, hypre_CSRMatrix *B) -{ - hypre_assert( hypre_CSRMatrixNumCols(A) == hypre_CSRMatrixNumCols(B) ); - - hypre_CSRMatrix *C = hypre_CSRMatrixCreate( hypre_CSRMatrixNumRows(A) + hypre_CSRMatrixNumRows(B), - hypre_CSRMatrixNumCols(A), - hypre_CSRMatrixNumNonzeros(A) + hypre_CSRMatrixNumNonzeros(B) ); - - HYPRE_Int *C_i = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(C) + 1, HYPRE_MEMORY_DEVICE); - HYPRE_Int *C_j = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumNonzeros(C), HYPRE_MEMORY_DEVICE); - HYPRE_Complex *C_a = hypre_TAlloc(HYPRE_Complex, hypre_CSRMatrixNumNonzeros(C), HYPRE_MEMORY_DEVICE); - - hypre_TMemcpy(C_i, hypre_CSRMatrixI(A), HYPRE_Int, hypre_CSRMatrixNumRows(A) + 1, - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(C_i + hypre_CSRMatrixNumRows(A) + 1, hypre_CSRMatrixI(B) + 1, HYPRE_Int, hypre_CSRMatrixNumRows(B), - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - HYPRE_THRUST_CALL( transform, - C_i + hypre_CSRMatrixNumRows(A) + 1, - C_i + hypre_CSRMatrixNumRows(C) + 1, - thrust::make_constant_iterator(hypre_CSRMatrixNumNonzeros(A)), - C_i + hypre_CSRMatrixNumRows(A) + 1, - thrust::plus() ); - - hypre_TMemcpy(C_j, hypre_CSRMatrixJ(A), HYPRE_Int, hypre_CSRMatrixNumNonzeros(A), - HYPRE_MEMORY_DEVICE, 
HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(C_j + hypre_CSRMatrixNumNonzeros(A), hypre_CSRMatrixJ(B), HYPRE_Int, hypre_CSRMatrixNumNonzeros(B), - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - - hypre_TMemcpy(C_a, hypre_CSRMatrixData(A), HYPRE_Complex, hypre_CSRMatrixNumNonzeros(A), - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(C_a + hypre_CSRMatrixNumNonzeros(A), hypre_CSRMatrixData(B), HYPRE_Complex, hypre_CSRMatrixNumNonzeros(B), - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - - hypre_CSRMatrixI(C) = C_i; - hypre_CSRMatrixJ(C) = C_j; - hypre_CSRMatrixData(C) = C_a; - hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE; - - return C; -} - -/* A = alp * I */ -hypre_CSRMatrix * -hypre_CSRMatrixIdentityDevice(HYPRE_Int n, HYPRE_Complex alp) -{ - hypre_CSRMatrix *A = hypre_CSRMatrixCreate(n, n, n); - - hypre_CSRMatrixInitialize_v2(A, 0, HYPRE_MEMORY_DEVICE); - - HYPRE_THRUST_CALL( sequence, - hypre_CSRMatrixI(A), - hypre_CSRMatrixI(A) + n + 1, - 0 ); - - HYPRE_THRUST_CALL( sequence, - hypre_CSRMatrixJ(A), - hypre_CSRMatrixJ(A) + n, - 0 ); - - HYPRE_THRUST_CALL( fill, - hypre_CSRMatrixData(A), - hypre_CSRMatrixData(A) + n, - alp ); - - return A; -} - -/* this predicate compares first and second element in a tuple in absolute value */ -/* first is assumed to be complex, second to be real > 0 */ -struct cabsfirst_greaterthan_second_pred : public thrust::unary_function,bool> -{ - __host__ __device__ - bool operator()(const thrust::tuple& t) const - { - const HYPRE_Complex i = thrust::get<0>(t); - const HYPRE_Real j = thrust::get<1>(t); - - return hypre_cabs(i) > j; - } -}; - -/* drop the entries that are smaller than: - * tol if elmt_tols == null, - * elmt_tols[j] otherwise where j = 0...NumNonzeros(A) */ -HYPRE_Int -hypre_CSRMatrixDropSmallEntriesDevice( hypre_CSRMatrix *A, - HYPRE_Real tol, - HYPRE_Real *elmt_tols) -{ - HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); - HYPRE_Int nnz = hypre_CSRMatrixNumNonzeros(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - 
HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_ii = NULL; - HYPRE_Int new_nnz = 0; - HYPRE_Int *new_ii; - HYPRE_Int *new_j; - HYPRE_Complex *new_data; - - if (elmt_tols == NULL) - { - new_nnz = HYPRE_THRUST_CALL( count_if, - A_data, - A_data + nnz, - thrust::not1(less_than(tol)) ); - } - else - { - new_nnz = HYPRE_THRUST_CALL( count_if, - thrust::make_zip_iterator(thrust::make_tuple(A_data, elmt_tols)), - thrust::make_zip_iterator(thrust::make_tuple(A_data, elmt_tols)) + nnz, - cabsfirst_greaterthan_second_pred() ); - } - - if (new_nnz == nnz) - { - hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); - return hypre_error_flag; - } - - if (!A_ii) - { - A_ii = hypreDevice_CsrRowPtrsToIndices(nrows, nnz, A_i); - } - new_ii = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); - new_j = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); - new_data = hypre_TAlloc(HYPRE_Complex, new_nnz, HYPRE_MEMORY_DEVICE); - - thrust::zip_iterator< thrust::tuple > new_end; + num_cols_offd_C = new_end - col_map_offd_C; - if (elmt_tols == NULL) - { - new_end = HYPRE_THRUST_CALL( copy_if, - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)), - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)) + nnz, - A_data, - thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j, new_data)), - thrust::not1(less_than(tol)) ); - } - else - { - new_end = HYPRE_THRUST_CALL( copy_if, - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)), - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)) + nnz, - thrust::make_zip_iterator(thrust::make_tuple(A_data, elmt_tols)), - thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j, new_data)), - cabsfirst_greaterthan_second_pred() ); +#if 1 + HYPRE_BigInt *tmp = hypre_TAlloc(HYPRE_BigInt, num_cols_offd_C, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(tmp, col_map_offd_C, HYPRE_BigInt, num_cols_offd_C, HYPRE_MEMORY_DEVICE, + HYPRE_MEMORY_DEVICE); + 
hypre_TFree(col_map_offd_C, HYPRE_MEMORY_DEVICE); + col_map_offd_C = tmp; +#else + col_map_offd_C = hypre_TReAlloc_v2(col_map_offd_C, HYPRE_BigInt, B_ext_offd_nnz + num_cols_offd_B, + HYPRE_Int, num_cols_offd_C, HYPRE_MEMORY_DEVICE); +#endif + + /* create map from col_map_offd_B */ + if (num_cols_offd_B) { + map_B_to_C = hypre_TAlloc(HYPRE_Int, num_cols_offd_B, HYPRE_MEMORY_DEVICE); + HYPRE_ONEDPL_CALL( oneapi::dpl::lower_bound, + col_map_offd_C, + col_map_offd_C + num_cols_offd_C, + col_map_offd_B, + col_map_offd_B + num_cols_offd_B, + map_B_to_C ); } - hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == new_ii + new_nnz ); + HYPRE_ONEDPL_CALL( oneapi::dpl::lower_bound, + col_map_offd_C, + col_map_offd_C + num_cols_offd_C, + B_ext_offd_bigj, + B_ext_offd_bigj + B_ext_offd_nnz, + B_ext_offd_j ); - hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); - hypre_TFree(A_i, HYPRE_MEMORY_DEVICE); - hypre_TFree(A_j, HYPRE_MEMORY_DEVICE); - hypre_TFree(A_data, HYPRE_MEMORY_DEVICE); + hypre_TFree(B_ext_offd_bigj, HYPRE_MEMORY_DEVICE); - hypre_CSRMatrixNumNonzeros(A) = new_nnz; - hypre_CSRMatrixI(A) = hypreDevice_CsrRowIndicesToPtrs(nrows, new_nnz, new_ii); - hypre_CSRMatrixJ(A) = new_j; - hypre_CSRMatrixData(A) = new_data; - hypre_TFree(new_ii, HYPRE_MEMORY_DEVICE); + if (map_B_to_C_ptr) { + *map_B_to_C_ptr = map_B_to_C; + } + *num_cols_offd_C_ptr = num_cols_offd_C; + *col_map_offd_C_ptr = col_map_offd_C; return hypre_error_flag; } -#endif /* HYPRE_USING_CUDA || defined(HYPRE_USING_HIP) */ +/* this predicate compares first and second element in a tuple in absolute value */ +/* first is assumed to be complex, second to be real > 0 */ +struct cabsfirst_greaterthan_second_pred +{ + bool operator()(const std::tuple& t) const + { + const HYPRE_Complex i = std::get<0>(t); + const HYPRE_Real j = std::get<1>(t); + + return hypre_cabs(i) > j; + } +}; + +#endif /* HYPRE_USING_SYCL */ -#if defined(HYPRE_USING_SYCL) + +#if defined(HYPRE_USING_GPU) hypre_CSRMatrix* hypre_CSRMatrixAddDevice 
( HYPRE_Complex alpha, @@ -1759,272 +632,98 @@ hypre_CSRMatrixSplitDevice( hypre_CSRMatrix *B_ext, hypreDevice_CsrRowPtrsToIndices_v2(num_rows, B_ext_nnz, hypre_CSRMatrixI(B_ext), B_ext_ii); HYPRE_Int B_ext_diag_nnz; - HYPRE_Int B_ext_offd_nnz; - HYPRE_Int ierr; - - ierr = hypre_CSRMatrixSplitDevice_core( 0, - num_rows, - B_ext_nnz, - NULL, - hypre_CSRMatrixBigJ(B_ext), - NULL, - NULL, - first_col_diag_B, - last_col_diag_B, - num_cols_offd_B, - NULL, - NULL, - NULL, - NULL, - &B_ext_diag_nnz, - NULL, - NULL, - NULL, - NULL, - &B_ext_offd_nnz, - NULL, - NULL, - NULL, - NULL ); - - HYPRE_Int *B_ext_diag_ii = hypre_TAlloc(HYPRE_Int, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); - HYPRE_Int *B_ext_diag_j = hypre_TAlloc(HYPRE_Int, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); - HYPRE_Complex *B_ext_diag_a = hypre_TAlloc(HYPRE_Complex, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); - - HYPRE_Int *B_ext_offd_ii = hypre_TAlloc(HYPRE_Int, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); - HYPRE_Int *B_ext_offd_j = hypre_TAlloc(HYPRE_Int, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); - HYPRE_Complex *B_ext_offd_a = hypre_TAlloc(HYPRE_Complex, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); - - ierr = hypre_CSRMatrixSplitDevice_core( 1, - num_rows, - B_ext_nnz, - B_ext_ii, - hypre_CSRMatrixBigJ(B_ext), - hypre_CSRMatrixData(B_ext), - NULL, - first_col_diag_B, - last_col_diag_B, - num_cols_offd_B, - col_map_offd_B, - map_B_to_C_ptr, - num_cols_offd_C_ptr, - col_map_offd_C_ptr, - &B_ext_diag_nnz, - B_ext_diag_ii, - B_ext_diag_j, - B_ext_diag_a, - NULL, - &B_ext_offd_nnz, - B_ext_offd_ii, - B_ext_offd_j, - B_ext_offd_a, - NULL ); - - hypre_TFree(B_ext_ii, HYPRE_MEMORY_DEVICE); - - /* convert to row ptrs */ - HYPRE_Int *B_ext_diag_i = hypreDevice_CsrRowIndicesToPtrs(num_rows, B_ext_diag_nnz, B_ext_diag_ii); - HYPRE_Int *B_ext_offd_i = hypreDevice_CsrRowIndicesToPtrs(num_rows, B_ext_offd_nnz, B_ext_offd_ii); - - hypre_TFree(B_ext_diag_ii, HYPRE_MEMORY_DEVICE); - hypre_TFree(B_ext_offd_ii, HYPRE_MEMORY_DEVICE); - - /* create diag and 
offd CSR */ - hypre_CSRMatrix *B_ext_diag = hypre_CSRMatrixCreate(num_rows, last_col_diag_B - first_col_diag_B + 1, B_ext_diag_nnz); - hypre_CSRMatrix *B_ext_offd = hypre_CSRMatrixCreate(num_rows, *num_cols_offd_C_ptr, B_ext_offd_nnz); - - hypre_CSRMatrixI(B_ext_diag) = B_ext_diag_i; - hypre_CSRMatrixJ(B_ext_diag) = B_ext_diag_j; - hypre_CSRMatrixData(B_ext_diag) = B_ext_diag_a; - hypre_CSRMatrixNumNonzeros(B_ext_diag) = B_ext_diag_nnz; - hypre_CSRMatrixMemoryLocation(B_ext_diag) = HYPRE_MEMORY_DEVICE; - - hypre_CSRMatrixI(B_ext_offd) = B_ext_offd_i; - hypre_CSRMatrixJ(B_ext_offd) = B_ext_offd_j; - hypre_CSRMatrixData(B_ext_offd) = B_ext_offd_a; - hypre_CSRMatrixNumNonzeros(B_ext_offd) = B_ext_offd_nnz; - hypre_CSRMatrixMemoryLocation(B_ext_offd) = HYPRE_MEMORY_DEVICE; - - *B_ext_diag_ptr = B_ext_diag; - *B_ext_offd_ptr = B_ext_offd; - - hypre_SyncDeviceComputeStream(hypre_handle()); - - return ierr; -} - -HYPRE_Int -hypre_CSRMatrixSplitDevice_core( HYPRE_Int job, /* 0: query B_ext_diag_nnz and B_ext_offd_nnz; 1: the real computation */ - HYPRE_Int num_rows, - HYPRE_Int B_ext_nnz, - HYPRE_Int *B_ext_ii, /* Note: this is NOT row pointers as in CSR but row indices as in COO */ - HYPRE_BigInt *B_ext_bigj, /* Note: [BigInt] global column indices */ - HYPRE_Complex *B_ext_data, - char *B_ext_xata, /* companion data with B_ext_data; NULL if none */ - HYPRE_BigInt first_col_diag_B, - HYPRE_BigInt last_col_diag_B, - HYPRE_Int num_cols_offd_B, - HYPRE_BigInt *col_map_offd_B, - HYPRE_Int **map_B_to_C_ptr, - HYPRE_Int *num_cols_offd_C_ptr, - HYPRE_BigInt **col_map_offd_C_ptr, - HYPRE_Int *B_ext_diag_nnz_ptr, - HYPRE_Int *B_ext_diag_ii, /* memory allocated outside */ - HYPRE_Int *B_ext_diag_j, - HYPRE_Complex *B_ext_diag_data, - char *B_ext_diag_xata, /* companion with B_ext_diag_data_ptr; NULL if none */ - HYPRE_Int *B_ext_offd_nnz_ptr, - HYPRE_Int *B_ext_offd_ii, /* memory allocated outside */ - HYPRE_Int *B_ext_offd_j, - HYPRE_Complex *B_ext_offd_data, - char 
*B_ext_offd_xata /* companion with B_ext_offd_data_ptr; NULL if none */ ) -{ - HYPRE_Int B_ext_diag_nnz; - HYPRE_Int B_ext_offd_nnz; - HYPRE_BigInt *B_ext_diag_bigj = NULL; - HYPRE_BigInt *B_ext_offd_bigj = NULL; - HYPRE_BigInt *col_map_offd_C; - HYPRE_Int *map_B_to_C = NULL; - HYPRE_Int num_cols_offd_C; - - in_range pred1(first_col_diag_B, last_col_diag_B); - - /* get diag and offd nnz */ - if (job == 0) { - /* query the nnz's */ - B_ext_diag_nnz = HYPRE_ONEDPL_CALL( std::count_if, - B_ext_bigj, - B_ext_bigj + B_ext_nnz, - pred1 ); - B_ext_offd_nnz = B_ext_nnz - B_ext_diag_nnz; - - *B_ext_diag_nnz_ptr = B_ext_diag_nnz; - *B_ext_offd_nnz_ptr = B_ext_offd_nnz; - - return hypre_error_flag; - } - else { - B_ext_diag_nnz = *B_ext_diag_nnz_ptr; - B_ext_offd_nnz = *B_ext_offd_nnz_ptr; - } - - /* copy to diag */ - B_ext_diag_bigj = hypre_TAlloc(HYPRE_BigInt, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); - - if (B_ext_diag_xata) { - auto first = oneapi::dpl::make_zip_iterator(B_ext_ii, B_ext_bigj, B_ext_data, B_ext_xata); - auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, - first, /* first */ - first + B_ext_nnz, /* last */ - B_ext_bigj, /* stencil */ - oneapi::dpl::make_zip_iterator(B_ext_diag_ii, B_ext_diag_bigj, B_ext_diag_data, B_ext_diag_xata),/* result */ - pred1 ); - - //hypre_assert( std::get<0>(new_end.get_iterator_tuple() == B_ext_diag_ii + B_ext_diag_nnz ); - } - else { - auto first = oneapi::dpl::make_zip_iterator(B_ext_ii, B_ext_bigj, B_ext_data); - auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, - first, /* first */ - first + B_ext_nnz, /* last */ - B_ext_bigj, /* stencil */ - oneapi::dpl::make_zip_iterator(B_ext_diag_ii, B_ext_diag_bigj, B_ext_diag_data), /* result */ - pred1 ); - - //hypre_assert( std::get<0>(new_end.get_iterator_tuple()) == B_ext_diag_ii + B_ext_diag_nnz ); - } - - HYPRE_BigInt *const_iterator = hypre_TAlloc(HYPRE_BigInt, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); - hypre_HandleComputeStream(hypre_handle())->fill(const_iterator, first_col_diag_B, 
B_ext_diag_nnz*sizeof(HYPRE_BigInt)).wait(); - HYPRE_ONEDPL_CALL( std::transform, - B_ext_diag_bigj, - B_ext_diag_bigj + B_ext_diag_nnz, - const_iterator, //dpct::make_constant_iterator(first_col_diag_B), - B_ext_diag_j, - std::minus() ); - hypre_TFree(const_iterator, HYPRE_MEMORY_DEVICE); - - hypre_TFree(B_ext_diag_bigj, HYPRE_MEMORY_DEVICE); - - /* copy to offd */ - B_ext_offd_bigj = hypre_TAlloc(HYPRE_BigInt, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); - - if (B_ext_offd_xata) { - auto first = oneapi::dpl::make_zip_iterator(B_ext_ii, B_ext_bigj, B_ext_data, B_ext_xata); - auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, - first, /* first */ - first + B_ext_nnz, /* last */ - B_ext_bigj, /* stencil */ - oneapi::dpl::make_zip_iterator(B_ext_offd_ii, B_ext_offd_bigj, B_ext_offd_data, B_ext_offd_xata), /* result */ - std::not_fn(pred1) ); + HYPRE_Int B_ext_offd_nnz; + HYPRE_Int ierr; - // hypre_assert( std::get<0>(new_end.get_iterator_tuple()) == B_ext_offd_ii + B_ext_offd_nnz ); - } - else { - auto first = oneapi::dpl::make_zip_iterator(B_ext_ii, B_ext_bigj, B_ext_data); - auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, - first, /* first */ - first + B_ext_nnz, /* last */ - B_ext_bigj, /* stencil */ - oneapi::dpl::make_zip_iterator(B_ext_offd_ii, B_ext_offd_bigj, B_ext_offd_data), /* result */ - std::not_fn(pred1) ); + ierr = hypre_CSRMatrixSplitDevice_core( 0, + num_rows, + B_ext_nnz, + NULL, + hypre_CSRMatrixBigJ(B_ext), + NULL, + NULL, + first_col_diag_B, + last_col_diag_B, + num_cols_offd_B, + NULL, + NULL, + NULL, + NULL, + &B_ext_diag_nnz, + NULL, + NULL, + NULL, + NULL, + &B_ext_offd_nnz, + NULL, + NULL, + NULL, + NULL ); - // hypre_assert( std::get<0>(new_end.get_iterator_tuple()) == B_ext_offd_ii + B_ext_offd_nnz ); - } + HYPRE_Int *B_ext_diag_ii = hypre_TAlloc(HYPRE_Int, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); + HYPRE_Int *B_ext_diag_j = hypre_TAlloc(HYPRE_Int, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); + HYPRE_Complex *B_ext_diag_a = hypre_TAlloc(HYPRE_Complex, 
B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); - /* offd map of B_ext_offd Union col_map_offd_B */ - col_map_offd_C = hypre_TAlloc(HYPRE_BigInt, B_ext_offd_nnz + num_cols_offd_B, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(col_map_offd_C, B_ext_offd_bigj, HYPRE_BigInt, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(col_map_offd_C + B_ext_offd_nnz, col_map_offd_B, HYPRE_BigInt, num_cols_offd_B, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + HYPRE_Int *B_ext_offd_ii = hypre_TAlloc(HYPRE_Int, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); + HYPRE_Int *B_ext_offd_j = hypre_TAlloc(HYPRE_Int, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); + HYPRE_Complex *B_ext_offd_a = hypre_TAlloc(HYPRE_Complex, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); - HYPRE_ONEDPL_CALL( std::sort, - col_map_offd_C, - col_map_offd_C + B_ext_offd_nnz + num_cols_offd_B ); + ierr = hypre_CSRMatrixSplitDevice_core( 1, + num_rows, + B_ext_nnz, + B_ext_ii, + hypre_CSRMatrixBigJ(B_ext), + hypre_CSRMatrixData(B_ext), + NULL, + first_col_diag_B, + last_col_diag_B, + num_cols_offd_B, + col_map_offd_B, + map_B_to_C_ptr, + num_cols_offd_C_ptr, + col_map_offd_C_ptr, + &B_ext_diag_nnz, + B_ext_diag_ii, + B_ext_diag_j, + B_ext_diag_a, + NULL, + &B_ext_offd_nnz, + B_ext_offd_ii, + B_ext_offd_j, + B_ext_offd_a, + NULL ); - HYPRE_BigInt *new_end = HYPRE_ONEDPL_CALL( std::unique, - col_map_offd_C, - col_map_offd_C + B_ext_offd_nnz + num_cols_offd_B ); + hypre_TFree(B_ext_ii, HYPRE_MEMORY_DEVICE); - num_cols_offd_C = new_end - col_map_offd_C; + /* convert to row ptrs */ + HYPRE_Int *B_ext_diag_i = hypreDevice_CsrRowIndicesToPtrs(num_rows, B_ext_diag_nnz, B_ext_diag_ii); + HYPRE_Int *B_ext_offd_i = hypreDevice_CsrRowIndicesToPtrs(num_rows, B_ext_offd_nnz, B_ext_offd_ii); -#if 1 - HYPRE_BigInt *tmp = hypre_TAlloc(HYPRE_BigInt, num_cols_offd_C, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(tmp, col_map_offd_C, HYPRE_BigInt, num_cols_offd_C, HYPRE_MEMORY_DEVICE, - HYPRE_MEMORY_DEVICE); - hypre_TFree(col_map_offd_C, HYPRE_MEMORY_DEVICE); - 
col_map_offd_C = tmp; -#else - col_map_offd_C = hypre_TReAlloc_v2(col_map_offd_C, HYPRE_BigInt, B_ext_offd_nnz + num_cols_offd_B, - HYPRE_Int, num_cols_offd_C, HYPRE_MEMORY_DEVICE); -#endif + hypre_TFree(B_ext_diag_ii, HYPRE_MEMORY_DEVICE); + hypre_TFree(B_ext_offd_ii, HYPRE_MEMORY_DEVICE); - /* create map from col_map_offd_B */ - if (num_cols_offd_B) { - map_B_to_C = hypre_TAlloc(HYPRE_Int, num_cols_offd_B, HYPRE_MEMORY_DEVICE); - HYPRE_ONEDPL_CALL( oneapi::dpl::lower_bound, - col_map_offd_C, - col_map_offd_C + num_cols_offd_C, - col_map_offd_B, - col_map_offd_B + num_cols_offd_B, - map_B_to_C ); - } + /* create diag and offd CSR */ + hypre_CSRMatrix *B_ext_diag = hypre_CSRMatrixCreate(num_rows, last_col_diag_B - first_col_diag_B + 1, B_ext_diag_nnz); + hypre_CSRMatrix *B_ext_offd = hypre_CSRMatrixCreate(num_rows, *num_cols_offd_C_ptr, B_ext_offd_nnz); - HYPRE_ONEDPL_CALL( oneapi::dpl::lower_bound, - col_map_offd_C, - col_map_offd_C + num_cols_offd_C, - B_ext_offd_bigj, - B_ext_offd_bigj + B_ext_offd_nnz, - B_ext_offd_j ); + hypre_CSRMatrixI(B_ext_diag) = B_ext_diag_i; + hypre_CSRMatrixJ(B_ext_diag) = B_ext_diag_j; + hypre_CSRMatrixData(B_ext_diag) = B_ext_diag_a; + hypre_CSRMatrixNumNonzeros(B_ext_diag) = B_ext_diag_nnz; + hypre_CSRMatrixMemoryLocation(B_ext_diag) = HYPRE_MEMORY_DEVICE; - hypre_TFree(B_ext_offd_bigj, HYPRE_MEMORY_DEVICE); + hypre_CSRMatrixI(B_ext_offd) = B_ext_offd_i; + hypre_CSRMatrixJ(B_ext_offd) = B_ext_offd_j; + hypre_CSRMatrixData(B_ext_offd) = B_ext_offd_a; + hypre_CSRMatrixNumNonzeros(B_ext_offd) = B_ext_offd_nnz; + hypre_CSRMatrixMemoryLocation(B_ext_offd) = HYPRE_MEMORY_DEVICE; - if (map_B_to_C_ptr) { - *map_B_to_C_ptr = map_B_to_C; - } - *num_cols_offd_C_ptr = num_cols_offd_C; - *col_map_offd_C_ptr = col_map_offd_C; + *B_ext_diag_ptr = B_ext_diag; + *B_ext_offd_ptr = B_ext_offd; - return hypre_error_flag; + hypre_SyncDeviceComputeStream(hypre_handle()); + + return ierr; } 
/*-------------------------------------------------------------------------- @@ -2062,7 +761,7 @@ hypre_CSRMatrixAddPartialDevice( hypre_CSRMatrix *A, if (ncols_A != ncols_B) { - hypre_error_w_msg(HYPRE_ERROR_GENERIC,"Warning! incompatible matrix dimensions!\n"); + hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Warning! Incompatible matrix dimensions!\n"); return NULL; } @@ -2092,15 +791,15 @@ hypre_CSRMatrixColNNzRealDevice( hypre_CSRMatrix *A, HYPRE_Int num_reduced_col_indices; HYPRE_Int *reduced_col_indices; HYPRE_Int *reduced_col_nnz; + reduced_col_indices = hypre_TAlloc(HYPRE_Int, ncols_A, HYPRE_MEMORY_DEVICE); + reduced_col_nnz = hypre_TAlloc(HYPRE_Int, ncols_A, HYPRE_MEMORY_DEVICE); A_j_sorted = hypre_TAlloc(HYPRE_Int, nnz_A, HYPRE_MEMORY_DEVICE); hypre_TMemcpy(A_j_sorted, A_j, HYPRE_Int, nnz_A, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - HYPRE_ONEDPL_CALL(std::sort, A_j_sorted, A_j_sorted + nnz_A); - reduced_col_indices = hypre_TAlloc(HYPRE_Int, ncols_A, HYPRE_MEMORY_DEVICE); - reduced_col_nnz = hypre_TAlloc(HYPRE_Int, ncols_A, HYPRE_MEMORY_DEVICE); +#ifdef HYPRE_USING_SYCL + HYPRE_ONEDPL_CALL(std::sort, A_j_sorted, A_j_sorted + nnz_A); - // ABB: Replace values in-place with dpct::make_constant_iterator(1) HYPRE_Int* values = hypre_TAlloc(HYPRE_Int, nnz_A, HYPRE_MEMORY_UNIFIED); hypre_HandleComputeStream(hypre_handle())->fill(values, 1, nnz_A*sizeof(HYPRE_Int)).wait(); std::pair new_end = @@ -2109,263 +808,534 @@ hypre_CSRMatrixColNNzRealDevice( hypre_CSRMatrix *A, reduced_col_indices, reduced_col_nnz ); + hypre_TFree(values, HYPRE_MEMORY_UNIFIED); +#else + HYPRE_THRUST_CALL(sort, A_j_sorted, A_j_sorted + nnz_A); + + thrust::pair new_end = + HYPRE_THRUST_CALL(reduce_by_key, A_j_sorted, A_j_sorted + nnz_A, + thrust::make_constant_iterator(1), + reduced_col_indices, + reduced_col_nnz); +#endif + hypre_assert(new_end.first - reduced_col_indices == new_end.second - reduced_col_nnz); num_reduced_col_indices = new_end.first - reduced_col_indices; hypre_Memset(colnnz, 
0, ncols_A * sizeof(HYPRE_Real), HYPRE_MEMORY_DEVICE); +#ifdef HYPRE_USING_SYCL HYPRE_ONEDPL_CALL( oneapi::dpl::copy, reduced_col_nnz, reduced_col_nnz + num_reduced_col_indices, oneapi::dpl::make_permutation_iterator(colnnz, reduced_col_indices) ); +#else + HYPRE_THRUST_CALL(scatter, reduced_col_nnz, reduced_col_nnz + num_reduced_col_indices, + reduced_col_indices, colnnz); +#endif hypre_TFree(A_j_sorted, HYPRE_MEMORY_DEVICE); hypre_TFree(reduced_col_indices, HYPRE_MEMORY_DEVICE); hypre_TFree(reduced_col_nnz, HYPRE_MEMORY_DEVICE); - hypre_TFree(values, HYPRE_MEMORY_UNIFIED); hypre_SyncDeviceComputeStream(hypre_handle()); return hypre_error_flag; } -HYPRE_Int -hypre_CSRMatrixRemoveDiagonalDevice(hypre_CSRMatrix *A) +__global__ void +hypreGPUKernel_CSRMoveDiagFirst( + #ifdef HYPRE_USING_SYCL + sycl::nd_item<1>& item, + #endif + HYPRE_Int nrows, + HYPRE_Int *ia, + HYPRE_Int *ja, + HYPRE_Complex *aa ) { - HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); - HYPRE_Int nnz = hypre_CSRMatrixNumNonzeros(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_ii = hypreDevice_CsrRowPtrsToIndices(nrows, nnz, A_i); - HYPRE_Int new_nnz; - HYPRE_Int *new_ii; - HYPRE_Int *new_j; - HYPRE_Complex *new_data; +#ifdef HYPRE_USING_SYCL + HYPRE_Int row = hypre_gpu_get_grid_warp_id<1,1>(item); + HYPRE_Int lane = hypre_gpu_get_lane_id<1>(item); + sycl::sub_group SG = item.get_sub_group(); +#else + HYPRE_Int row = hypre_cuda_get_grid_warp_id<1,1>(); + HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); +#endif - auto zipped_begin = oneapi::dpl::make_zip_iterator(A_ii, A_j); - new_nnz = HYPRE_ONEDPL_CALL( std::count_if, - zipped_begin, zipped_begin + nnz, - [](auto t) { return std::get<0>(t) != std::get<1>(t); } ); + if (row >= nrows) + { + return; + } - if (new_nnz == nnz) + HYPRE_Int p = 0, q = 0; + + if (lane < 2) { - /* no diagonal entries found */ - hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); - return 
hypre_error_flag; + p = read_only_load(ia + row + lane); } +#ifdef HYPRE_USING_SYCL + q = SG.shuffle(p, 1); + p = SG.shuffle(p, 0); - new_ii = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); - new_j = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); + for (HYPRE_Int j = p + lane + 1; sycl::any_of_group(SG, j < q); j += SG.get_local_range().get(0)) +#else + q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); + p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); - if (A_data) + for (HYPRE_Int j = p + lane + 1; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) +#endif { - new_data = hypre_TAlloc(HYPRE_Complex, new_nnz, HYPRE_MEMORY_DEVICE); + hypre_int find_diag = j < q && ja[j] == row; - auto first = oneapi::dpl::make_zip_iterator(A_ii, A_j, A_data); - auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, - first, first + nnz, - oneapi::dpl::make_zip_iterator(A_ii, A_j), - oneapi::dpl::make_zip_iterator(new_ii, new_j, new_data), - [](auto t) { return std::get<0>(t) != std::get<1>(t); } ); + if (find_diag) + { + ja[j] = ja[p]; + ja[p] = row; + HYPRE_Complex tmp = aa[p]; + aa[p] = aa[j]; + aa[j] = tmp; + } - // todo: fix this - // hypre_assert( std::get<0>(*new_end) == new_ii + new_nnz ); +#ifdef HYPRE_USING_SYCL + if ( sycl::any_of_group(SG, find_diag) ) +#else + if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) +#endif + { + break; + } } - else +} + +/* check if diagonal entry is the first one at each row + * Return: the number of rows that do not have the first entry as diagonal + * RL: only check if it's a non-empty row + */ +__global__ void +hypreGPUKernel_CSRCheckDiagFirst( +#ifdef HYPRE_USING_SYCL + sycl::nd_item<1>& item, +#endif + HYPRE_Int nrows, + HYPRE_Int *ia, + HYPRE_Int *ja, + HYPRE_Int *result ) +{ +#ifdef HYPRE_USING_SYCL + const HYPRE_Int row = hypre_gpu_get_grid_thread_id<1,1>(item); +#else + const HYPRE_Int row = hypre_cuda_get_grid_thread_id<1,1>(); +#endif + if (row < nrows) + { + result[row] = (ia[row+1] > ia[row]) && (ja[ia[row]] != 
row); + } +} + +__global__ void +hypreGPUKernel_CSRMatrixFixZeroDiagDevice( + #ifdef HYPRE_USING_SYCL + sycl::nd_item<1>& item, + #endif + HYPRE_Complex v, + HYPRE_Int nrows, + HYPRE_Int *ia, + HYPRE_Int *ja, + HYPRE_Complex *data, + HYPRE_Real tol, + HYPRE_Int *result ) +{ +#ifdef HYPRE_USING_SYCL + const HYPRE_Int row = hypre_gpu_get_grid_warp_id<1,1>(item); + HYPRE_Int lane = hypre_gpu_get_lane_id<1>(item); + sycl::sub_group SG = item.get_sub_group(); +#else + const HYPRE_Int row = hypre_cuda_get_grid_warp_id<1,1>(); + HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); +#endif + + if (row >= nrows) { - new_data = NULL; + return; + } - auto first = oneapi::dpl::make_zip_iterator(A_ii, A_j); - auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, - first, first + nnz, - first, - oneapi::dpl::make_zip_iterator(new_ii, new_j), - [](auto t) { return std::get<0>(t) != std::get<1>(t); } ); + HYPRE_Int p = 0, q = 0; + bool has_diag = false; - // todo: fix this - // hypre_assert( std::get<0>(*new_end) == new_ii + new_nnz ); + if (lane < 2) + { + p = read_only_load(ia + row + lane); } - hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); - hypre_TFree(A_i, HYPRE_MEMORY_DEVICE); - hypre_TFree(A_j, HYPRE_MEMORY_DEVICE); - hypre_TFree(A_data, HYPRE_MEMORY_DEVICE); +#ifdef HYPRE_USING_SYCL + q = SG.shuffle(p, 1); + p = SG.shuffle(p, 0); - hypre_CSRMatrixNumNonzeros(A) = new_nnz; - hypre_CSRMatrixI(A) = hypreDevice_CsrRowIndicesToPtrs(nrows, new_nnz, new_ii); - hypre_CSRMatrixJ(A) = new_j; - hypre_CSRMatrixData(A) = new_data; - hypre_TFree(new_ii, HYPRE_MEMORY_DEVICE); + for (HYPRE_Int j = p + lane; sycl::any_of_group(SG, j < q); j += SG.get_local_range().get(0)) +#else + q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); + p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); - return hypre_error_flag; -} + for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) +#endif + { + hypre_int find_diag = j < q && read_only_load(&ja[j]) == row; -/* return C = [A; B] */ 
-hypre_CSRMatrix* -hypre_CSRMatrixStack2Device(hypre_CSRMatrix *A, hypre_CSRMatrix *B) -{ - hypre_assert( hypre_CSRMatrixNumCols(A) == hypre_CSRMatrixNumCols(B) ); + if (find_diag) + { + if (fabs(data[j]) <= tol) + { + data[j] = v; + } + } - hypre_CSRMatrix *C = hypre_CSRMatrixCreate( hypre_CSRMatrixNumRows(A) + hypre_CSRMatrixNumRows(B), - hypre_CSRMatrixNumCols(A), - hypre_CSRMatrixNumNonzeros(A) + hypre_CSRMatrixNumNonzeros(B) ); +#ifdef HYPRE_USING_SYCL + if ( sycl::any_of_group(SG, find_diag) ) +#else + if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) +#endif + { + has_diag = true; + break; + } + } - HYPRE_Int *C_i = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(C) + 1, HYPRE_MEMORY_DEVICE); - HYPRE_Int *C_j = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumNonzeros(C), HYPRE_MEMORY_DEVICE); - HYPRE_Complex *C_a = hypre_TAlloc(HYPRE_Complex, hypre_CSRMatrixNumNonzeros(C), HYPRE_MEMORY_DEVICE); + if (result && !has_diag && lane == 0) + { + result[row] = 1; + } +} - hypre_TMemcpy(C_i, hypre_CSRMatrixI(A), HYPRE_Int, hypre_CSRMatrixNumRows(A) + 1, - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(C_i + hypre_CSRMatrixNumRows(A) + 1, hypre_CSRMatrixI(B) + 1, HYPRE_Int, hypre_CSRMatrixNumRows(B), - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); +__global__ void +hypreGPUKernel_CSRMatrixReplaceDiagDevice( + #ifdef HYPRE_USING_SYCL + sycl::nd_item<1>& item, + #endif + HYPRE_Complex *new_diag, + HYPRE_Complex v, + HYPRE_Int nrows, + HYPRE_Int *ia, + HYPRE_Int *ja, + HYPRE_Complex *data, + HYPRE_Real tol, + HYPRE_Int *result ) +{ +#ifdef HYPRE_USING_SYCL + const HYPRE_Int row = hypre_gpu_get_grid_warp_id<1,1>(item); + HYPRE_Int lane = hypre_gpu_get_lane_id<1>(item); + sycl::sub_group SG = item.get_sub_group(); +#else + const HYPRE_Int row = hypre_cuda_get_grid_warp_id<1,1>(); + HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); +#endif + if (row >= nrows) + { + return; + } - HYPRE_Int *const_iterator = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(C) + 1, 
HYPRE_MEMORY_DEVICE); - hypre_HandleComputeStream(hypre_handle())->fill(const_iterator, hypre_CSRMatrixNumNonzeros(A), (hypre_CSRMatrixNumRows(C) + 1)*sizeof(HYPRE_Int)).wait(); - HYPRE_ONEDPL_CALL( std::transform, - C_i + hypre_CSRMatrixNumRows(A) + 1, - C_i + hypre_CSRMatrixNumRows(C) + 1, - const_iterator, //dpct::make_constant_iterator(hypre_CSRMatrixNumNonzeros(A)), - C_i + hypre_CSRMatrixNumRows(A) + 1, - std::plus() ); - hypre_TFree(const_iterator, HYPRE_MEMORY_DEVICE); + HYPRE_Int p = 0, q = 0; + bool has_diag = false; + if (lane < 2) + { + p = read_only_load(ia + row + lane); + } +#ifdef HYPRE_USING_SYCL + q = SG.shuffle(p, 1); + p = SG.shuffle(p, 0); - hypre_TMemcpy(C_j, hypre_CSRMatrixJ(A), HYPRE_Int, hypre_CSRMatrixNumNonzeros(A), - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(C_j + hypre_CSRMatrixNumNonzeros(A), hypre_CSRMatrixJ(B), HYPRE_Int, hypre_CSRMatrixNumNonzeros(B), - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + for (HYPRE_Int j = p + lane; sycl::any_of_group(SG, j < q); j += SG.get_local_range().get(0)) +#else + q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); + p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); - hypre_TMemcpy(C_a, hypre_CSRMatrixData(A), HYPRE_Complex, hypre_CSRMatrixNumNonzeros(A), - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(C_a + hypre_CSRMatrixNumNonzeros(A), hypre_CSRMatrixData(B), HYPRE_Complex, hypre_CSRMatrixNumNonzeros(B), - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) +#endif + { + hypre_int find_diag = j < q && read_only_load(&ja[j]) == row; - hypre_CSRMatrixI(C) = C_i; - hypre_CSRMatrixJ(C) = C_j; - hypre_CSRMatrixData(C) = C_a; - hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE; + if (find_diag) + { + HYPRE_Complex d = read_only_load(&new_diag[row]); + if (fabs(d) <= tol) + { + d = v; + } + data[j] = d; + } - return C; +#ifdef HYPRE_USING_SYCL + if ( sycl::any_of_group(SG, find_diag) ) +#else + 
if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) +#endif + { + has_diag = true; + break; + } + } + + if (result && !has_diag && lane == 0) + { + result[row] = 1; + } } -/* A = alp * I */ -hypre_CSRMatrix * -hypre_CSRMatrixIdentityDevice(HYPRE_Int n, HYPRE_Complex alp) +/* type == 0, sum, + * 1, abs sum (l-1) + * 2, square sum (l-2) + */ +template +__global__ void +hypreGPUKernel_CSRRowSum( + #ifdef HYPRE_USING_SYCL + sycl::nd_item<1>& item, + #endif + HYPRE_Int nrows, + HYPRE_Int *ia, + HYPRE_Int *ja, + HYPRE_Complex *aa, + HYPRE_Int *CF_i, + HYPRE_Int *CF_j, + HYPRE_Complex *row_sum, + HYPRE_Complex scal, + HYPRE_Int set) { - hypre_CSRMatrix *A = hypre_CSRMatrixCreate(n, n, n); +#ifdef HYPRE_USING_SYCL + HYPRE_Int row_i = hypre_gpu_get_grid_warp_id<1,1>(item); + HYPRE_Int lane = hypre_gpu_get_lane_id<1>(item); + sycl::sub_group SG = item.get_sub_group(); +#else + HYPRE_Int row_i = hypre_cuda_get_grid_warp_id<1,1>(); + HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); +#endif + if (row_i >= nrows) + { + return; + } - hypre_CSRMatrixInitialize_v2(A, 0, HYPRE_MEMORY_DEVICE); + HYPRE_Int p = 0, q = 0; - HYPRE_ONEDPL_CALL( dpct::iota, - hypre_CSRMatrixI(A), - hypre_CSRMatrixI(A) + n + 1, - 0 ); + if (lane < 2) + { + p = read_only_load(ia + row_i + lane); + } - HYPRE_ONEDPL_CALL( dpct::iota, - hypre_CSRMatrixJ(A), - hypre_CSRMatrixJ(A) + n, - 0 ); + HYPRE_Complex row_sum_i = 0.0; - HYPRE_ONEDPL_CALL( std::fill, - hypre_CSRMatrixData(A), - hypre_CSRMatrixData(A) + n, - alp ); +#ifdef HYPRE_USING_SYCL + q = SG.shuffle(p, 1); + p = SG.shuffle(p, 0); - return A; -} + for (HYPRE_Int j = p + lane; sycl::any_of_group(SG, j < q); j += SG.get_local_range().get(0)) +#else + q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); + p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); -/* this predicate compares first and second element in a tuple in absolute value */ -/* first is assumed to be complex, second to be real > 0 */ -struct cabsfirst_greaterthan_second_pred -{ - bool operator()(const 
std::tuple& t) const + for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) +#endif + { + if ( j >= q || (CF_i && CF_j && read_only_load(&CF_i[row_i]) != read_only_load(&CF_j[ja[j]])) ) { - const HYPRE_Complex i = std::get<0>(t); - const HYPRE_Real j = std::get<1>(t); + continue; + } - return hypre_cabs(i) > j; + HYPRE_Complex aii = aa[j]; + + if (type == 0) + { + row_sum_i += aii; } -}; + else if (type == 1) + { + row_sum_i += fabs(aii); + } + else if (type == 2) + { + row_sum_i += aii * aii; + } + } +#ifdef HYPRE_USING_SYCL + row_sum_i = warp_reduce_sum(row_sum_i, item); +#else + row_sum_i = warp_reduce_sum(row_sum_i); +#endif + if (lane == 0) + { + if (set) + { + row_sum[row_i] = scal * row_sum_i; + } + else + { + row_sum[row_i] += scal * row_sum_i; + } + } +} -/* drop the entries that are smaller than: - * tol if elmt_tols == null, - * elmt_tols[j] otherwise where j = 0...NumNonzeros(A) */ -HYPRE_Int -hypre_CSRMatrixDropSmallEntriesDevice( hypre_CSRMatrix *A, - HYPRE_Real tol, - HYPRE_Real *elmt_tols) +/* type 0: diag + * 1: abs diag + * 2: diag inverse + * 3: diag inverse sqrt + * 4: abs diag inverse sqrt + */ +__global__ void +hypreGPUKernel_CSRExtractDiag( + #ifdef HYPRE_USING_SYCL + sycl::nd_item<1>& item, + #endif + HYPRE_Int nrows, + HYPRE_Int *ia, + HYPRE_Int *ja, + HYPRE_Complex *aa, + HYPRE_Complex *d, + HYPRE_Int type) { - HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); - HYPRE_Int nnz = hypre_CSRMatrixNumNonzeros(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_ii = NULL; - HYPRE_Int new_nnz = 0; - HYPRE_Int *new_ii; - HYPRE_Int *new_j; - HYPRE_Complex *new_data; +#ifdef HYPRE_USING_SYCL + HYPRE_Int row = hypre_gpu_get_grid_warp_id<1,1>(item); + HYPRE_Int lane = hypre_gpu_get_lane_id<1>(item); + sycl::sub_group SG = item.get_sub_group(); +#else + HYPRE_Int row = hypre_cuda_get_grid_warp_id<1,1>(); + HYPRE_Int lane = 
hypre_cuda_get_lane_id<1>(); +#endif + if (row >= nrows) + { + return; + } - if (elmt_tols == NULL) + HYPRE_Int p = 0, q = 0; + + if (lane < 2) { - new_nnz = HYPRE_ONEDPL_CALL( std::count_if, - A_data, - A_data + nnz, - std::not_fn(less_than(tol)) ); + p = read_only_load(ia + row + lane); } - else + HYPRE_Int has_diag = 0; +#ifdef HYPRE_USING_SYCL + q = SG.shuffle(p, 1); + p = SG.shuffle(p, 0); + + for (HYPRE_Int j = p + lane; sycl::any_of_group(SG, j < q); j += SG.get_local_range().get(0)) +#else + q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); + p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); + + for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) +#endif { - auto first = oneapi::dpl::make_zip_iterator(A_data, elmt_tols); - new_nnz = HYPRE_ONEDPL_CALL( std::count_if, - first, - first + nnz, - cabsfirst_greaterthan_second_pred() ); + hypre_int find_diag = j < q && ja[j] == row; + + if (find_diag) + { + if (type == 0) + { + d[row] = aa[j]; + } + else if (type == 1) + { + d[row] = fabs(aa[j]); + } + else if (type == 2) + { + d[row] = 1.0 / aa[j]; + } + else if (type == 3) + { + d[row] = 1.0 / sqrt(aa[j]); + } + else if (type == 4) + { + d[row] = 1.0 / sqrt(fabs(aa[j])); + } + } + +#ifdef HYPRE_USING_SYCL + if ( sycl::any_of_group(SG, find_diag) ) +#else + if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) +#endif + { + has_diag = 1; + break; + } } - if (new_nnz == nnz) + if (!has_diag && lane == 0) { - hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); - return hypre_error_flag; + d[row] = 0.0; } +} - if (!A_ii) +/* mark is of size nA + * diag_option: 1: special treatment for diag entries, mark as -2 + */ +__global__ void +hypreGPUKernel_CSRMatrixIntersectPattern( + #ifdef HYPRE_USING_SYCL + sycl::nd_item<1>& item, + #endif + HYPRE_Int n, + HYPRE_Int nA, + HYPRE_Int *rowid, + HYPRE_Int *colid, + HYPRE_Int *idx, + HYPRE_Int *mark, + HYPRE_Int diag_option) +{ +#ifdef HYPRE_USING_SYCL + HYPRE_Int i = hypre_gpu_get_grid_thread_id<1,1>(item); 
+#else + HYPRE_Int i = hypre_cuda_get_grid_thread_id<1,1>(); +#endif + + if (i >= n) { - A_ii = hypreDevice_CsrRowPtrsToIndices(nrows, nnz, A_i); + return; } - new_ii = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); - new_j = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); - new_data = hypre_TAlloc(HYPRE_Complex, new_nnz, HYPRE_MEMORY_DEVICE); - oneapi::dpl::zip_iterator< HYPRE_Int*, HYPRE_Int*, HYPRE_Complex* > new_end; + HYPRE_Int r1 = read_only_load(&rowid[i]); + HYPRE_Int c1 = read_only_load(&colid[i]); + HYPRE_Int j = read_only_load(&idx[i]); - if (elmt_tols == NULL) + if (0 == diag_option) { - auto first = oneapi::dpl::make_zip_iterator(A_ii, A_j, A_data); - new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, - first, first + nnz, - A_data, - oneapi::dpl::make_zip_iterator(new_ii, new_j, new_data), - std::not_fn(less_than(tol)) ); + if (j < nA) + { + HYPRE_Int r2 = i < n - 1 ? read_only_load(&rowid[i + 1]) : -1; + HYPRE_Int c2 = i < n - 1 ? read_only_load(&colid[i + 1]) : -1; + if (r1 == r2 && c1 == c2) + { + mark[j] = c1; + } + else + { + mark[j] = -1; + } + } } - else + else if (1 == diag_option) { - auto first = oneapi::dpl::make_zip_iterator(A_ii, A_j, A_data); - new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, - first, first + nnz, - oneapi::dpl::make_zip_iterator(A_data, elmt_tols), - oneapi::dpl::make_zip_iterator(new_ii, new_j, new_data), - cabsfirst_greaterthan_second_pred() ); + if (j < nA) + { + if (r1 == c1) + { + mark[j] = -2; + } + else + { + HYPRE_Int r2 = i < n - 1 ? read_only_load(&rowid[i + 1]) : -1; + HYPRE_Int c2 = i < n - 1 ? 
read_only_load(&colid[i + 1]) : -1; + if (r1 == r2 && c1 == c2) + { + mark[j] = c1; + } + else + { + mark[j] = -1; + } + } + } } - - // todo: fix this - // hypre_assert( thrust::get<0>(*new_end) == new_ii + new_nnz ); - return hypre_error_flag; } -#endif /* HYPRE_USING_SYCL */ - - -#if defined(HYPRE_USING_GPU) - /* For square A, find numerical zeros (absolute values <= tol) on its diagonal and replace with v * Does NOT assume diagonal is the first entry of each row of A * In debug mode: @@ -2432,33 +1402,133 @@ hypre_CSRMatrixReplaceDiagDevice( hypre_CSRMatrix *A, dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "warp", bDim); -#if HYPRE_DEBUG - HYPRE_Int *result = hypre_CTAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); -#else - HYPRE_Int *result = NULL; +#if HYPRE_DEBUG + HYPRE_Int *result = hypre_CTAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); +#else + HYPRE_Int *result = NULL; +#endif + + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRMatrixReplaceDiagDevice, gDim, bDim, + new_diag, v, hypre_CSRMatrixNumRows(A), + hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), hypre_CSRMatrixData(A), + tol, result ); + +#if HYPRE_DEBUG +#if defined(HYPRE_USING_CUDA) + ierr = HYPRE_THRUST_CALL( reduce, + result, + result + hypre_CSRMatrixNumRows(A) ); +#elif defined(HYPRE_USING_SYCL) + ierr = HYPRE_ONEDPL_CALL( oneapi::dpl::reduce, + result, + result + hypre_CSRMatrixNumRows(A) ); +#endif + hypre_TFree(result, HYPRE_MEMORY_DEVICE); +#endif // HYPRE_DEBUG + + hypre_SyncDeviceComputeStream(hypre_handle()); + + return ierr; +} + +HYPRE_Int +hypre_CSRMatrixRemoveDiagonalDevice(hypre_CSRMatrix *A) +{ + HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); + HYPRE_Int nnz = hypre_CSRMatrixNumNonzeros(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_ii = hypreDevice_CsrRowPtrsToIndices(nrows, nnz, 
A_i); + HYPRE_Int new_nnz; + HYPRE_Int *new_ii; + HYPRE_Int *new_j; + HYPRE_Complex *new_data; + +#ifdef HYPRE_USING_SYCL + auto zipped_begin = oneapi::dpl::make_zip_iterator(A_ii, A_j); + new_nnz = HYPRE_ONEDPL_CALL( std::count_if, + zipped_begin, zipped_begin + nnz, + [](auto t) { return std::get<0>(t) != std::get<1>(t); } ); +#else + new_nnz = HYPRE_THRUST_CALL( count_if, + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)) + nnz, + Int2Unequal() ); +#endif + + if (new_nnz == nnz) + { + /* no diagonal entries found */ + hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); + return hypre_error_flag; + } + + new_ii = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); + new_j = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); + + if (A_data) + { + new_data = hypre_TAlloc(HYPRE_Complex, new_nnz, HYPRE_MEMORY_DEVICE); + +#ifdef HYPRE_USING_SYCL + auto first = oneapi::dpl::make_zip_iterator(A_ii, A_j, A_data); + auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, + first, first + nnz, + oneapi::dpl::make_zip_iterator(A_ii, A_j), + oneapi::dpl::make_zip_iterator(new_ii, new_j, new_data), + [](auto t) { return std::get<0>(t) != std::get<1>(t); } ); + // todo: fix this + // hypre_assert( std::get<0>(*new_end) == new_ii + new_nnz ); +#else + thrust::zip_iterator< thrust::tuple > new_end; + new_end = HYPRE_THRUST_CALL( copy_if, + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)), + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)) + nnz, + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), + thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j, new_data)), + Int2Unequal() ); + + hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == new_ii + new_nnz ); +#endif + } + else + { + new_data = NULL; +#ifdef HYPRE_USING_SYCL + auto first = oneapi::dpl::make_zip_iterator(A_ii, A_j); + auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, + first, first + nnz, + first, + 
oneapi::dpl::make_zip_iterator(new_ii, new_j), + [](auto t) { return std::get<0>(t) != std::get<1>(t); } ); + // TODO: abb fix this + // hypre_assert( std::get<0>(*new_end) == new_ii + new_nnz ); +#else + thrust::zip_iterator< thrust::tuple > new_end; + new_end = HYPRE_THRUST_CALL( copy_if, + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)) + nnz, + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), + thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j)), + Int2Unequal() ); + + hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == new_ii + new_nnz ); #endif + } - HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRMatrixReplaceDiagDevice, gDim, bDim, - new_diag, v, hypre_CSRMatrixNumRows(A), - hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), hypre_CSRMatrixData(A), - tol, result ); - -#if HYPRE_DEBUG -#if defined(HYPRE_USING_CUDA) - ierr = HYPRE_THRUST_CALL( reduce, - result, - result + hypre_CSRMatrixNumRows(A) ); -#elif defined(HYPRE_USING_SYCL) - ierr = HYPRE_ONEDPL_CALL( oneapi::dpl::reduce, - result, - result + hypre_CSRMatrixNumRows(A) ); -#endif - hypre_TFree(result, HYPRE_MEMORY_DEVICE); -#endif // HYPRE_DEBUG + hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_i, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_j, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_data, HYPRE_MEMORY_DEVICE); - hypre_SyncDeviceComputeStream(hypre_handle()); + hypre_CSRMatrixNumNonzeros(A) = new_nnz; + hypre_CSRMatrixI(A) = hypreDevice_CsrRowIndicesToPtrs(nrows, new_nnz, new_ii); + hypre_CSRMatrixJ(A) = new_j; + hypre_CSRMatrixData(A) = new_data; + hypre_TFree(new_ii, HYPRE_MEMORY_DEVICE); - return ierr; + return hypre_error_flag; } HYPRE_Int @@ -2583,6 +1653,232 @@ hypre_CSRMatrixExtractDiagonalDevice( hypre_CSRMatrix *A, hypre_SyncDeviceComputeStream(hypre_handle()); } +/* return C = [A; B] */ +hypre_CSRMatrix* +hypre_CSRMatrixStack2Device(hypre_CSRMatrix *A, hypre_CSRMatrix *B) +{ + hypre_assert( hypre_CSRMatrixNumCols(A) 
== hypre_CSRMatrixNumCols(B) ); + + hypre_CSRMatrix *C = hypre_CSRMatrixCreate( hypre_CSRMatrixNumRows(A) + hypre_CSRMatrixNumRows(B), + hypre_CSRMatrixNumCols(A), + hypre_CSRMatrixNumNonzeros(A) + hypre_CSRMatrixNumNonzeros(B) ); + + HYPRE_Int *C_i = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(C) + 1, HYPRE_MEMORY_DEVICE); + HYPRE_Int *C_j = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumNonzeros(C), HYPRE_MEMORY_DEVICE); + HYPRE_Complex *C_a = hypre_TAlloc(HYPRE_Complex, hypre_CSRMatrixNumNonzeros(C), HYPRE_MEMORY_DEVICE); + + hypre_TMemcpy(C_i, hypre_CSRMatrixI(A), HYPRE_Int, hypre_CSRMatrixNumRows(A) + 1, + HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(C_i + hypre_CSRMatrixNumRows(A) + 1, hypre_CSRMatrixI(B) + 1, HYPRE_Int, hypre_CSRMatrixNumRows(B), + HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + +#ifdef HYPRE_USING_SYCL + HYPRE_Int *const_iterator = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(C) + 1, HYPRE_MEMORY_DEVICE); + hypre_HandleComputeStream(hypre_handle())->fill(const_iterator, hypre_CSRMatrixNumNonzeros(A), (hypre_CSRMatrixNumRows(C) + 1)*sizeof(HYPRE_Int)).wait(); + + HYPRE_ONEDPL_CALL( std::transform, + C_i + hypre_CSRMatrixNumRows(A) + 1, + C_i + hypre_CSRMatrixNumRows(C) + 1, + const_iterator, //dpct::make_constant_iterator(hypre_CSRMatrixNumNonzeros(A)), + C_i + hypre_CSRMatrixNumRows(A) + 1, + std::plus() ); + + hypre_TFree(const_iterator, HYPRE_MEMORY_DEVICE); +#else + HYPRE_THRUST_CALL( transform, + C_i + hypre_CSRMatrixNumRows(A) + 1, + C_i + hypre_CSRMatrixNumRows(C) + 1, + thrust::make_constant_iterator(hypre_CSRMatrixNumNonzeros(A)), + C_i + hypre_CSRMatrixNumRows(A) + 1, + thrust::plus() ); +#endif + + hypre_TMemcpy(C_j, hypre_CSRMatrixJ(A), HYPRE_Int, hypre_CSRMatrixNumNonzeros(A), + HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(C_j + hypre_CSRMatrixNumNonzeros(A), hypre_CSRMatrixJ(B), HYPRE_Int, hypre_CSRMatrixNumNonzeros(B), + HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + + hypre_TMemcpy(C_a, 
hypre_CSRMatrixData(A), HYPRE_Complex, hypre_CSRMatrixNumNonzeros(A), + HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(C_a + hypre_CSRMatrixNumNonzeros(A), hypre_CSRMatrixData(B), HYPRE_Complex, hypre_CSRMatrixNumNonzeros(B), + HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + + hypre_CSRMatrixI(C) = C_i; + hypre_CSRMatrixJ(C) = C_j; + hypre_CSRMatrixData(C) = C_a; + hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE; + + return C; +} + +/* A = alp * I */ +hypre_CSRMatrix * +hypre_CSRMatrixIdentityDevice(HYPRE_Int n, HYPRE_Complex alp) +{ + hypre_CSRMatrix *A = hypre_CSRMatrixCreate(n, n, n); + + hypre_CSRMatrixInitialize_v2(A, 0, HYPRE_MEMORY_DEVICE); + +#ifdef HYPRE_USING_SYCL + HYPRE_ONEDPL_CALL( dpct::iota, + hypre_CSRMatrixI(A), + hypre_CSRMatrixI(A) + n + 1, + 0 ); + + HYPRE_ONEDPL_CALL( dpct::iota, + hypre_CSRMatrixJ(A), + hypre_CSRMatrixJ(A) + n, + 0 ); + + HYPRE_ONEDPL_CALL( std::fill, + hypre_CSRMatrixData(A), + hypre_CSRMatrixData(A) + n, + alp ); +#else + HYPRE_THRUST_CALL( sequence, + hypre_CSRMatrixI(A), + hypre_CSRMatrixI(A) + n + 1, + 0 ); + + HYPRE_THRUST_CALL( sequence, + hypre_CSRMatrixJ(A), + hypre_CSRMatrixJ(A) + n, + 0 ); + + HYPRE_THRUST_CALL( fill, + hypre_CSRMatrixData(A), + hypre_CSRMatrixData(A) + n, + alp ); +#endif + return A; +} + + +/* drop the entries that are smaller than: + * tol if elmt_tols == null, + * elmt_tols[j] otherwise where j = 0...NumNonzeros(A) */ +HYPRE_Int +hypre_CSRMatrixDropSmallEntriesDevice( hypre_CSRMatrix *A, + HYPRE_Real tol, + HYPRE_Real *elmt_tols) +{ + HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); + HYPRE_Int nnz = hypre_CSRMatrixNumNonzeros(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_ii = NULL; + HYPRE_Int new_nnz = 0; + HYPRE_Int *new_ii; + HYPRE_Int *new_j; + HYPRE_Complex *new_data; + + if (elmt_tols == NULL) + { +#ifdef HYPRE_USING_SYCL + new_nnz = HYPRE_ONEDPL_CALL( std::count_if, + 
A_data, + A_data + nnz, + std::not_fn(less_than(tol)) ); +#else + new_nnz = HYPRE_THRUST_CALL( count_if, + A_data, + A_data + nnz, + thrust::not1(less_than(tol)) ); +#endif + } + else + { +#ifdef HYPRE_USING_SYCL + auto first = oneapi::dpl::make_zip_iterator(A_data, elmt_tols); + new_nnz = HYPRE_ONEDPL_CALL( std::count_if, + first, + first + nnz, + cabsfirst_greaterthan_second_pred() ); +#else + new_nnz = HYPRE_THRUST_CALL( count_if, + thrust::make_zip_iterator(thrust::make_tuple(A_data, elmt_tols)), + thrust::make_zip_iterator(thrust::make_tuple(A_data, elmt_tols)) + nnz, + cabsfirst_greaterthan_second_pred() ); +#endif + } + + if (new_nnz == nnz) + { + hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); + return hypre_error_flag; + } + + if (!A_ii) + { + A_ii = hypreDevice_CsrRowPtrsToIndices(nrows, nnz, A_i); + } + new_ii = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); + new_j = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); + new_data = hypre_TAlloc(HYPRE_Complex, new_nnz, HYPRE_MEMORY_DEVICE); + +#ifdef HYPRE_USING_SYCL + oneapi::dpl::zip_iterator< HYPRE_Int*, HYPRE_Int*, HYPRE_Complex* > new_end; + auto first = oneapi::dpl::make_zip_iterator(A_ii, A_j, A_data); + + if (elmt_tols == NULL) + { + new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, + first, first + nnz, + A_data, + oneapi::dpl::make_zip_iterator(new_ii, new_j, new_data), + std::not_fn(less_than(tol)) ); + } + else + { + new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, + first, first + nnz, + oneapi::dpl::make_zip_iterator(A_data, elmt_tols), + oneapi::dpl::make_zip_iterator(new_ii, new_j, new_data), + cabsfirst_greaterthan_second_pred() ); + } + + // TODO: abb fix this + // hypre_assert( thrust::get<0>(*new_end) == new_ii + new_nnz ); +#else + thrust::zip_iterator< thrust::tuple > new_end; + + if (elmt_tols == NULL) + { + new_end = HYPRE_THRUST_CALL( copy_if, + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)), + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)) + nnz, + 
A_data, + thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j, new_data)), + thrust::not1(less_than(tol)) ); + } + else + { + new_end = HYPRE_THRUST_CALL( copy_if, + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)), + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)) + nnz, + thrust::make_zip_iterator(thrust::make_tuple(A_data, elmt_tols)), + thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j, new_data)), + cabsfirst_greaterthan_second_pred() ); + } + + hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == new_ii + new_nnz ); +#endif + + hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_i, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_j, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_data, HYPRE_MEMORY_DEVICE); + + hypre_CSRMatrixNumNonzeros(A) = new_nnz; + hypre_CSRMatrixI(A) = hypreDevice_CsrRowIndicesToPtrs(nrows, new_nnz, new_ii); + hypre_CSRMatrixJ(A) = new_j; + hypre_CSRMatrixData(A) = new_data; + hypre_TFree(new_ii, HYPRE_MEMORY_DEVICE); + + return hypre_error_flag; +} + void hypre_CSRMatrixComputeRowSumDevice( hypre_CSRMatrix *A, HYPRE_Int *CF_i, @@ -2619,7 +1915,6 @@ hypre_CSRMatrixComputeRowSumDevice( hypre_CSRMatrix *A, hypre_SyncDeviceComputeStream(hypre_handle()); } - HYPRE_Int hypre_CSRMatrixTransposeDevice(hypre_CSRMatrix *A, hypre_CSRMatrix **AT_ptr, From bcf0e579e0e3552d151151e49d52594bdf85e62e Mon Sep 17 00:00:00 2001 From: Abhishek Bagusetty Date: Thu, 23 Dec 2021 15:25:07 +0000 Subject: [PATCH 44/44] fix complex data types preprocessor for CUDA, HIP --- src/utilities/HYPRE_utilities.h | 2 +- src/utilities/complex.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/utilities/HYPRE_utilities.h b/src/utilities/HYPRE_utilities.h index f8bbb154f8..6ac7ccd255 100644 --- a/src/utilities/HYPRE_utilities.h +++ b/src/utilities/HYPRE_utilities.h @@ -86,7 +86,7 @@ typedef double HYPRE_Real; #if defined(HYPRE_USING_SYCL) typedef std::complex HYPRE_Complex; -#elif defined(HYPRE_USING_GPU) 
+#elif defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) typedef thrust::complex HYPRE_Complex; #else typedef double _Complex HYPRE_Complex; diff --git a/src/utilities/complex.c b/src/utilities/complex.c index ba04d01577..59b71bbf56 100644 --- a/src/utilities/complex.c +++ b/src/utilities/complex.c @@ -14,7 +14,7 @@ hypre_conj( HYPRE_Complex value ) { #ifdef HYPRE_USING_SYCL return std::conj(value); -#elif defined(HYPRE_USING_GPU) +#elif defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) return thrust::conj(value); #else return conj(value); @@ -26,7 +26,7 @@ hypre_cabs( HYPRE_Complex value ) { #ifdef HYPRE_USING_SYCL return std::abs(value); -#elif defined(HYPRE_USING_GPU) +#elif defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) return thrust::abs(value); #else return cabs(value); @@ -38,7 +38,7 @@ hypre_creal( HYPRE_Complex value ) { #ifdef HYPRE_USING_SYCL return std::real(value); -#elif defined(HYPRE_USING_GPU) +#elif defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) return thrust::real(value); #else return creal(value); @@ -50,7 +50,7 @@ hypre_cimag( HYPRE_Complex value ) { #ifdef HYPRE_USING_SYCL return std::imag(value); -#elif defined(HYPRE_USING_GPU) +#elif defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) return thrust::imag(value); #else return cimag(value);